#
#     Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
#

include rcfiles/cudaselectrc;

# Baseline driver variables for accelerator (OpenACC / CUDA) compilation.
# DETECTCUDA is true when any of $ANYCUF, $ANYCU or $CUDALIBNEEDED is set, or
# when "tesla" is listed in $ACCELS.
variable DETECTCUDA is default($or($ANYCUF
         ,$ANYCU,$contains($ACCELS,tesla),$CUDALIBNEEDED));
# Expands to CUDA_VERSION=$CUDAXXYY when the CUDA runtime or a CUDA library is
# needed, or when "tesla" is in $ACCELS; empty otherwise.
variable CUDAVERSIONDEFINE is default($if($or($CUDARTNEEDED,$CUDALIBNEEDED,$contains($ACCELS,tesla)),CUDA_VERSION=$CUDAXXYY));
# Macro definitions produced when $TA or $LNGACC is set; the -acc switch turns
# each word into a -D / -def option.
variable ACCDEFINES is default($if($or($TA,$LNGACC),_ACCEL=201003 _OPENACC=201711));
# Minimum-CUDA-version requirement flags, empty by default. Each one is tested
# against $CUDAXXYY by the matching error() check further down in this file.
variable NEEDCUDA80 is default();
variable NEEDCUDA90 is default();
variable NEEDCUDA100 is default();
variable NEEDCUDA110 is default();
variable NEEDCUDA111 is default();
variable NEEDCUDA114 is default();
variable NEEDCUDA118 is default();
variable NEEDCUDA127 is default();
variable NEEDCUDA128 is default();
# 1 when "tesla" is listed in $SYSACCELS, 0 otherwise.
variable DEFTESLAONLY is default($if($contains($SYSACCELS,tesla),1,0));
variable ACCTESLAONLY is default();
# Result of comparing $ACCELS against the single word "multicore".
variable DEFMULTIONLY is default($equal($ACCELS,multicore));
variable ACCMULTIONLY is default();
# Whether "multicore" is listed in $ACCELS.
variable ACCMULTI is default($contains($ACCELS,multicore));
# Indicates compilation for OpenACC multicore.
variable ACCMC is default($and($or($TA,$LNGACC),$ACCMULTI));
# Indicates compilation for OpenACC GPU.
variable ACCGPU is default($and($or($TA,$LNGACC),$contains($ACCELS,tesla)));
# Indicates compilation for OpenACC host.
variable ACCHOST is default($and($or($TA,$LNGACC),$contains($ACCELS,host)));
variable ACCRPATH is default();
variable MULTICORETRACE is default(1);
# Flag pair appended to ACCCGFLAGS and ACCFEFLAGS by the -acc switch.
variable PADDFLAG is default(-x 186 0x80);
# When set, NVILINKARGS passes -debug to the accelerator link tool.
variable DEVDEBUG is default(0);
# When set, ACCLINK passes -vv to the accelerator link tool.
variable VERYVERBOSE is default();

# This variable is set to one when compilation for GPU is requested, which means -acc, -acc=gpu, -mp=gpu, -stdpar, -stdpar=gpu, -cuda, .cu, .cuf
# The terms are, in order: $TA, OpenACC targeting GPU, OpenMP targeting GPU,
# stdpar targeting GPU, $CUDA, and the presence of .cu / .cuf sources.
# The stdpar term must be present: without it a stdpar-only GPU compilation
# bypasses the minimum-CUDA-version error() checks that are gated on this variable.
variable ISCOMPFORGPU is default($or($TA,$land($LNGACC,$expr($TGTACC & $TGTGPU)),$land($LNGOMP,$expr($TGTOMP & $TGTGPU)),$land($LNGSPA,$expr($TGTSPA & $TGTGPU)),$CUDA,$ANYCU,$ANYCUF));

# Minimum CUDA version checks. Each line raises an error when GPU compilation
# is requested ($ISCOMPFORGPU), the corresponding NEEDCUDAnnn flag is set, and
# $CUDAXXYY is below the required version. $CUDAXXYY is compared as
# major*1000 + minor*10 (e.g. 11040 for CUDA 11.4).
error($if($and($ISCOMPFORGPU,$expr($CUDAXXYY<8000),$NEEDCUDA80),CUDA 8.0 or later required));
error($if($and($ISCOMPFORGPU,$expr($CUDAXXYY<9000),$NEEDCUDA90),CUDA 9.0 or later required));
error($if($and($ISCOMPFORGPU,$expr($CUDAXXYY<10000),$NEEDCUDA100),CUDA 10.0 or later required));
error($if($and($ISCOMPFORGPU,$expr($CUDAXXYY<11000),$NEEDCUDA110),CUDA 11.0 or later required));
error($if($and($ISCOMPFORGPU,$expr($CUDAXXYY<11010),$NEEDCUDA111),CUDA 11.1 or later required));
error($if($and($ISCOMPFORGPU,$expr($CUDAXXYY<11040),$NEEDCUDA114),CUDA 11.4 or later required));
error($if($and($ISCOMPFORGPU,$expr($CUDAXXYY<11080),$NEEDCUDA118),CUDA 11.8 or later required));
error($if($and($ISCOMPFORGPU,$expr($CUDAXXYY<12070),$NEEDCUDA127),CUDA 12.7 or later required));
error($if($and($ISCOMPFORGPU,$expr($CUDAXXYY<12080),$NEEDCUDA128),CUDA 12.8 or later required));

# Result of the cudacc() action, evaluated only when $DETECTCUDA is set and
# $SETCAPS is not set; empty otherwise.
# NOTE(review): cudacc() presumably reports the compute capabilities of the
# GPUs present on the build machine - confirm against the driver sources.
variable DETECTEDCAPS is default($if($DETECTCUDA,$ifn($SETCAPS,$action(cudacc()))));
# 0 by default; a non-zero value suppresses the detection above.
variable SETCAPS is default(0);

# Default compute capabilities per programming languages: OpenMP cc70, STDPAR cc60, OpenACC cc50 (like nvcc)
# Selection order:
#   1. OpenMP targeting GPU                                      -> 70
#   2. C++ stdpar targeting GPU, or C++ with $TGTCUDA set,
#      and OpenACC is not also targeting GPU                     -> 60
#   3. anything else                                             -> 50
# The value 50 is the else-branch of the inner $if, so exactly one compute
# capability is produced in every case.
variable PLDEFAULTCOMPUTECAPS is default($if($land($LNGOMP,$expr($TGTOMP & $TGTGPU)),70,
                                       $if($land($or($land($LNGSPA,$expr($TGTSPA & $TGTGPU),$equal($DRIVERLANG,CPP)),$land($TGTCUDA,$equal($DRIVERLANG,CPP))),$not($land($LNGACC,$expr($TGTACC & $TGTGPU)))),60,
                                       50)));

# When compiling for GPU, emit warning when using `-g` and compiling
# multiple compute capabilities that include cc90 or older (NVVM70)
# and cc100+ (NVVM SOLID). Mixing NVVM70 and NVVM SOLID is not
# supported with `-g` because they have incompatible metadata.  This
# can happen only if we can actually enable compilation for cc100,
# which is possible only with CUDA 12.7 or newer.
# Both warnings below are restricted to the Fortran driver ($DRIVERLANG equal
# to Fortran) and require $DETECTCUDA and $DEBUGFLAG to be set.
warning($if($land($equal($DRIVERLANG,Fortran),$DETECTCUDA,$DEBUGFLAG,$expr($CUDAXXYY>=12070),
            $lor(
                $land($SETCAPS,$ISPREBLACKWELL,$ISBLACKWELLPLUS), # multiple cc are being requested with `-gpu=ccX.Y` and "cc100 or cc101 or cc120" is there, e.g. -gpu=cc90,cc100
                $CCALL, # `-g -gpu=ccall`
                $CCALLMAJOR # `-g -gpu=ccall-major`
            )),Debug information for device code is not yet available when combining Blackwell (cc100+) with earlier compute capabilities; the compiler has added '-gpu=nodebug' to allow your compilation to succeed.));

# Same diagnostic for the case where no compute capability was detected
# ($DETECTEDCAPS is empty) and none was requested ($SETCAPS is not set).
warning($if($land($equal($DRIVERLANG,Fortran),$DETECTCUDA,$DEBUGFLAG,$expr($CUDAXXYY>=12070),$equal($DETECTEDCAPS,),$not($SETCAPS)), # compiling with -g on a machine with no GPU (compilation for all supported CC)
        No gpu detected\, compiling for all supported compute capabilities. Debug information for device code is not yet available when combining Blackwell (cc100+) with earlier compute capabilities; the compiler has added '-gpu=nodebug' to allow your compilation to succeed.));

# Default list of compute capabilities to compile for.
#   1. If $DETECTEDCAPS is non-empty, use it.
#   2. Otherwise, if $ISCCNATIVE is set, use the per-language default
#      ($PLDEFAULTCOMPUTECAPS).
#   3. Otherwise, use every capability that is both listed in $SYSCAP and
#      allowed by the $CUDAXXYY version bound on its line.
# Each entry tests $SYSCAP for its own capability number, so a capability is
# emitted only when the system configuration actually lists it.
variable DEFCOMPUTECAP is default(
    $if($notequal($DETECTEDCAPS,),$DETECTEDCAPS,
    $if($ISCCNATIVE,$PLDEFAULTCOMPUTECAPS,
    $if($and($contains($SYSCAP,35),$expr($CUDAXXYY<12000)),35)
    $if($contains($SYSCAP,50),50)
    $if($land($contains($SYSCAP,60),$expr($CUDAXXYY>=8000)),60)
    $if($land($contains($SYSCAP,61),$expr($CUDAXXYY>=8000)),61)
    $if($land($contains($SYSCAP,62),$expr($CUDAXXYY>=10000)),62)
    $if($land($contains($SYSCAP,70),$expr($CUDAXXYY>=9000)),70)
    $if($land($contains($SYSCAP,72),$expr($CUDAXXYY>=10000)),72)
    $if($land($contains($SYSCAP,75),$expr($CUDAXXYY>=10000)),75)
    $if($land($contains($SYSCAP,80),$expr($CUDAXXYY>=11000)),80)
    $if($land($contains($SYSCAP,86),$expr($CUDAXXYY>=11010)),86)
    $if($land($contains($SYSCAP,87),$expr($CUDAXXYY>=11040)),87)
    $if($land($contains($SYSCAP,89),$expr($CUDAXXYY>=11080)),89)
    $if($land($contains($SYSCAP,90),$expr($CUDAXXYY>=11080)),90)
    $if($land($contains($SYSCAP,100),$expr($CUDAXXYY>=12070)),100)
    $if($land($contains($SYSCAP,101),$expr($CUDAXXYY>=12080)),101)
    $if($land($contains($SYSCAP,120),$expr($CUDAXXYY>=12080)),120)
    )));
# Chain of defaults: DEFCOMPUTECAP -> DEFCOMPUTECAPS -> COMPUTECAP.
variable DEFCOMPUTECAPS is default($DEFCOMPUTECAP);
variable COMPUTECAP is default($DEFCOMPUTECAPS);    # default may be set by the user
# Effective compute capability list:
#   - $OMPCOMPUTECAPS when OpenMP targets GPU and that list is non-empty;
#   - $SPACOMPUTECAPS for C++ stdpar targeting GPU, or C++ with $TGTCUDA set,
#     unless OpenACC also targets GPU;
#   - $COMPUTECAP otherwise.
variable COMPUTECAPS is default($if($land($LNGOMP,$expr($TGTOMP & $TGTGPU),$notequal($OMPCOMPUTECAPS,)),$OMPCOMPUTECAPS,$if($land($or($land($LNGSPA,$expr($TGTSPA & $TGTGPU),$equal($DRIVERLANG,CPP)),$land($TGTCUDA,$equal($DRIVERLANG,CPP))),$not($land($LNGACC,$expr($TGTACC & $TGTGPU)))),$SPACOMPUTECAPS,$COMPUTECAP)));
variable CCNEXTCOUNT is default(0);

# default value is empty; this is set by -acc=[no]required
variable ACCREQUIRED is default();
# When set, an extra flag pair is appended to CGARGS (see the CGARGS append).
variable ACCBUILDLIB is default();

# Whether or not OpenACC routine parallelism should be inferred; this is set by -acc=[no]routinepar
variable ACCROUTINEPAR is default();

# Suppress GPU driver warnings
# The value can be supplied through the NVCOMPILER_SUPPRESS_GPU_FLAGS_WARNINGS
# environment variable.
variable NV_SUPPRESS_GPU_FLAGS_WARNINGS is default(0) environment(NVCOMPILER_SUPPRESS_GPU_FLAGS_WARNINGS);

# OpenACC Stub Library
# ACCSTUBLIB expands to -laccstub (or -laccstub_static when $ISSTATIC is set)
# only when NEEDACCSTUBLIB is set.
variable NEEDACCSTUBLIB is default(0);
variable ACCSTUBLIB is default($if($NEEDACCSTUBLIB,-laccstub$if($ISSTATIC,_static)));

# "host", unless a CUDA Fortran or CUDA C source is being compiled
# ($ANYCUF or $ANYCU), in which case it is empty.
variable DEFHOST is default($ifn($or($ANYCUF,$ANYCU),host));
# To change the -acc default from -acc=gpu,host to
# -acc=gpu,multicore change the DEFHOST default value from 'host' to
# 'multicore' above
variable DEFACCEL is default($if($contains($SYSACCELS,tesla),tesla) $DEFHOST);
variable DDEFACCEL is default();
# "tesla" when a source file with suffix cu, cup, cuf or CUF is used;
# otherwise $DDEFACCEL.
variable ACCELS is default($if($suffixused(cu,cup,cuf,CUF),tesla,$DDEFACCEL));
# AOPT and ACOPT feed the -aopt and -x 63 options built in ACCELFLAG.
variable AOPT is default();
variable ACOPT is default();

# Managed / pinned memory support variables.
variable MANAGED is default(0);
variable AUTOCOMPARE is default(0);
variable REDUNDANT is default(0);
# Each *MANPIN variable defaults to its DEF* counterpart, which is empty.
variable F901MANPIN is default($DEFF901MANPIN);
variable DEFF901MANPIN is default();
variable CPP1MANPIN is default($DEFCPP1MANPIN);
variable DEFCPP1MANPIN is default();
variable CGMANPIN is default($DEFCGMANPIN);
variable DEFCGMANPIN is default();
variable MAYNEEDMANAGEDMEMORY is default(0);
variable NEEDMANAGEDMEMORY is default(0);
variable DEFNEEDMANPINMEMORYINTERCEPTION is default(0);
variable NEEDMANPINMEMORYINTERCEPTION is default($DEFNEEDMANPINMEMORYINTERCEPTION);
# Link arguments for the nvhpcmanaux library. When $PGISTATICX is non-empty the
# library is wrapped in --whole-archive / --no-whole-archive, and the
# _nvmalloc variant is chosen if $NVMALLOCNEEDED is also set.
variable MANPINMEMORYINTERCEPTION is default(
    $PGISTATICX $if($notequal($PGISTATICX,),--whole-archive)
    $if($and($NVMALLOCNEEDED, $notequal($PGISTATICX,)),
        $(LIBSW)nvhpcmanaux_nvmalloc,
        $(LIBSW)nvhpcmanaux)
    $if($notequal($PGISTATICX,),--no-whole-archive));
# Link arguments for the nvhpcman library: linked unconditionally when
# $NEEDMANAGEDMEMORY is set, or between $ASNEEDED / $NOASNEEDED when only
# $MAYNEEDMANAGEDMEMORY is set.
variable MANCUDALIB is default(
    $if($NEEDMANAGEDMEMORY,
        $PGISTATICX $(LIBSW)nvhpcman $PGIUNSTATICX,
        $if($MAYNEEDMANAGEDMEMORY, $PGISTATICX $ASNEEDED $(LIBSW)nvhpcman $NOASNEEDED $PGIUNSTATICX)));
# Set to 1 (e.g. by the -acc switch) when the OpenACC runtime libraries must be linked.
variable NEEDACCLIB is default(0);
# Initialisation options; each one maps to an -init=... argument in
# INITARGS or NVILINKARGS.
variable TIMEINIT is default(0);
variable ZEROINIT is default(0);
variable PININIT is default(0);
variable MANINIT is default(0);
variable EXTRAINIT is default();
variable MANPREFER is default();
variable CUDA_NOATTACH is default(0);
variable NORDC is default(0);
# "_static" suffix for CUDA runtime library names when $PGISTATICX is non-empty.
variable CUDARTSTATIC is default($if($notequal($PGISTATICX,),_static));
# Same suffix, but only on the linux86-64, linuxpower and linuxarm64 targets.
variable CUDALIBSTATIC is default($if($index($TARGET,linux86-64,linuxpower,linuxarm64),$if($notequal($PGISTATICX,),_static)));
# Extra libraries added when $PGISTATICX is non-empty. cublasLt is added only
# when $CUDAXXYY > 10000 and cudaforwrapblas117 only when $CUDAXXYY > 11060.
variable CULIBOSSTATIC is default(
    $if($notequal($PGISTATICX,),$(LIBSW)culibos $(LIBSW)cublas$(CUDARTSTATIC) $if($expr($CUDAXXYY>10000),$(LIBSW)cublasLt$(CUDARTSTATIC)) $(LIBSW)curand$(CUDARTSTATIC) $(LIBSW)cudaforwrapblas $if($expr($CUDAXXYY>11060),$(LIBSW)cudaforwrapblas117)));
# "stdc++" on the listed targets when $CUDAXXYY >= 9020, otherwise "c".
variable CUBLASSTDLIB is default($if($and($expr($index($TARGET,linux86-64,linuxpower,linuxarm64,win64,win64-llvm) > 0),$expr($CUDAXXYY>=9020)),stdc++,c));

# The CFI acc lib should only be linked when using nvflang; checking USEFLANG1
# is the means of detecting that. Note that this only sets the library name -
# it is still only added to the link line when -acc is used.
variable CFIACCLIB is default($if($USEFLANG1,-lacchostcfi));

include rcfiles/acc$PGSYS$(PGLEN)rc;
# Prefix inserted before "acc" in the runtime library names built by ACCLIB.
variable ACCPREFIX is default();

# "s" suffix for the runtime library names when the target is linux86-64 and
# $STATICLIB is set; empty otherwise.
variable ACCLIBSUFF is default($if($land($index($TARGET,linux86-64),$STATICLIB),s));

# what accelerators we support on this target, now only Tesla
variable SYSACCELS is default( tesla );

# if -acc with gpu target (-acc=gpu or -acc -target=gpu)
variable ACCGPULINK is default($land($LNGACC,$expr($TGTACC & $TGTGPU))) help(Set if compiling OpenACC for GPU);
# if -mp and gpu target (-mp=gpu or -mp -target=gpu)
variable MPGPULINK is default($land($LNGOMP,$expr($TGTOMP & $TGTGPU))) help(Set if compiling OpenMP for GPU);

# Set to one when -no-default-cuda is used (dev-only)
variable USENODEFAULTCUDA is default(0);

# now the same targets on all platforms
# Each word listed here selects an acc_init_link_<word> object in ACCLIB.
variable SYSACCOBJS is default(
	$if($contains($ACCELS,tesla),cuda)
	$if($contains($ACCELS,multicore),multicore)
	$if($contains($ACCELS,host),host)
	$if($MPGPULINK,mp)
	$if($ACCGPULINK,acc)
	);
# Suffix appended to "devaux" in SYSACCLIBS, chosen from $CUDAXXYY:
# 110 below 11030, 113 below 11080, 118 below 12070, empty from 12070 onwards.
variable ACCCUDAVSNSUFF is default($if($expr($CUDAXXYY<11030),110,
                                   $if($expr($CUDAXXYY<11080),113,
                                   $if($expr($CUDAXXYY<12070),118))));
# The else case is to allow using `-gpu=managed` with `nvfortran -cuda`.
variable SYSACCLIBS is default($if($or($contains($ACCELS,tesla),$ISCUDALIB),devaux$ACCCUDAVSNSUFF device$ACCLIBSUFF,$if($land($CUDARTNEEDED,$equal($DRIVERLANG,Fortran),$NEEDMANAGEDMEMORY),devaux$ACCCUDAVSNSUFF device$ACCLIBSUFF)));

# OpenACC runtime link line, produced only when $NEEDACCLIB is set. In order:
#   - one acc_init_link_<oo> object per word of $SYSACCOBJS;
#   - acc_init_set_cuda / acc_init_set_unified objects under the conditions below;
#   - the acc<ll> libraries, built from $SYSACCLIBS and host$ACCLIBSUFF and
#     bracketed by $PGISTATIC / $PGIUNSTATIC;
#   - $CFIACCLIB.
# $SYSACCLIBS is listed a second time, ahead of the host library, when
# $PGISTATICX is non-empty.
variable ACCLIB is default($if($NEEDACCLIB,
    $foreach(oo,$SYSACCOBJS, $lookup($COMPLIBOBJ,acc_init_link_$oo.$OBJSUFFIX))
    $if($land($contains($ACCELS,tesla),$CUDARTNEEDED,$NORDC),$lookup($COMPLIBOBJ,acc_init_set_cuda.$OBJSUFFIX))
    $if($and($NEEDUNIFIED,$NORDC),$lookup($COMPLIBOBJ,acc_init_set_unified.$OBJSUFFIX))
    $PGISTATIC
    $ifn($USEOTHEROMPLIB,$if($ISSTATIC,$OMPINTEROPSTART))
    $foreach(ll,
      $if($notequal($PGISTATICX,), $SYSACCLIBS) host$ACCLIBSUFF $SYSACCLIBS, 
      $(LIBSW)$(ACCPREFIX)acc$(ll)
    )
    $CFIACCLIB
    $ifn($USEOTHEROMPLIB,$if($ISSTATIC,$OMPINTEROPEND))
    $PGIUNSTATIC));

# Additional link arguments. The first $if, guarded by $NEEDACCLIB, ends after
# the win64 -defaultlib line; the last two $if terms are evaluated regardless
# of $NEEDACCLIB.
variable ACCLIB2 is default($if($NEEDACCLIB,
    $ifn($STATICLIB,$PGIUNSTATICX $DLLIB)
    $if($and($not($STATICLIB),$or($NEEDCUDANVSHMEM,$NEEDCUDACUSPARSE)),$(LIBSW_NOLIB)cuda)
    $if($index($TARGET,win64,win64-llvm),-defaultlib:ws2_32.lib))
    $if($NEEDUNIFIED, $lookup($COMPLIBOBJ,__gpu_unified_compiled.$OBJSUFFIX))
    $if($or($CUDARTNEEDED,$land($contains($ACCELS,tesla,multicore),$NEEDACCLIB)),$PGISTATICX $(LIBSW)cudadevice $PGIUNSTATICX));

# CUDA include paths. Produced only when $NVCCHOST is not set and the CUDA
# runtime or a CUDA library is needed, or "tesla" is in $ACCELS.
#   - With $TGTCUDA or $ISCUDALIB: the math-library include directory (plus
#     its cufftmp subdirectory when $NEEDCUFFTMP is set, and the CUPTI include
#     directory when non-empty), followed by each $COMMLIBSINCDIR entry.
#   - $USECUDAROOT/include when $USECUDAROOT is non-empty.
variable CUDAINC is default($if($land($not($NVCCHOST),$lor($CUDARTNEEDED,$CUDALIBNEEDED,$contains($ACCELS,tesla))),
                                $if($or($TGTCUDA,$ISCUDALIB),
                                    $if($notequal($CUDAMATHINCDIR,),
                                        $if($NEEDCUFFTMP,$path($CUDAMATHINCDIR/cufftmp))
                                        $if($notequal($CUDACUPTIINCDIR,),$path($CUDACUPTIINCDIR))
                                        $path($CUDAMATHINCDIR))
                                    $if($notequal($COMMLIBSINCDIR,),$foreach(i,$COMMLIBSINCDIR,$path($i) )))
                                $if($notequal($USECUDAROOT,),$path($USECUDAROOT/include))));

# NOBOUNDSCHECK is 1 when "tesla" is in $ACCELS or $FNEEDCUDA is set; a
# warning is issued if $BOUNDSCHECK is requested at the same time.
set NOBOUNDSCHECK=$if($or($contains($ACCELS,tesla),$FNEEDCUDA),1,0);
warning($if($and($NOBOUNDSCHECK,$BOUNDSCHECK),CUDA Fortran or OpenACC GPU targets disables -Mbounds));

# append USRINC=$CUDAINC;

# One "-accel <name>" pair per word of $ACCELS.
variable ACCELFLAG1 is default($foreach(a,$ACCELS,-accel $a ));
# ACCELFLAG1 plus "-aopt $AOPT" and "-x 63 <ACOPT+1>" when those are non-empty.
variable ACCELFLAG is default($ACCELFLAG1 $if($notequal($AOPT,),-aopt $AOPT) $if($notequal($ACOPT,),-x 63 $expr($ACOPT + 1)));
variable ACCAUTOPAR is default(1);

# Command line for the acclink tool, produced only when $DOACCLINK is set.
variable ACCLINK is default($if($DOACCLINK,$tool(acclink) $if($ISSTATIC,-static-nvidia) $NVILINKARGS $INITARGS $if($VERYVERBOSE,-vv,)));

# Partial Link Support (when we decide to make partial link the
# default behavior we should remove the line below and uncomment the
# next line)
variable NEEDPARTIALLINK is default(0);
# If partial link is enabled by default it still should be disabled
# when creating a shared object
# variable NEEDPARTIALLINK is default($if($DYNAMICLINK,0,1));

# FPIFP Link support
variable NEEDFPIFP is default(0);

# When set, NVILINKARGS passes -keep to the accelerator link tool.
variable NKEEP is default(0);

# `tesla` should be added to ACCELS when compiling CUF/CU programs
append ACCELS=$ifn($contains($ACCELS,tesla),$if($lor($ANYCUF,$ANYCU),tesla));

# Support for exposing specific symbols to dlsym in statically linked applications
# NVHPCLDSYMS expands to a --dynamic-list option only when the nvhpc.syms
# lookup yields a non-empty path.
variable NVHPCLDSYMSPATH is default($lookup($COMPBASE/$COMPSYS/$COMPVER/$(COMPLIBPREFIX)lib,nvhpc.syms));
variable NVHPCLDSYMS is default($if($NVHPCLDSYMSPATH,--dynamic-list=$NVHPCLDSYMSPATH));
variable NEEDNVHPCLDSYMS is default(0);

# This variable is set to 1 when `-cudaforlibs` is used, it helps
# enabling fortran wrappers when used with `-cudalib` and we are not
# linking with `nvfortran`
variable ISCUDAFORLIBS is default();

# pgnvd is used to compile the gpu code generated by the accelerator compiler backend.
# The invocation of pgnvd is done from functions 'compile_cuda_kernel'/'acc_compile'
# and its options are set up by 'cuda_build_compile_options'.
# pgnvd is also invoked by pgacclnk as part of the accelerator link step.
# The program name comes from $NVDD and is located in $CCOMPDIR.
variable NVDD is default(pgnvd);
tool pgnvd is
    program($NVDD) directory($CCOMPDIR);

# Accelerator link tool: the program name comes from $ACCLNK (pgacclnk by
# default) and is located in $CCOMPDIR.
variable ACCLNK is default(pgacclnk);
tool acclink is    program($ACCLNK) directory($CCOMPDIR);

# -Wnvvm,<arg>: appends "-wnvvm ,<arg>" to ACCCGFLAGS.
switch -Wnvvm,arg is
	help(Pass argument to nvvm)
	helpgroup(target)
	# pass to the backend, which forwards to nvdd, which forwards to nvvm
	append(ACCCGFLAGS=-wnvvm ,$arg)
	;

# -Wptxas,<arg>: appends "-wptxas ,<arg>" to ACCCGFLAGS.
switch -Wptxas,arg is
	help(Specify options directly to ptxas, the PTX optimizing assembler.)
	helpgroup(target)
	# pass to the backend, which forwards to nvdd, which forwards to ptxas
	append(ACCCGFLAGS=-wptxas ,$arg)
	;

# -Wfatbinary,<arg>: appends "-wfatbinary ,<arg>" to ACCCGFLAGS.
switch -Wfatbinary,arg is
	help(Pass argument to fatbinary)
	helpgroup(target)
	# pass to the backend, which forwards to nvdd, which forwards to fatbinary
	append(ACCCGFLAGS=-wfatbinary ,$arg)
	;

# Extra arguments for the accelerator link tool; included in NVILINKARGS.
variable EXTRAACCLNK is default();
# -Wacclnk,<arg> (hidden): appends <arg> to EXTRAACCLNK after replacing each
# comma with a space, so a comma-separated list becomes separate arguments.
switch -Wacclnk,arg is
	hide
	help(Pass argument to acclnk)
	helpgroup(target)
	append(EXTRAACCLNK=$replace($arg,",", ))
	;

# -Wnvlink,<arg>: appends "-Wnvlink,<arg>" unchanged to EXTRAACCLNK.
switch -Wnvlink,arg is
	help(Specify options directly to nvlink, the device linker.)
	helpgroup(linker)
	# pass to acclnk, which forwards on to nvlink
	append(EXTRAACCLNK=-Wnvlink,$arg)
	;

# -Wimport,<arg> (hidden): appends "-Wimport,<arg>" unchanged to EXTRAACCLNK.
switch -Wimport,arg is
	hide
	help(Pass argument to import)
	helpgroup(target)
	# pass to acclnk, which forwards on to import
	append(EXTRAACCLNK=-Wimport,$arg)
	;

# Arguments handed to the acclink tool through ACCLINK. The whole list is
# produced only when "tesla" is in $ACCELS or $FNEEDCUDA / $CNEEDCUDA is set.
# Most entries map one driver variable to one tool option. The -computecap
# options are emitted per entry of $COMPUTECAPS, skipping the word "next",
# and only when $ACCRELOC is set. The -init=... options are emitted only when
# $NEEDACCLIB is set.
variable NVILINKARGS is default(
    $if($or($contains($ACCELS,tesla),$FNEEDCUDA,$CNEEDCUDA),
        $if($DEVDEBUG, -debug)
        -nvidia $tool(pgnvd) -cuda$CUDAXXYY
        $if($notequal($USECUDAROOT,),-cudaroot $USECUDAROOT)
        $if($or($FNEEDCUDA,$CNEEDCUDA),-cudalink)
        $if($NEEDUNIFIED,-unifiedmem)
        $if($ACCRELOC,$foreach(cc,$COMPUTECAPS, $ifn($equal($cc,next),-computecap=$cc)))
        $if($DYNAMICLINK,-dyninit)
        $if($NORDC,-nordc)
        $if($GGPUFILE,-gpufile)
        $if($NKEEP,-keep)
        $NVVMFLAGS
        $if($NEEDPARTIALLINK,-partiallink)
        $if($NEEDCUDALTO,-lto $if($expr($TGTOMP & $TGTGPU),-maxrregcount 128))
        $if($NEEDFPIFP,-fpifp)
        $if($USENODEFAULTCUDA,-nodefaultcuda)
        $EXTRAACCLNK
        $if($NEEDACCLIB,
            $if($contains($ACCELS,tesla),-init=ctxrel)
            $if($CUDA_NOATTACH,-init=noattach)
            $if($ZEROINIT,-init=zeroinit)
            $if($PININIT,-init=pinned)
            $if($MANINIT,-init=managed $if($notequal($MANPREFER,),-init=managed_prefer$MANPREFER))
            $if($and($CUDARTNEEDED,$contains($ACCELS,tesla)),-init=cuda)
            $EXTRAINIT
        )
    )
);

# -init=tatime is passed to the link tool when $TIMEINIT is set.
variable INITARGS is default($if($TIMEINIT,-init=tatime));
# The accelerator link step runs when $ACCRELOC or $NEEDACCLIB is set.
variable DOACCLINK is default($if($or($ACCRELOC,$NEEDACCLIB),1,0));
# True when "tesla" is in $ACCELS or $FNEEDCUDA / $CNEEDCUDA is set.
variable ADDCUDA is default($or($contains($ACCELS,tesla),$FNEEDCUDA,$CNEEDCUDA));

set IPANOARG=$contains($ACCELS,tesla);

# Code generation tunables; each is passed through an -x option in
# ACCCGFLAGS or CGARGS.
variable MINBLKSPERSM is default(0);
variable MAXREGCOUNT is default(0);
variable MAXNVVMTHREADS is default(0xFFFF);
variable DEFVECTLEN is default(0);
variable DEFWORKERS is default(0);
variable STACKLIMIT is default(512);

# Pass --accel_routinepar to the C++ front end when $ACCROUTINEPAR is set.
append CPP1ARGS=$if($ACCROUTINEPAR, --accel_routinepar);

# Accelerator-related code generator arguments. The $ACCAUTOPAR, $ACCREQUIRED,
# $ACCBUILDLIB and $STACKLIMIT settings each select an -x / -y flag pair;
# -cudaroot is added only when $ADDCUDA is set and $USECUDAROOT is non-empty.
append CGARGS=$ACCELFLAG $ACCCGFLAGS $ACCCGDEF $ACCDEBUG $LLVMFLAGS
              $if($ACCAUTOPAR,-y 189 0x4000000,-x 189 0x4000000)
              $select($ACCREQUIRED,1,-x 180 6,0,-x 180 2)
              $if($ACCBUILDLIB,-x 192 0x2000000)
              $if($ADDCUDA,$if($notequal($USECUDAROOT,),-cudaroot $USECUDAROOT))
              $NVVMXBITS
              $if($equal($STACKLIMIT,),-x 205 0x1000,-x 60 $STACKLIMIT);

# Fortran front end arguments and preprocessor definitions. The CUDA_VERSION
# definition is added only when $CUDAVERSIONDEFINE is non-empty.
append F901ARGS=$ACCFEFLAGS $ACCCGDEF $ACCDEBUG $LLVMFLAGS $ACCELFLAG1;
append USRDDEF=$ACCDEF $MPDEFCPP $if($CUDAVERSIONDEFINE, -D$CUDAVERSIONDEFINE);
append USRDEFDEF=$ACCDEFDEF $if($CUDAVERSIONDEFINE, -def $CUDAVERSIONDEFINE);

# Filled in by the -acc switch from $ACCDEFINES (as -D and -def options).
variable ACCDEF is default();
variable ACCDEFDEF is default();

# "-x 176 0x100" followed by one "-cudacap <cc>" per entry of $COMPUTECAPS,
# produced only when $ADDCUDA is set and $COMPUTECAPS is non-empty.
variable DEFAULTCAPFLAG is default($if($ADDCUDA,$if($notequal($COMPUTECAPS,),-x 176 0x100
            $foreach(c,$COMPUTECAPS,-cudacap $c ))));

variable CUDALIBNEEDED is default(0);

# -cudalib
variable NEEDCUDAALL is default(0);
# Library search directories, added when the CUDA runtime or a CUDA library is
# needed: the math and comm library directories (only with $TGTCUDA or
# $ISCUDALIB), the CUPTI directory (only with $NEEDCUPTI), and $CUDALIBDIR.
set LDLIBARGS=$if($lor($CUDARTNEEDED,$CUDALIBNEEDED),
                  $if($or($TGTCUDA,$ISCUDALIB),
                      $if($notequal($CUDAMATHLIBDIR,),$(LDIRSW)$CUDAMATHLIBDIR)
                      $if($notequal($COMMLIBSLPATHDIR,),$COMMLIBSLPATHDIR))
                  $if($NEEDCUPTI,$if($notequal($CUDACUPTILIBDIR,),$(LDIRSW)$CUDACUPTILIBDIR))
                  $if($notequal($CUDALIBDIR,),$(LDIRSW)$CUDALIBDIR));
# Math Libs

# If we are using NVHPC_CUDA_HOME, we need to use the CUDA default directory
# structure, otherwise we need to use the shipped directory structure.
variable DEFAULT_CUDAMATHDIR is default($if($equal($USER_SET_CUDA_HOME,1), $USECUDAROOT, $COMPBASE/$COMPSYS/$COMPVER/math_libs/$CUDAVERSION));
# The root can be overridden with the NVCOMPILER_MATH_LIBS_HOME environment variable.
variable NVCOMPILER_MATH_LIBS_HOME is default($DEFAULT_CUDAMATHDIR) environment(NVCOMPILER_MATH_LIBS_HOME);
# Only set when $PFX is "nv" and $CUDAVERSION is non-empty.
variable CUDAMATHDIR is default($if($equal($PFX,nv),$if($CUDAVERSION, $NVCOMPILER_MATH_LIBS_HOME)));
# The sub-directories are derived only when $CUDAMATHDIR is non-empty. The
# library directory is "lib" on win64 / win64-llvm and "lib64" elsewhere.
variable CUDAMATHINCDIR is default($if($CUDAMATHDIR,$CUDAMATHDIR/include));
variable CUDAMATHBINDIR is default($if($CUDAMATHDIR,$CUDAMATHDIR/bin));
variable CUDAMATHLIBDIR is default($if($CUDAMATHDIR,$if($index($TARGET,win64,win64-llvm),$CUDAMATHDIR/lib,$CUDAMATHDIR/lib64)));

# CUPTI
# Same scheme as the math libraries; the root can be overridden with the
# NVCOMPILER_CUPTI_LIBS_HOME environment variable.
variable DEFAULT_CUDACUPTIDIR is default($if($equal($USER_SET_CUDA_HOME,1), $USECUDAROOT, $COMPBASE/$COMPSYS/$COMPVER/cuda/$CUDAVERSION/extras/CUPTI));
variable NVCOMPILER_CUPTI_LIBS_HOME is default($DEFAULT_CUDACUPTIDIR) environment(NVCOMPILER_CUPTI_LIBS_HOME);
variable CUDACUPTIDIR is default($if($equal($PFX,nv),$if($CUDAVERSION, $NVCOMPILER_CUPTI_LIBS_HOME)));
variable CUDACUPTIINCDIR is default($if($CUDACUPTIDIR,$CUDACUPTIDIR/include));
variable CUDACUPTILIBDIR is default($if($CUDACUPTIDIR,$CUDACUPTIDIR/lib64));

# Per-library request flags. For most libraries NEED<lib> defaults to 0 and
# NEEDCUDA<lib> is true when either $NEEDCUDAALL (plain -cudalib) or NEED<lib>
# is set. Exceptions are noted on the individual entries.

# CUBLAS
variable NEEDCUBLAS is default(0);
variable NEEDCUDACUBLAS is default($or($NEEDCUDAALL,$NEEDCUBLAS));

# CUBLASMP
# $NEEDCUDAALL is ignored for this library on linuxpower targets.
variable NEEDCUBLASMP is default(0);
variable NEEDCUDACUBLASMP is default($or($ifn($index($TARGET,linuxpower),$NEEDCUDAALL,0),$NEEDCUBLASMP));

# CUFFT
variable NEEDCUFFT is default(0);
variable NEEDCUFFTCALLBACK is default(0);
variable NEEDCUDACUFFT is default($or($NEEDCUDAALL,$NEEDCUFFT));

# CUFFTW
variable NEEDCUFFTW is default(0);
variable NEEDCUDACUFFTW is default($or($NEEDCUDAALL,$NEEDCUFFTW));

# CUFFTMP
variable NEEDCUFFTMP is default(0);
# cuFFT and cuFFTMP are not compatible together, for now when `-cudalib` is used only link cufft
# variable NEEDCUDACUFFTMP is default($or($NEEDCUDAALL,$NEEDCUFFTMP));
variable NEEDCUDACUFFTMP is default($NEEDCUFFTMP);

# CURAND
variable NEEDCURAND is default(0);
variable NEEDCUDACURAND is default($or($NEEDCUDAALL,$NEEDCURAND));

# CUSOLVER
variable NEEDCUSOLVER is default(0);
variable NEEDCUDACUSOLVER is default($or($NEEDCUDAALL,$NEEDCUSOLVER));

# CUSOLVERMP
variable NEEDCUSOLVERMP is default(0);
variable NEEDCUDACUSOLVERMP is default($or($NEEDCUDAALL,$NEEDCUSOLVERMP));

# CUSPARSE
variable NEEDCUSPARSE is default(0);
variable NEEDCUDACUSPARSE is default($or($NEEDCUDAALL,$NEEDCUSPARSE));

# CUPTI
# Has no NEEDCUDA* counterpart: $NEEDCUDAALL does not enable it.
variable NEEDCUPTI is default(0);

# CUTENSOR
variable NEEDCUTENSOR is default(0);
variable NEEDCUDACUTENSOR is default($or($NEEDCUDAALL,$NEEDCUTENSOR));

# NVBLAS
variable NEEDNVBLAS is default(0);
variable NEEDCUDANVBLAS is default($or($NEEDCUDAALL,$NEEDNVBLAS));

# NVLAMATH
variable NEEDNVLAMATH is default(0);
variable NEEDCUDANVLAMATH is default($or($NEEDCUDAALL,$NEEDNVLAMATH));
variable NVLAMATH_SFX is default("");

# NVTX3
variable NEEDNVTX3 is default(0);
variable NEEDCUDANVTX3 is default($or($NEEDCUDAALL,$NEEDNVTX3));

# True when any of the math libraries above is requested.
variable NEEDCUDAMATHPATHS is default($or($NEEDCUDACUBLAS,$NEEDCUDACUBLASMP,$NEEDCUDACUFFT,$NEEDCUDACUFFTW,$NEEDCUFFTMP,$NEEDCUDACURAND,$NEEDCUDACUSOLVER,$NEEDCUDACUSOLVERMP,$NEEDCUDACUSPARSE,$NEEDCUDACUTENSOR,$NEEDCUDANVBLAS,$NEEDCUDANVLAMATH,$NEEDCUDANVTX3));

# Math Libs

# NCCL Path
# The root defaults to $COMMLIBSDIR/nccl when $COMMLIBSDIR is non-empty and
# can be overridden with the NVCOMPILER_NCCL_HOME environment variable. The
# include and lib directories are derived only when the root is non-empty.
variable DEFAULT_CUDANCCLDIR is default($if($COMMLIBSDIR,$COMMLIBSDIR/nccl));
variable NVCOMPILER_NCCL_HOME is default($DEFAULT_CUDANCCLDIR) environment(NVCOMPILER_NCCL_HOME);
variable CUDANCCLDIR is default($NVCOMPILER_NCCL_HOME);
variable CUDANCCLINCDIR is default($if($CUDANCCLDIR,$CUDANCCLDIR/include));
variable CUDANCCLLIBDIR is default($if($CUDANCCLDIR,$CUDANCCLDIR/lib));
variable NEEDNCCL is default(0);
variable NEEDCUDANCCL is default($or($NEEDCUDAALL,$NEEDNCCL));

# NVSHMEM Path
# The root defaults to $COMMLIBSDIR/nvshmem when $COMMLIBSDIR is non-empty and
# can be overridden with the NVCOMPILER_SHMEM_HOME environment variable.
variable DEFAULT_CUDANVSHMEMDIR is default($if($COMMLIBSDIR,$COMMLIBSDIR/nvshmem));
variable NVCOMPILER_SHMEM_HOME is default($DEFAULT_CUDANVSHMEMDIR) environment(NVCOMPILER_SHMEM_HOME);
variable CUDANVSHMEMDIR is default($NVCOMPILER_SHMEM_HOME);
# Derive the sub-directories only when the root is non-empty, as the NCCL,
# math-library and CUPTI paths do. An unguarded expansion would turn an empty
# root into the bogus system paths "/include" and "/lib".
variable CUDANVSHMEMINCDIR is default($if($CUDANVSHMEMDIR,$CUDANVSHMEMDIR/include));
variable CUDANVSHMEMLIBDIR is default($if($CUDANVSHMEMDIR,$CUDANVSHMEMDIR/lib));
variable NEEDNVSHMEM is default(0);
# NVSHMEM is needed when explicitly requested, when cuFFTMp is requested, when
# cuBLASMp is requested with CUDA >= 12.0, or for plain -cudalib
# ($NEEDCUDAALL) - except on linuxarm64 with CUDA older than 12.2, where
# $NEEDCUDAALL is ignored.
variable NEEDCUDANVSHMEM is default($or($if($land($index($TARGET,linuxarm64),$expr($CUDAXXYY<12020)),0,$NEEDCUDAALL),$NEEDNVSHMEM,$NEEDCUFFTMP,$if($expr($CUDAXXYY>=12000),$NEEDCUBLASMP)));

# Comm Libs Path
# Each variable concatenates the NCCL entry (when $NEEDCUDANCCL is set) and
# the NVSHMEM entry (when $NEEDCUDANVSHMEM is set) in a different form:
# include paths, -rpath options, library-directory options ($(LDIRSW)) and
# $(LPRE)-prefixed directories. COMMLIBSRPATHDIR additionally adds the HPC-X
# UCC and UCX directories when cuBLASMp or cuSOLVERMp is needed.
variable COMMLIBSINCDIR is default($if($NEEDCUDANCCL,$path($CUDANCCLINCDIR)) $if($NEEDCUDANVSHMEM,$path($CUDANVSHMEMINCDIR)));
variable COMMLIBSRPATHDIR is default($if($NEEDCUDANCCL,-rpath $CUDANCCLLIBDIR) $if($NEEDCUDANVSHMEM,-rpath $CUDANVSHMEMLIBDIR) $if($lor($NEEDCUDACUBLASMP,$NEEDCUDACUSOLVERMP),-rpath $HPCXUCCLIBDIR -rpath $HPCXUCXLIBDIR));
variable COMMLIBSLPATHDIR is default($if($NEEDCUDANCCL,$(LDIRSW)$CUDANCCLLIBDIR) $if($NEEDCUDANVSHMEM,$(LDIRSW)$CUDANVSHMEMLIBDIR));
variable COMMLIBSLDDIR is default($if($NEEDCUDANCCL,$(LPRE)$CUDANCCLLIBDIR) $if($NEEDCUDANVSHMEM,$(LPRE)$CUDANVSHMEMLIBDIR));
variable NEEDCUDACOMMPATHS is default($or($NEEDCUDANCCL,$NEEDCUDANVSHMEM));
# Comm Libs

# If at least one cuda-optimized library is needed
# Logical OR over the per-library request flags. The leading "$" on $or is
# required: without it the text is not a function call and the default
# expands to a non-empty literal string instead of a boolean.
variable NEEDCUDALIB is default($or($NEEDCUDACUBLAS,$NEEDCUDACUFFT,$NEEDCUDACUFFTW,$NEEDCUDACURAND,$NEEDCUDACUSOLVER,$NEEDCUDACUSPARSE,$NEEDCUDACUTENSOR,$NEEDCUDANCCL,$NEEDCUDANVSHMEM,$NEEDNVLAMATH));
# Library lists, empty by default.
variable LDLIBSLIST is default();
variable MATHCUDALIBLIST is default();
# Linker options used to bracket optional libraries.
variable ASNEEDED is default(--as-needed);
variable NOASNEEDED is default(--no-as-needed);
# Use linker option `--as-needed/--no-as-needed` when `-cudalib` is used with no sub-option
variable CUDAASNEEDED is default($if($NEEDCUDAALL,$ASNEEDED));
variable CUDAASNONEEDED is default($if($NEEDCUDAALL,$NOASNEEDED));
# -lculibos is added only when $ISSTATIC is set.
variable LIBCULIBOS is default($if($ISSTATIC,-lculibos));
variable ISCUDALIB is default();
# -cudalib

# "-cudaver $CUDAXXYY" when $ADDCUDA is set.
variable TOOLKITFLAG is default($if($ADDCUDA,-cudaver $CUDAXXYY));

# 1 when a CUDA Fortran / CUDA C source is compiled or "tesla" is in $ACCELS.
variable CHECKCUDALIB is default($if($or($ANYCUF,$ANYCU,$contains($ACCELS,tesla)),1,0));
variable IGNORECUDALIB is default();

# Flag accumulators, empty by default; the -acc switch fills them in.
variable ACCFLAGS is default();
variable DEF901ACC is default($ACC901FLAGS);
variable ACC901FLAGS is default();
variable ACCFEFLAGS is default();
variable ACCCGFLAGS is default();
# The code generation tunables are always passed to the code generator.
append ACCCGFLAGS= -x 197 $MINBLKSPERSM -x 175 $MAXREGCOUNT -x 203 $DEFVECTLEN -x 204 $DEFWORKERS -x 227 $MAXNVVMTHREADS;
variable IMPLICITSECTIONS is default(0);
# Flags shared by the code generator (CGARGS) and the Fortran front end
# (F901ARGS). $ACCWAIT selects one of three flag combinations: 0, 1 (the
# default set below) or 2.
variable ACCCGDEF is default(
		$if($ACCRELOC,-x 189 0x8000)	# generate CUDA RDC (relocatable device code)
		$select($ACCWAIT,0,-x 163 0x40000000 -y 163 0x80000000,1,-y 163 0xc0000000,2,-x 163 0x80000000)
			# 163 0 = default, host waits for kernel completion if not async
			# 163 0x40000000, host never waits for kernel completion, only waits for data
			# 163 0x80000000, host always waits for kernel completion, even if async
		$if($IMPLICITSECTIONS,-x 201 0xf0000000));	# enable implicit array element=>array section translation in OpenMP/OpenACC data clauses (old PGI behavior)
variable ACCWAIT is default(1);
variable ACCCACHE is default();
# Default accelerator flags: $ACCFLAGS followed by the -cudacap list.
variable DEFACC is default($ACCFLAGS $DEFAULTCAPFLAG);
variable FORCEDBGLLVM is default(0);
variable FORCELILLVM is default(0);
# LLVM is the default when $TARGETARCH is 64; FORCELLVM is 1 when LLVM is the
# default or either FORCE*LLVM variable is set.
variable DEFAULTLLVM is default($if($equal($TARGETARCH,64),1,0));
variable FORCELLVM is default($if($or($DEFAULTLLVM,$or($FORCELILLVM, $FORCEDBGLLVM)), 1, 0));
# Device debug flags. Unless $CUDAXXYY is 8000, DEFACCDEBUG selects
# "-x 163 0x800000" when $OPTLEVELUSE < 1 and $DEBUGFLAG is set, and
# $ACCMINDEBUG otherwise.
variable ACCMINDEBUG is default($if($DEBUGFLAG,-x 192 0x40000000));
variable DEFACCDEBUG is default($ifn($expr($CUDAXXYY=8000),$if($and($expr($OPTLEVELUSE < 1),$DEBUGFLAG),-x 163 0x800000,$ACCMINDEBUG)));
# Set `-gpu=nodebug` when Blackwell and pre-Blackwell compute capabilities are being mixed when using `-g` with nvfortran
variable ACCDEBUG is default($if($equal($FORCEDBGLLVM,1),-x 163 0x800000,$DEFACCDEBUG) $if($land($equal($DRIVERLANG,Fortran),$DEBUGFLAG,$ISPREBLACKWELL,$ISBLACKWELLPLUS),-y 163 0x800000 -y 192 0x40000000 -y 120 0x1000));

variable LLVMFLAGS is default($if($equal($FORCELLVM,1),-x 189 0x10));
# Relocatable device code default: 1 when a cuf/CUF/cu/cup source is used or
# "tesla" is in $ACCELS. The chain of DEF* variables provides successive
# override points; the -acc switch sets DEFDEFDEFACCRELOC.
variable DEFDEFDEF4ACCRELOC is default($if($or($suffixused(cuf,CUF),$suffixused(cu,cup),$contains($ACCELS,tesla)),1,0));
variable DEFDEFDEFACCRELOC is default($DEFDEFDEF4ACCRELOC);
variable DEFDEFACCRELOC is default($DEFDEFDEFACCRELOC);
variable DEFACCRELOC is default($DEFDEFACCRELOC);
variable ACCRELOC is default($DEFACCRELOC);    # default is now always link
variable ACCLLVM is default();
variable GGPUFILE is default(0);
# True when $DEBUGFLAG is set and $TARGET equals linux86-64.
variable DEBUGFLAG_64 is default($and($DEBUGFLAG,$equal($TARGET,linux86-64)));

# OpenACC Flags
# -acc={gpu|gpu,multicore|multicore}
#         gpu                     OpenACC directives compiled for GPU execution only
#         gpu,host                (default) OpenACC directives compiled for GPU (default) or multicore CPU execution (UNTIL WE IMPLEMENT GPU,MULTICORE)
#         gpu,multicore           (default) OpenACC directives compiled for GPU (default) or multicore CPU execution (NOT READY YET)
#         multicore               OpenACC directives compiled for multicore CPU execution only

switch -acc is
    help(Enable OpenACC directives)
    helpname(-acc)
    helpgroup(target)
    set(LNGACC=1)
    set(NEEDLOCSCRIPT=1)

    # Common settings
    set(nkey=0)
    set(DEFACC=)
    set(DEF901ACC=)
    append(ACCCGFLAGS=-x 180 0x4000400 -x 121 0xc00)
    # Support for exposing `acc_get_device_type` in statically linked application, needed by LIBCUPTI
    set(NEEDNVHPCLDSYMS=1)

    # ACCDEPRECATE PGI Accelerator Directives
    append(ACCCGFLAGS=$PADDFLAG)
    append(ACCFEFLAGS=$PADDFLAG)

    set(ACCDEF=$foreach(f,$ACCDEFINES, -D$f))
    set(ACCDEFDEF=$foreach(f,$ACCDEFINES, -def $f))
    set(NEEDACCLIB=1)
    append(CPP1ARGS=--accel --preinclude openacc_predef.h)
    fatal($ifn($contains($SYSACCELS,tesla),Target accelerator -acc=gpu is not supported for $PGSYS-$PGLEN systems))
    add(nkey=1)
    max(OPTLEVELINITDEF1=2)
    # Common settings

    # GPU
    set(ACCTESLAONLY=$DEFTESLAONLY)
    set(DEFDEFDEFACCRELOC=$if($expr($TGTACC & $TGTGPU),1,$DEFDEFDEF4ACCRELOC))

    append(OPTLEVELMINLIMIT=$if($expr($TGTACC & $TGTGPU)," -acc"))
    append(OPTLEVELMINLIMIT=$if($expr($TGTACC & $TGTGPU)," -ta=acc"))

    append(ACCFEFLAGS=$if($expr($TGTACC & $TGTGPU), -x 180 0x400 -x 163 0x1 $TOOLKITFLAG $ACCCACHE))
    append(ACCFEFLAGS=$if($expr($TGTACC & $TGTGPU), -x 186 0x80000 $DEF901ACC))

    append(ACCCGFLAGS=$if($expr($TGTACC & $TGTGPU),-x 180 0x4000400 $DEFACC $ACCCACHE -x 121 0xc00 -x 194 0x40000))
    append(ACCCGFLAGS=$if($expr($TGTACC & $TGTGPU),-x 163 0x1 -x 186 0x80000 $TOOLKITFLAG))

    append(ACCELS=$if($expr($TGTACC & $TGTGPU),tesla))
    set(CUDARPATHNEEDED=$if($ISCUDARTNEEDED,1,0))
    set(CHECKCUDALIB=$if($ISCUDARTNEEDED,1,0))

    set(DDEFACCEL=$if($expr($TGTACC & $TGTGPU),$DEFACCEL))
    set(ACCFLAGS=$if($expr($TGTACC & $TGTGPU),-x 163 0x1 $TOOLKITFLAG))
    set(ACC901FLAGS=$if($expr($TGTACC & $TGTGPU),-x 163 0x1 $TOOLKITFLAG))

    # Host
    append(ACCELS=$if($expr($TGTACC & $TGTSEQ),$if($and($not($or($ANYCUF,$ANYCU,$TGTCUDA)),$expr($TGTACC & $TGTGPU)),host)))
    # Host
    # GPU

    # Multicore
    set(ACCMULTI=$if($or($TACPU,$expr($TGTACC & $TGTCPU)),1))
    set(ACCMULTIONLY=$DEFMULTIONLY)
    set(DEFTESLAONLY=$if($or($TACPU,$expr($TGTACC & $TGTCPU)),0,1)) # Keep value at one if Multicore was not selected
    append(OPTLEVELMINLIMIT=$if($or($TACPU,$expr($TGTACC & $TGTCPU))," -ta=multicore"))
    append(ACCFEFLAGS=$if($or($TACPU,$expr($TGTACC & $TGTCPU)),-x 163 1 -x 186 0x80000 -x 180 0x400))
    append(ACCCGFLAGS=$if($or($TACPU,$expr($TGTACC & $TGTCPU)),-x 163 1 -x 186 0x80000 -x 180 0x400 -x 121 0xc00))
    # Multicore

    keyword(
        gpu(
            help(OpenACC directives are compiled for GPU execution only; please refer to -gpu for target specific options)
            set(TGLACC=1)
            set(TGLACCGPU=$TGTGPU)
            set(TGTACC=$expr($TGLACCCPU | $TGLACCGPU | $TGLACCSEQ))
        )
        host(
            set(TGLACC=1)
            set(TGLACCSEQ=$TGTSEQ)
            set(TGTACC=$expr($TGLACCCPU | $TGLACCGPU | $TGLACCSEQ))
            set(THISACCTARGET=host)
            set(NEEDACCLIB=1)
            add(nkey=1)
            help(Compile for serial execution on the host CPU)
            append(ACCELS=host)
            keyword(
                time(hide
                    help(Collect simple timing information for the host version)
                    set(TIMEINIT=1)
                    set(DOACCLINK=1)
                )
            )
        )
        multicore(
            help(Compile for parallel execution on the host CPU)
            set(TGLACC=1)
            set(TGLACCCPU=$TGTCPU)
            set(TGTACC=$expr($TGLACCCPU | $TGLACCGPU | $TGLACCSEQ))
            keyword(
                trace(hide
                    set(MULTICORETRACE=$if($expr($TGTACC & $TGTCPU),1))
                )
                notrace(hide
                    set(MULTICORETRACE=$if($expr($TGTACC & $TGTCPU),0))
                )
                guided(hide
                    help(Use guided loop scheduling)
                    error($if($expr($TGTACC & $TGTCPU),$ifn($equal($PGLLVMTARGET,yes), guided suboption for -acc=multicore may only be used with the PGI LLVM compilers)))
                    append(ACCCGFLAGS=$if($expr($TGTACC & $TGTCPU),-x 210 0x20))
                )
                dbg(hide
                    help(Use debug version of OpenMP RT library)
                    set(OMPLIBDEBUG=_debug)
                )
                libomp(hide
                    help(Link with LLVM OpenMP library)
                    set(OMPPRESTDINC=$COMPBASE/$COMPSYS/$COMPVER/$quote($COMPINCPREFIX)include/libomp)
                    set(COMPLIBMP=$ifn($USEOTHEROMPLIB,$foreach(dd,$COMPLIBSUBDIR,$if($isdir($COMPBASE/$COMPSYS/$COMPVER/$dd/mp),$COMPBASE/$COMPSYS/$COMPVER/$dd/mp )),$LIBOMPPATH))
                    set(STDRPATHMP=$ifn($USEOTHEROMPLIB,$foreach(dd,$COMPLIBSUBDIR,$if($isdir($COMPBASE/$COMPSYS/$COMPVER/$dd/mp),-rpath $COMPBASE/$COMPSYS/$COMPVER/$dd/mp )),$LIBOMPRPATH))
                    set(USEOTHEROMPLIB=1)
                )
            )
        )
        stub(hide
            help(Link in the OpenACC stub library)
            set(NEEDACCSTUBLIB=1)
        )
        autopar(
            helpname([no]autopar)
            help(Enable (default) or disable loop autoparallelization within acc parallel)
            set(ACCAUTOPAR=$if($expr($TGTACC & $TGTALL),1))
        )
        noautopar(hide
            help(Disable loop autoparallelization within acc parallel)
            set(ACCAUTOPAR=$if($expr($TGTACC & $TGTALL),0))
        )
        defnone(hide
            help(Implicit default(none) on all compute construct)
            append(ACCCGFLAGS=$if($expr($TGTACC & $TGTALL),-x 194 0x8000))
        )
        defpresent(hide
            help(Implicit default(present) on all compute construct)
            append(ACCCGFLAGS=$if($expr($TGTACC & $TGTALL),-x 194 0x10000000))
        )
        required(hide
            help(Issue compiler error if the compute regions fail to accelerate)
            helpname([no]required)
            set(ACCREQUIRED=$if($expr($TGTACC & $TGTALL),1))
        )
        norequired(hide
            help(Generate host code if the compute regions fail to accelerate)
        )
        routinepar(
            if($not($ISFTN))
            help(Infer parallelism level in implicit routines for the device)
            helpname([no]routinepar)
            set(ACCROUTINEPAR=$if($expr($TGTACC & $TGTALL),1))
        )
        noroutinepar(hide
            if($not($ISFTN))
            help(Do not infer parallelism level in implicit routines for the device)
            set(ACCROUTINEPAR=$if($expr($TGTACC & $TGTALL),0))
        )

        routineseq(
            help(Compile every routine for the device)
            helpname([no]routineseq)
            set(ACCBUILDLIB=$if($expr($TGTACC & $TGTALL),1))
        )
        noroutineseq(hide
            help(Do not compile every routine for the device)
        )
        scalar(hide
            help(Generate scalar code for all OpenACC compute constructs)
            append(ACCCGFLAGS=$if($expr($TGTACC & $TGTALL),-x 194 0x1000))
        )
        legacy(
            help(Suppress warnings about deprecated PGI accelerator directives)
            set(PADDFLAG=$if($expr($TGTACC & $TGTALL),))
        )
        strict(
            help(Issue warnings for non-OpenACC accelerator directives)
            append(ACCCGFLAGS=$if($expr($TGTACC & $TGTALL),-x 186 0x100000))
            append(ACCFEFLAGS=$if($expr($TGTACC & $TGTALL),-x 186 0x100000))
        )
        verystrict(
            help(Fail with an error for any non-OpenACC accelerator directive)
            append(ACCCGFLAGS=$if($expr($TGTACC & $TGTALL),-x 186 0x300000))
            append(ACCFEFLAGS=$if($expr($TGTACC & $TGTALL),-x 186 0x300000))
            append(CPP1ARGS=$if($expr($TGTACC & $TGTALL), --accel_verystrict))
        )
        task(hide
            help(Enable async clause on multicore)
            append(ACCCGFLAGS=$if($expr($TGTACC & $TGTALL),-x 210 0x10))
        )
        sync(
            help(Ignore async clauses)
            set(ACCWAIT=$if($expr($TGTACC & $TGTALL),2))
        )
        wait(
            helpname([no]wait)
            help(Wait for each device kernel to finish)
            set(ACCWAIT=$if($expr($TGTACC & $TGTALL),1))
        )
        nowait(hide
            help(Execute device kernels asynchronously)
            set(ACCWAIT=$if($expr($TGTACC & $TGTALL),0))
        )
        cache(hide
            set(ACCCACHE=$if($expr($TGTACC & $TGTALL),-x 163 0x100))
        )
        nocache(hide
            set(ACCCACHE=$if($expr($TGTACC & $TGTALL),-y 163 0x100))
        )
        autooffload(hide
            help(Automatically offload do concurrent and parallelized loops)
            append(ACCCGFLAGS=$if($expr($TGTACC & $TGTALL),-x 215 0x4000))
        )
        hpsums(hide
            help(Compute sum reductions in high precision)
            append(ACCCGFLAGS=$if($expr($TGTACC & $TGTALL),-x 215 0x80000))
        )
        noldscript(hide
            help(Disable OpenACC linker script)
            set(LOCSCRIPT=0)
        )
    )
    # GPU
    append(ACCCGFLAGS=$if($expr($TGTACC & $TGTGPU),$DEFAULTCAPFLAG))
    append(ACCCGFLAGS=$if($expr($TGTACC & $TGTGPU),$if($notequal($USECUDAROOT,),-cudaroot $USECUDAROOT)))
    # GPU
    # Multicore
    append(ACCELS=$if($or($TACPU,$expr($TGTACC & $TGTCPU)),multicore))
    append(ACCCGFLAGS=$if($or($TACPU,$expr($TGTACC & $TGTCPU)),$if($MULTICORETRACE, -x 210 8, -y 210 8)))
    set(ACCMULTI=$if($or($TACPU,$expr($TGTACC & $TGTCPU)),1))
    # Multicore
    # OpenMP libraries are always needed
    set(OMPLIB=$if($equal($PGLLVMTARGET,yes), $if($USEOTHEROMPLIB,$LIBOMP,$NVOMPLIBS)))
    nokeyword();

switch -noacc is
    helpgroup(target)
    helpname(-noacc)
    help(Disable OpenACC directives and do not link with OpenACC libraries.)

    # Clear the per-target OpenACC selections (multicore CPU, GPU, serial host).
    set(TGLACCCPU=0)
    set(TGLACCGPU=0)
    set(TGLACCSEQ=0)
    # Clear the TGLACC and LNGACC toggles as well.
    set(TGLACC=0)
    set(LNGACC=0)
    # Recompute the combined target mask from the selections cleared above.
    set(TGTACC=$expr($TGLACCCPU | $TGLACCGPU | $TGLACCSEQ));

# Result of the cudadriver() driver action; left empty unless DETECTCUDA is set.
variable CUDA_DRIVER_VERSION is default($if($DETECTCUDA,$action(cudadriver())));
# CUDA_DRIVER_VERSION with the "." characters removed so it can be compared inside $expr().
variable CUDA_DRIVER_VERSION_STR is default($if($DETECTCUDA,$remove($CUDA_DRIVER_VERSION,.)));
# Text for the missing-toolkit error below: "11.0 or $CUDAVERSION" when the
# dot-stripped driver version is greater than 110, otherwise just $CUDAVERSION.
variable CUDA_DRIVERS_SUPPORTED is default($if($DETECTCUDA,$if($expr($CUDA_DRIVER_VERSION_STR>110),11.0 or $CUDAVERSION,$CUDAVERSION)));

# BLANKCUDALIB: CUDALIBDIR is empty while CHECKCUDALIB is 1 and IGNORECUDALIB is empty.
variable BLANKCUDALIB is default($and($equal($CUDALIBDIR,),$equal($CHECKCUDALIB,1),$equal($IGNORECUDALIB,)));
# BADCUDALIB: CUDALIBDIR is set but is not an existing directory, under the same
# CHECKCUDALIB / IGNORECUDALIB conditions.
variable BADCUDALIB is default($and($notequal($CUDALIBDIR,),$not($isdir($CUDALIBDIR)),$equal($CHECKCUDALIB,1),$equal($IGNORECUDALIB,)));
# Driver-level sanity checks: each error() fires only when its $if() condition is true.
error($if($BLANKCUDALIB,A CUDA toolkit matching the current driver version ($CUDA_DRIVER_VERSION) or a supported older version ($CUDA_DRIVERS_SUPPORTED) was not installed with this HPC SDK.));
error($if($BADCUDALIB,CUDA version $CUDAXDY is not available in this installation.));
error($if($and($MANAGED,$AUTOCOMPARE),The -acc=gpu suboptions managed and autocompare are not compatible with each other));
error($if($and($KNL,$contains($ACCELS,tesla)),OpenACC for Tesla GPU targets is not supported on Knights Landing host systems));

# Hidden switch -Mqqa,<qflag>: records the option unchanged in IPAADD and
# appends "-qqa" followed by <qflag> with its commas replaced by spaces to CGARGS.
switch -Mqqa,qflag is hide #not
    help(Pass qqa flag to compiler)
    helpgroup(overall)
    append(IPAADD=-Mqqa,$qflag)
    append(CGARGS=-qqa $replace($qflag,",", ));

# Target-only macros: PGI_TESLA_TARGET when ACCTESLAONLY is set and
# PGI_MULTICORE_TARGET when ACCMULTIONLY is set. USRDDEF receives the -D form
# and USRDEFDEF the equivalent "-def" form.
append USRDDEF=$if($ACCTESLAONLY,-DPGI_TESLA_TARGET) $if($ACCMULTIONLY,-DPGI_MULTICORE_TARGET);
append USRDEFDEF=$if($ACCTESLAONLY,-def PGI_TESLA_TARGET) $if($ACCMULTIONLY,-def PGI_MULTICORE_TARGET);

# One __NVCOMPILER_OPENACC_<target> macro per active OpenACC target
# (ACCGPU, ACCMC, ACCHOST), again in both the -D and "-def" forms.
append USRDDEF=$if($ACCGPU, -D__NVCOMPILER_OPENACC_GPU) $if($ACCMC, -D__NVCOMPILER_OPENACC_MULTICORE) $if($ACCHOST, -D__NVCOMPILER_OPENACC_HOST);
append USRDEFDEF=$if($ACCGPU, -def __NVCOMPILER_OPENACC_GPU) $if($ACCMC, -def __NVCOMPILER_OPENACC_MULTICORE) $if($ACCHOST, -def __NVCOMPILER_OPENACC_HOST);

# Suffix of the version-specific cudafor library, chosen from $CUDAXXYY:
#   < 11030 -> _110, < 11080 -> _113, < 12000 -> _118, < 12070 -> _120, else _128.
# NOTE(review): presumably CUDAXXYY encodes the toolkit version numerically
# (e.g. 11080 for 11.8) -- confirm against its definition.
variable CUDAFORLIBSUF is default($if($expr($CUDAXXYY<11030),_110,
                                  $if($expr($CUDAXXYY<11080),_113,
                                  $if($expr($CUDAXXYY<12000),_118,
                                  $if($expr($CUDAXXYY<12070),_120,_128)))));
# Link argument for the versioned cudafor library; only produced when the
# driver language is Fortran or $ISCUDAFORLIBS is set.
variable CUDAFORLIB_SELECTION is default($if($lor($equal($DRIVERLANG,Fortran),$ISCUDAFORLIBS), $(LIBSW)cudafor$CUDAFORLIBSUF));
# -cudalib[=lib,...]: link against the CUDA-optimized math/communication
# libraries. Each keyword sets a NEED<lib> request flag; with no keyword,
# NEEDCUDAALL is set instead. The append() statements build two parallel lists
# that are copied to LDLIBS / MATHCUDALIB at the end of the switch only when
# $NEEDCUDALIB is set:
#   LDLIBSLIST      - library directories and $(LPRE)-prefixed library names
#   MATHCUDALIBLIST - $(LIBSW)/$(LIBSW_NOLIB) link arguments, including the
#                     Fortran wrapper libraries when $ISFTN or $ISCUDAFORLIBS
# The NEEDCUDA<lib> variables tested below are defined outside this switch
# (presumably derived from NEED<lib> and NEEDCUDAALL -- confirm).
switch -cudalib is
    help(Add appropriate versions of the CUDA-optimized libraries)
    helpgroup(linker)
    set(CUDALIBNEEDED=1)
    set(ISCUDALIB=1)

    # On Windows, the acc runtime isn't built, so do not require it.
    set(NEEDACCLIB=$if($or($and($notequal($TARGET,win64),$notequal($TARGET,win64-llvm)),$equal($NEEDACCLIB,1)),1,0))

    # Math and Comm Libraries Paths (double check)
    append(LDLIBSLIST=$if($NEEDCUDACOMMPATHS,$if($notequal($COMMLIBSLDDIR,),$COMMLIBSLDDIR)) $if($NEEDCUDAMATHPATHS,$if($notequal($CUDAMATHLIBDIR,),$CUDAMATHLIBDIR)) $if($NEEDCUPTI,$if($notequal($CUDACUPTILIBDIR,),$CUDACUPTILIBDIR)) $if($notequal($CUDALIBDIR,),$(LPRE)$CUDALIBDIR))

    # CUBLASMP
    # NOTE(review): the two conditions differ (linux86-64 and CUDAXXYY>11040 for
    # LDLIBSLIST vs. CUDAXXYY>=11020 with no target test for MATHCUDALIBLIST) --
    # confirm this is intended.
    append(LDLIBSLIST=$if($land($NEEDCUDACUBLASMP,$index($TARGET,linux86-64),$expr($CUDAXXYY>11040)),$(LPRE)cublasMp$CUDALIBSTATIC))
    append(MATHCUDALIBLIST=$if($land($NEEDCUDACUBLASMP,$expr($CUDAXXYY>=11020)),$(LIBSW_NOLIB)cublasmp$CUDALIBSTATIC
            $if($lor($ISFTN,$ISCUDAFORLIBS),$(LIBSW_NOLIB)nvhpcwrapcublasmp$CUDALIBSTATIC)
            $if($lor($ISFTN,$ISCUDAFORLIBS),$(LIBSW_NOLIB)nvhpcwrapcal$CUDALIBSTATIC)
            $PGIUNSTATICX $(LIBSW_NOLIB)cal $PGISTATICX))

    # CUFFT
    append(LDLIBSLIST=$if($NEEDCUDACUFFT,$(LPRE)cufft$CUDALIBSTATIC))
    append(MATHCUDALIBLIST=$if($NEEDCUDACUFFT,
            $if($lor($ISFTN,$ISCUDAFORLIBS),$(LIBSW_NOLIB)nvhpcwrapcufft)
            $if($notequal($PGISTATICX,),$if($NEEDCUDACUFFTCALLBACK,$(LIBSW_NOLIB)cufft_static,$(LIBSW_NOLIB)cufft_static_nocallback),$(LIBSW_NOLIB)cufft)))

    # CUFFTW
    append(LDLIBSLIST=$if($NEEDCUDACUFFTW,$(LPRE)cufftw$CUDALIBSTATIC))
    append(MATHCUDALIBLIST=$if($NEEDCUDACUFFTW,$if($notequal($PGISTATICX,),$(LIBSW_NOLIB)cufftw_static $(LIBSW_NOLIB)cufft_static_nocallback,$(LIBSW_NOLIB)cufftw)))

    # CUFFTMP
    # NOTE(review): the second target alternative differs between the two lists
    # (linuxpower with CUDAXXYY>=11040 vs. linuxarm64 with CUDAXXYY>=12000), as
    # does the library-name casing (cuFFTMp vs. cufftMp) -- confirm.
    append(LDLIBSLIST=$if($land($NEEDCUDACUFFTMP,$lor($land($index($TARGET,linux86-64),$expr($CUDAXXYY>=11020)),$land($index($TARGET,linuxpower),$expr($CUDAXXYY>=11040)))),$(LPRE)cuFFTMp$CUDALIBSTATIC))
    append(MATHCUDALIBLIST=$if($land($NEEDCUDACUFFTMP,$lor($land($index($TARGET,linux86-64),$expr($CUDAXXYY>=11020)),$land($index($TARGET,linuxarm64),$expr($CUDAXXYY>=12000)))),
            $if($lor($ISFTN,$ISCUDAFORLIBS),$(LIBSW_NOLIB)nvhpcwrapcufftmp)
            $(LIBSW_NOLIB)cufftMp$CUDALIBSTATIC))

    # CURAND
    append(LDLIBSLIST=$if($NEEDCUDACURAND,$(LPRE)curand$CUDALIBSTATIC))
    append(MATHCUDALIBLIST=$if($NEEDCUDACURAND,$if($lor($ISFTN,$ISCUDAFORLIBS),$(LIBSW)cudaforwraprand) $(LIBSW_NOLIB)curand$CUDALIBSTATIC))

    # CUSOLVER
    append(LDLIBSLIST=$if($NEEDCUDACUSOLVER,$(LPRE)cusolver$CUDALIBSTATIC))
    append(MATHCUDALIBLIST=$if($NEEDCUDACUSOLVER,$(LIBSW_NOLIB)cusolver$CUDALIBSTATIC $if($notequal($PGISTATICX,),$(LIBSW_NOLIB)cusparse$CUDALIBSTATIC $if($expr($CUDAXXYY>=11070),$(LIBSW_NOLIB)cusolver_lapack$CUDALIBSTATIC,$(LIBSW_NOLIB)lapack$CUDALIBSTATIC)) $if($expr($CUDAXXYY>=12000),$(LIBSW_NOLIB)nvJitLink$CUDALIBSTATIC)))

    # CUSOLVERMP
    append(LDLIBSLIST=$if($land($NEEDCUDACUSOLVERMP,$expr($CUDAXXYY>11040)),$(LPRE)cusolverMp$CUDALIBSTATIC))
    append(MATHCUDALIBLIST=$if($land($NEEDCUDACUSOLVERMP,$expr($CUDAXXYY>11040)),$(LIBSW_NOLIB)cusolverMp$CUDALIBSTATIC 
            $if($lor($ISFTN,$ISCUDAFORLIBS),$(LIBSW_NOLIB)nvhpcwrapcusolvermp$CUDALIBSTATIC)
            $if($lor($ISFTN,$ISCUDAFORLIBS),$(LIBSW_NOLIB)nvhpcwrapcal$CUDALIBSTATIC)
            $if($expr($CUDAXXYY>=12000),$(LIBSW_NOLIB)nvJitLink$CUDALIBSTATIC)
            $PGIUNSTATICX $(LIBSW_NOLIB)cal $PGISTATICX))

    # CUSPARSE
    append(LDLIBSLIST=$if($NEEDCUDACUSPARSE,$(LPRE)cusparse$CUDALIBSTATIC))
    append(MATHCUDALIBLIST=$if($NEEDCUDACUSPARSE,
            $if($lor($ISFTN,$ISCUDAFORLIBS),
                $if($and($expr($CUDAXXYY>=11000),$expr($CUDAXXYY<12000)),$(LIBSW)cudaforwrapsparse11)
                $if($expr($CUDAXXYY>=12000),$(LIBSW)cudaforwrapsparse12))
            $(LIBSW_NOLIB)cusparse$CUDALIBSTATIC
            $if($expr($CUDAXXYY>=12000),$(LIBSW_NOLIB)nvJitLink$CUDALIBSTATIC)))

    # CUPTI
    append(LDLIBSLIST=$if($NEEDCUPTI,$(LPRE)cupti$CUDALIBSTATIC))
    append(MATHCUDALIBLIST=$if($NEEDCUPTI,$(LIBSW_NOLIB)cupti$CUDALIBSTATIC))

    # CUTENSOR
    append(LDLIBSLIST=$if($NEEDCUDACUTENSOR,$(LPRE)cutensor$CUDALIBSTATIC $(LPRE)cutensorMg$CUDALIBSTATIC))
    append(MATHCUDALIBLIST=$if($NEEDCUDACUTENSOR,
            $if($lor($ISFTN,$ISCUDAFORLIBS),
            $(LIBSW)cudaforwraptensor
                $if($expr($CUDAXXYY<11080),$(LIBSW)cudaforwraptensor_113)
                $if($expr($CUDAXXYY>=11080),$(LIBSW)cudaforwraptensor_118))
            $(LIBSW_NOLIB)cutensor$CUDALIBSTATIC $(LIBSW_NOLIB)cutensorMg$CUDALIBSTATIC))

    # NVBLAS
    # There is no static version so force dynamic linking
    append(LDLIBSLIST=$if($NEEDCUDANVBLAS,$(LPRE)nvblas))
    append(MATHCUDALIBLIST=$if($NEEDCUDANVBLAS,$PGIUNSTATICX $(LIBSW_NOLIB)nvblas $PGISTATICX))

    # NCCL
    append(LDLIBSLIST=$if($NEEDCUDANCCL,$(LPRE)nccl$CUDALIBSTATIC))
    append(MATHCUDALIBLIST=$if($NEEDCUDANCCL,$if($lor($ISFTN,$ISCUDAFORLIBS),$(LIBSW_NOLIB)cudaforwrapnccl) $(LIBSW_NOLIB)nccl$CUDALIBSTATIC))

    # NVSHMEM
    # NOTE(review): these two lists end with $PGISTATIC, whereas every other
    # $PGIUNSTATICX ... pair in this switch is closed with $PGISTATICX -- confirm
    # which variable is intended.
    append(LDLIBSLIST=$if($NEEDCUDANVSHMEM,$(LPRE)nvshmem_device $PGIUNSTATICX $(LPRE)nvshmem_host -L$(CUDALIBDIR)/stubs $(LPRE)nvidia-ml $PGISTATIC))
    append(MATHCUDALIBLIST=$if($NEEDCUDANVSHMEM,$(LIBSW_NOLIB)nvhpcwrapshmem $(LIBSW_NOLIB)nvshmem_device $PGIUNSTATICX $(LIBSW_NOLIB)nvshmem_host -L$(CUDALIBDIR)/stubs $(LIBSW_NOLIB)nvidia-ml $PGISTATIC))

    # NVLAMATH
    # $(NVLAMATH_SFX) is set to "_ilp64" by the nvlamath_ilp64 keyword below.
    append(LDLIBSLIST=$if($NEEDCUDANVLAMATH,$(LPRE)nvlamath $(LPRE)blas$(NVLAMATH_SFX) $(LPRE)lapack$(NVLAMATH_SFX)))
    append(MATHCUDALIBLIST=$if($NEEDCUDANVLAMATH,$(LIBSW_NOLIB)nvlamath $(LIBSW_NOLIB)blas$(NVLAMATH_SFX) $(LIBSW_NOLIB)lapack$(NVLAMATH_SFX)))

    # NVTX3
    # Links nvToolsExt when CUDAXXYY<12090 and nvtx3interop otherwise.
    append(MATHCUDALIBLIST=$if($NEEDCUDANVTX3,$if($lor($ISFTN,$ISCUDAFORLIBS),$(LIBSW_NOLIB)nvhpcwrapnvtx) $ASNEEDED $PGIUNSTATICX $if($expr($CUDAXXYY<12090),$(LIBSW_NOLIB)nvToolsExt,$(LIBSW_NOLIB)nvtx3interop) $PGISTATICX $NOASNEEDED))

    # CUBLAS
    append(LDLIBSLIST=$if($NEEDCUDACUBLAS,$(LPRE)cublas$CUDALIBSTATIC))
    # Needed when `cusolver` option is requested and we are statically linking
    append(MATHCUDALIBLIST=$if($or($NEEDCUDACUBLAS,$and($or($NEEDCUDACUSOLVER,$NEEDCUDACUTENSOR),$notequal($PGISTATICX,))),$(LIBSW_NOLIB)cublas$CUDALIBSTATIC
            $ifn($contains($COMPUTECAPS,30),$if($expr($CUDAXXYY>10000),$(LIBSW_NOLIB)cublasLt$CUDALIBSTATIC))
            $if($lor($ISFTN,$ISCUDAFORLIBS),$(LIBSW)cudaforwrapblas $if($expr($CUDAXXYY>11060),$(LIBSW)cudaforwrapblas117))))

    # $CULIBOSSTATIC is appended once when any of cublas, cufft or cufftw is requested.
    append(MATHCUDALIBLIST=$if($or($NEEDCUDACUBLAS,$NEEDCUDACUFFT,$NEEDCUDACUFFTW),$CULIBOSSTATIC))

    # CUDA Interoperability
    set(cudaforlib=)
    # when using -static-nvidia we need to group libcudaforXY.a and libcudafor.a to preserve the ordering and resolve symbols.
    append(cudaforlib=$CUDAFORLIB_SELECTION)
    append(cudaforlib=$if($equal($DRIVERLANG,Fortran),$(LIBSW)cudafor))
    append(cudaforlib=$if($expr($CUDAXXYY>=10010),$lookup($COMPLIBOBJ,cuda_init_register_end.$OBJSUFFIX)))
    set(cudafor2lib=$if($equal($DRIVERLANG,Fortran), $(LIBSW)cudafor2, $(LIBSW)cudanvhpc))
    # CUDA Interoperability

    # Suboptions. Each keyword records the NEED<lib> request(s) for one library:
    # cublasmp also requests cublas, cusolvermp also requests nccl, and
    # nvlamath / nvlamath_ilp64 also request cusolver, cublas and cutensor.
    # curand and nvshmem additionally append an ACCCGFLAGS flag. cublasmp is
    # rejected with an error on linuxpower targets.
    keyword(
        cublas(
            set(NEEDCUBLAS=1)
        )
        cublasmp(
            error($if($index($TARGET,linuxpower),The option '-cudalib=cublasmp' is supported on x86_64 and aarch64 architectures.))
            set(NEEDCUBLAS=1)
            set(NEEDCUBLASMP=1)
        )
        cufft(
            set(NEEDCUFFT=1)
            keyword(
                callback(
                    set(NEEDCUFFTCALLBACK=1)
                )
            )
        )
        cufftw(
            set(NEEDCUFFTW=1)
        )
        cufftmp(
            set(NEEDCUFFTMP=1)
        )
        curand(
            set(NEEDCURAND=1)
            append(ACCCGFLAGS=-x 186 0x200)
        )
        cusolver(
            set(NEEDCUSOLVER=1)
        )
        cusolvermp(
            set(NEEDCUSOLVERMP=1)
            set(NEEDNCCL=1)
        )
        cusparse(
            set(NEEDCUSPARSE=1)
        )
        cupti(
            set(NEEDCUPTI=1)
        )
        cutensor(
            set(NEEDCUTENSOR=1)
        )
        nvblas(
            set(NEEDNVBLAS=1)
        )
        nccl(
            set(NEEDNCCL=1)
        )
        nvshmem(
            set(NEEDNVSHMEM=1)
            append(ACCCGFLAGS=-x 186 0x100)
        )
        nvlamath(
            set(NEEDCUSOLVER=1)
            set(NEEDCUBLAS=1)
            set(NEEDCUTENSOR=1)
            set(NEEDNVLAMATH=1)
        )
        nvlamath_ilp64(
            set(NEEDCUSOLVER=1)
            set(NEEDCUBLAS=1)
            set(NEEDCUTENSOR=1)
            set(NEEDNVLAMATH=1)
            set(NVLAMATH_SFX="_ilp64")
        )
        nvtx3(
            set(NEEDNVTX3=1)
        )
    )
    nokeyword(
        set(NEEDCUDAALL=1)
    )

    # Pass an explicit CUDA root to the code generator when one is configured,
    # and wrap the cudafor link lists in the $PGISTATICX / $PGIUNSTATICX pair.
    append(CGARGS=$if($notequal($CUDAROOT,),-cudaroot $CUDAROOT))
    set(LCUDAFORLIB=$PGISTATICX $cudaforlib $PGIUNSTATICX)
    set(LCUDAFOR2LIB=$PGISTATICX $cudafor2lib $PGIUNSTATICX)
    set(CUDANEEDED=1)
    set(LRTLIB=$LRTLIBNAME)
    append(CGARGS=$if($equal($DRIVERLANG,Fortran),$DEFAULTCAPFLAG $TOOLKITFLAG))

    # Some math libraries need libstdc++ when statically linked, the nvshmem library is only static and it is always needed.
    append(DEFSTDLIBS=$if($or($notequal($PGISTATICX,),$NEEDCUDANVSHMEM),$(LIBSW)$CUBLASSTDLIB))
    append(CUDALIB=$if($or($and($NEEDCUDANVSHMEM,$not($NEEDACCLIB)),$NEEDCUDACUSPARSE),$(LIBSW_NOLIB)cuda))

    # Publish the accumulated lists; cudart is always appended after the
    # requested math libraries.
    append(LDLIBS=$if($NEEDCUDALIB,$LDLIBSLIST))
    append(MATHCUDALIB=$if($NEEDCUDALIB,$PGISTATICX $CUDAASNEEDED $MATHCUDALIBLIST $LIBCULIBOS $(LIBSW_NOLIB)cudart$CUDALIBSTATIC $CUDAASNONEEDED $PGIUNSTATICX));

# -cudaforlibs: link the CUDA Fortran runtime libraries regardless of the driver
# language. Sets ISCUDAFORLIBS (which also turns on the Fortran wrapper
# libraries selected under -cudalib), rebuilds cudaforlib / cudafor2lib without
# the Fortran-only conditions used by -cudalib, and expands to -fortranlibs.
switch -cudaforlibs is
    help(Link in CUDA Fortran libraries, implies '-fortranlibs'.)
    set(ISCUDAFORLIBS=1)
    # CUDA Interoperability
    set(cudaforlib=)
    # when using -static-nvidia we need to group libcudaforXY.a and libcudafor.a to preserve the ordering and resolve symbols.
    append(cudaforlib=$CUDAFORLIB_SELECTION)
    append(cudaforlib=$(LIBSW)cudafor)
    append(cudaforlib=$if($expr($CUDAXXYY>=10010),$lookup($COMPLIBOBJ,cuda_init_register_end.$OBJSUFFIX)))
    set(cudafor2lib=$(LIBSW)cudafor2)
    # CUDA Interoperability
    # Pass an explicit CUDA root to the code generator when one is configured.
    append(CGARGS=$if($notequal($CUDAROOT,),-cudaroot $CUDAROOT))
    set(LCUDAFORLIB=$PGISTATICX $cudaforlib $PGIUNSTATICX)
    set(LCUDAFOR2LIB=$PGISTATICX $cudafor2lib $PGIUNSTATICX)
    set(CUDANEEDED=1)
    set(LRTLIB=$LRTLIBNAME)
    # Unlike -cudalib, the capability and toolkit flags are passed unconditionally.
    append(CGARGS=$DEFAULTCAPFLAG $TOOLKITFLAG)
    shorthand(-fortranlibs);

# Hidden plural spelling that expands to -cudalib.
switch -cudalibs is hide shorthand(-cudalib);

# Hidden switch: appends "-x 198 0x40000" to CGARGS. Counterpart of
# -Mautoprivatize, which appends the matching "-y 198 0x40000".
# NOTE(review): presumably -x sets and -y clears the flag bit -- confirm.
switch -Mnoautoprivatize is hide
    help(Disable automatic privatization of arrays in nested scopes)
    append(CGARGS=-x 198 0x40000);

# Hidden switch: appends "-y 198 0x40000" to CGARGS. Counterpart of
# -Mnoautoprivatize, which appends the matching "-x 198 0x40000".
switch -Mautoprivatize is hide
    help(Enable automatic privatization of arrays in nested scopes)
    append(CGARGS=-y 198 0x40000);

# Local rcfile for enabling OpenACC stale data detection tool
cinclude rcfiles/acctoolsrc;

# Phase that reports the CUDA configuration; ordered before NoCompile.
# It echoes the driver version from cudadriver(), the default toolkit version
# from cudatoolkit() (only when it differs from the selected $CUDAXDY), the
# selected version, $CUDAXXYY and $USECUDAROOT.
phase printcudaversion is forall
  before(NoCompile)
  set(detected=$action(cudadriver()))
  set(default=$action(cudatoolkit($COMPBASE/$CUDAMAJOR,,,$OLDESTCUDASHIPPED)))
  set(selected=$CUDAXDY)
  echo(CUDA Driver=$detected)
  echo($ifn($equal($default,$selected),Default CUDA Version=$default))
  echo(Selected CUDA Version=$selected)
  echo(CUDAXXYY=$CUDAXXYY)
  echo(CUDA Path=$USECUDAROOT);

# Hidden switch: enables the printcudaversion phase, forces DETECTCUDA on and
# stops the driver after the NoCompile phase. NoFiles is disabled.
switch -printcudaversion is hide
  help(Show CUDA driver and selected CUDA toolkit version)
  enable(printcudaversion)
  disable(NoFiles)
  enable(NoCompile)
  set(DETECTCUDA=1)
  stopafter(NoCompile);

# Phase that echoes $USECUDAROOT/bin; ordered before NoCompile.
# NOTE(review): `detected` and `default` are assigned but not referenced in
# this phase -- possibly kept for side effects of the actions; confirm before
# removing.
phase printcudadir is forall
  before(NoCompile)
  set(detected=$action(cudadriver()))
  set(default=$action(cudatoolkit($COMPBASE/$CUDAMAJOR,,,$OLDESTCUDASHIPPED)))
  echo($USECUDAROOT/bin);

# Hidden switch: enables the printcudadir phase, forces DETECTCUDA on and
# stops the driver after the NoCompile phase. NoFiles is disabled.
switch -printcudadir is hide
  help(Show selected CUDA toolkit bin directory)
  enable(printcudadir)
  disable(NoFiles)
  enable(NoCompile)
  set(DETECTCUDA=1)
  stopafter(NoCompile);

# Phase that echoes $CUDAMATHBINDIR; ordered before NoCompile.
# NOTE(review): `detected` and `default` are assigned but not referenced in
# this phase -- possibly kept for side effects of the actions; confirm before
# removing.
phase printmathlibsdir is forall
  before(NoCompile)
  set(detected=$action(cudadriver()))
  set(default=$action(cudatoolkit($COMPBASE/$CUDAMAJOR,,,$OLDESTCUDASHIPPED)))
  echo($CUDAMATHBINDIR);

# Hidden switch: enables the printmathlibsdir phase, forces DETECTCUDA on and
# stops the driver after the NoCompile phase. NoFiles is disabled.
switch -printmathlibsdir is hide
  help(Show selected CUDA toolkit math_libs bin directory)
  enable(printmathlibsdir)
  disable(NoFiles)
  enable(NoCompile)
  set(DETECTCUDA=1)
  stopafter(NoCompile);

# Hidden switch: appends "-x 192 0x400000" to CGARGS. Counterpart of
# -nonvflangcudabclibs, which appends the matching "-y 192 0x400000".
switch -nvflangcudabclibs is hide
  help(Link F18 CUDA BC runtime libraries)
  append(CGARGS=-x 192 0x400000);

# Hidden switch: appends "-y 192 0x400000" to CGARGS. Counterpart of
# -nvflangcudabclibs, which appends the matching "-x 192 0x400000".
switch -nonvflangcudabclibs is hide
  help(Do not link F18 CUDA BC runtime libraries)
  append(CGARGS=-y 192 0x400000);
