======== MAKING CUSTOM LIB =================
make -C ./ti_dl/custom -f makefile
make[1]: Entering directory '/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/ti_dl/custom'
compiling tidl_custom.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
tidl_custom.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I../inc -I../utils/perfsim --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++LoeeV_QwCEZU.il tidl_custom.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 tidl_custom.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn tidl_custom.c -il /tmp/nvc++LoeeV_QwCEZU.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++LoeeV_QwCEZU.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ../inc -I ../utils/perfsim -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/tidl_custom.obj' -asm /tmp/nvc++1oeeFzzHc1-6.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++1oeeFzzHc1-6.ll -S -o /tmp/nvc++DoeexLe7RX_h.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++DoeexLe7RX_h.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/tidl_custom.obj
Unlinking /tmp/nvc++LoeeV_QwCEZU.il
Unlinking /tmp/nvc++noeeN0IsCLBa.s
Unlinking /tmp/nvc++1oeeFzzHc1-6.ll
Unlinking /tmp/nvc++DoeexLe7RX_h.llvm
compiling tidl_custom_maxpooling.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
tidl_custom_maxpooling.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I../inc -I../utils/perfsim --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++Owee4APrJ0oH.il tidl_custom_maxpooling.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 tidl_custom_maxpooling.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn tidl_custom_maxpooling.c -il /tmp/nvc++Owee4APrJ0oH.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++Owee4APrJ0oH.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ../inc -I ../utils/perfsim -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/tidl_custom_maxpooling.obj' -asm /tmp/nvc++Owee4aB9X5Jf.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++Owee4aB9X5Jf.ll -S -o /tmp/nvc++Owee4hZ8buR3.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++Owee4hZ8buR3.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/tidl_custom_maxpooling.obj
Unlinking /tmp/nvc++Owee4APrJ0oH.il
Unlinking /tmp/nvc++Owee4X0YkVS1.s
Unlinking /tmp/nvc++Owee4aB9X5Jf.ll
Unlinking /tmp/nvc++Owee4hZ8buR3.llvm
compiling tidsp/tidl_custom_maxpool_ixX_oxX.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
tidsp/tidl_custom_maxpool_ixX_oxX.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I../inc -I../utils/perfsim --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++nEeeN_q9jtyL.il tidsp/tidl_custom_maxpool_ixX_oxX.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 tidsp/tidl_custom_maxpool_ixX_oxX.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn tidsp/tidl_custom_maxpool_ixX_oxX.c -il /tmp/nvc++nEeeN_q9jtyL.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++nEeeN_q9jtyL.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ../inc -I ../utils/perfsim -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/./tidsp/tidl_custom_maxpool_ixX_oxX.obj' -asm /tmp/nvc++DEeexpF-ub3W.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++DEeexpF-ub3W.ll -S -o /tmp/nvc++fEeepk59nPHz.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++fEeepk59nPHz.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/./tidsp/tidl_custom_maxpool_ixX_oxX.obj
Unlinking /tmp/nvc++nEeeN_q9jtyL.il
Unlinking /tmp/nvc++1EeeFZ4Hh1Gz.s
Unlinking /tmp/nvc++DEeexpF-ub3W.ll
Unlinking /tmp/nvc++fEeepk59nPHz.llvm
compiling tidsp/tidl_custom_maxpool_ixX_oxX_c7x.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
tidsp/tidl_custom_maxpool_ixX_oxX_c7x.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I../inc -I../utils/perfsim --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++cMeegc-VVWBZ.il tidsp/tidl_custom_maxpool_ixX_oxX_c7x.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 tidsp/tidl_custom_maxpool_ixX_oxX_c7x.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn tidsp/tidl_custom_maxpool_ixX_oxX_c7x.c -il /tmp/nvc++cMeegc-VVWBZ.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++cMeegc-VVWBZ.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ../inc -I ../utils/perfsim -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/./tidsp/tidl_custom_maxpool_ixX_oxX_c7x.obj' -asm /tmp/nvc++IMeeMPXq8XYj.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++IMeeMPXq8XYj.ll -S -o /tmp/nvc++YMeew2_VJ2A3.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++YMeew2_VJ2A3.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/./tidsp/tidl_custom_maxpool_ixX_oxX_c7x.obj
Unlinking /tmp/nvc++cMeegc-VVWBZ.il
Unlinking /tmp/nvc++sMee2s3451dA.s
Unlinking /tmp/nvc++IMeeMPXq8XYj.ll
Unlinking /tmp/nvc++YMeew2_VJ2A3.llvm
compiling tidsp/tidl_custom_maxpool_ixX_oxX_cn.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
tidsp/tidl_custom_maxpool_ixX_oxX_cn.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I../inc -I../utils/perfsim --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++pUeeTglS7OkS.il tidsp/tidl_custom_maxpool_ixX_oxX_cn.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 tidsp/tidl_custom_maxpool_ixX_oxX_cn.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn tidsp/tidl_custom_maxpool_ixX_oxX_cn.c -il /tmp/nvc++pUeeTglS7OkS.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++pUeeTglS7OkS.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ../inc -I ../utils/perfsim -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/./tidsp/tidl_custom_maxpool_ixX_oxX_cn.obj' -asm /tmp/nvc++-Uee9Ed55VHz.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++-Uee9Ed55VHz.ll -S -o /tmp/nvc++3UeeLw-Cpy1d.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++3UeeLw-Cpy1d.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/./tidsp/tidl_custom_maxpool_ixX_oxX_cn.obj
Unlinking /tmp/nvc++pUeeTglS7OkS.il
Unlinking /tmp/nvc++hUeev8drKhG7.s
Unlinking /tmp/nvc++-Uee9Ed55VHz.ll
Unlinking /tmp/nvc++3UeeLw-Cpy1d.llvm
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/tidl_custom.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/tidl_custom_maxpooling.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/./tidsp/tidl_custom_maxpool_ixX_oxX.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/./tidsp/tidl_custom_maxpool_ixX_oxX_c7x.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/./tidsp/tidl_custom_maxpool_ixX_oxX_cn.obj
make[1]: Leaving directory '/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/ti_dl/custom'
======== MAKING TIDL ALGO =================
make -C ./ti_dl/algo -f makefile
make[1]: Entering directory '/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/ti_dl/algo'
compiling src/printv.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/printv.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++TjfehdWpwH7G.il src/printv.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/printv.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/printv.c -il /tmp/nvc++TjfehdWpwH7G.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++TjfehdWpwH7G.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/printv.obj' -asm /tmp/nvc++9jfe37Z4LCWP.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++9jfe37Z4LCWP.ll -S -o /tmp/nvc++LjfeVAZXFyXa.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++LjfeVAZXFyXa.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/printv.obj
Unlinking /tmp/nvc++TjfehdWpwH7G.il
Unlinking /tmp/nvc++vjfe-Bq438AP.s
Unlinking /tmp/nvc++9jfe37Z4LCWP.ll
Unlinking /tmp/nvc++LjfeVAZXFyXa.llvm
compiling src/tidl_alg.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_alg.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++VrfenVy2K3Yx.il src/tidl_alg.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_alg.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_alg.c -il /tmp/nvc++VrfenVy2K3Yx.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++VrfenVy2K3Yx.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_alg.obj' -asm /tmp/nvc++FrfeDVtux-J1.ll
TIDL_activate(IALG_Obj*):
   2489, Generating enter data create(scratch_buffer_base[:scratch_buffer_size])
   2495, Generating enter data copyin(coeff_buffer_base[:coeff_buffer_size])
TIDL_deactivate(IALG_Obj*):
   2557, Generating exit data delete(scratch_buffer_base[:scratch_buffer_size])
   2568, Generating exit data delete(coeff_buffer_base[:coeff_buffer_size])
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++FrfeDVtux-J1.ll -S -o /tmp/nvc++xrfefRrcDBT5.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++xrfefRrcDBT5.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_alg.obj
Unlinking /tmp/nvc++VrfenVy2K3Yx.il
Unlinking /tmp/nvc++Nrfe1O_CvlJL.s
Unlinking /tmp/nvc++FrfeDVtux-J1.ll
Unlinking /tmp/nvc++xrfefRrcDBT5.llvm
compiling src/tidl_alg_utils.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_alg_utils.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++WzfeqF7IZVWD.il src/tidl_alg_utils.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_alg_utils.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_alg_utils.c -il /tmp/nvc++WzfeqF7IZVWD.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++WzfeqF7IZVWD.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_alg_utils.obj' -asm /tmp/nvc++WzfeqJs9XG1Q.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++WzfeqJs9XG1Q.ll -S -o /tmp/nvc++Wzfeq9MFIL2Z.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++Wzfeq9MFIL2Z.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_alg_utils.obj
Unlinking /tmp/nvc++WzfeqF7IZVWD.il
Unlinking /tmp/nvc++WzfeqzxY6ceQ.s
Unlinking /tmp/nvc++WzfeqJs9XG1Q.ll
Unlinking /tmp/nvc++Wzfeq9MFIL2Z.llvm
compiling src/tidl_argmax.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_argmax.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++ZHfezB2aiwr6.il src/tidl_argmax.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_argmax.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_argmax.c -il /tmp/nvc++ZHfezB2aiwr6.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++ZHfezB2aiwr6.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_argmax.obj' -asm /tmp/nvc++JHfePQ4N7KBJ.ll
int TIDL_argmaxRefProcess<signed char, unsigned char>(sTIDL_AlgLayer_t*, sTIDL_ArgMaxParams_t*, sTIDL_DataParams_t*, sTIDL_DataParams_t*, signed char*, unsigned char*, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int):
    127, Generating copyin(inData[:imWidth+((inPitch*(imHeight-1))+((inChPitch*(params->numChannels-1))+(inChPitch*(params->numChannels*(numTotRoi-1)))))]) [if not already present]
         Generating implicit firstprivate(imHeight,numTotRoi,imWidth)
         Generating NVIDIA GPU code
        127, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        129,   /* blockIdx.x threadIdx.x collapsed */
        131,   /* blockIdx.x threadIdx.x collapsed */
        135, #pragma acc loop seq
    127, Generating implicit copyin(params) [if not already present]
         Generating copyout(outData[:imWidth+(((imHeight-1)*outPitch)+((numTotRoi-1)*outChPitch))]) [if not already present]
    131, Generating implicit firstprivate(outPitch,outChPitch,maxIdx)
    135, Loop carried scalar dependence for maxVal at line 145
         Generating implicit firstprivate(inChPitch,currVal,maxVal,inPitch)
         Loop carried scalar dependence for maxVal at line 145
    149, Accelerator restriction: induction variable live-out from loop: maxIdx
int TIDL_argmaxRefProcess<unsigned char, unsigned char>(sTIDL_AlgLayer_t*, sTIDL_ArgMaxParams_t*, sTIDL_DataParams_t*, sTIDL_DataParams_t*, unsigned char*, unsigned char*, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int):
    127, Generating copyin(inData[:imWidth+((inPitch*(imHeight-1))+((inChPitch*(params->numChannels-1))+(inChPitch*(params->numChannels*(numTotRoi-1)))))]) [if not already present]
         Generating implicit firstprivate(imHeight,numTotRoi,imWidth)
         Generating NVIDIA GPU code
        127, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        129,   /* blockIdx.x threadIdx.x collapsed */
        131,   /* blockIdx.x threadIdx.x collapsed */
        135, #pragma acc loop seq
    127, Generating implicit copyin(params) [if not already present]
         Generating copyout(outData[:imWidth+(((imHeight-1)*outPitch)+((numTotRoi-1)*outChPitch))]) [if not already present]
    131, Generating implicit firstprivate(outPitch,outChPitch,maxIdx)
    135, Loop carried scalar dependence for maxVal at line 145
         Generating implicit firstprivate(inChPitch,currVal,maxVal,inPitch)
         Loop carried scalar dependence for maxVal at line 145
    149, Accelerator restriction: induction variable live-out from loop: maxIdx
int TIDL_argmaxRefProcess<short, unsigned short>(sTIDL_AlgLayer_t*, sTIDL_ArgMaxParams_t*, sTIDL_DataParams_t*, sTIDL_DataParams_t*, short*, unsigned short*, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int):
    127, Generating copyin(inData[:imWidth+((inPitch*(imHeight-1))+((inChPitch*(params->numChannels-1))+(inChPitch*(params->numChannels*(numTotRoi-1)))))]) [if not already present]
         Generating implicit firstprivate(imHeight,numTotRoi,imWidth)
         Generating NVIDIA GPU code
        127, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        129,   /* blockIdx.x threadIdx.x collapsed */
        131,   /* blockIdx.x threadIdx.x collapsed */
        135, #pragma acc loop seq
    127, Generating implicit copyin(params) [if not already present]
         Generating copyout(outData[:imWidth+(((imHeight-1)*outPitch)+((numTotRoi-1)*outChPitch))]) [if not already present]
    131, Generating implicit firstprivate(outPitch,outChPitch,maxIdx)
    135, Loop carried scalar dependence for maxVal at line 145
         Generating implicit firstprivate(inChPitch,currVal,maxVal,inPitch)
         Loop carried scalar dependence for maxVal at line 145
    149, Accelerator restriction: induction variable live-out from loop: maxIdx
int TIDL_argmaxRefProcess<unsigned short, unsigned short>(sTIDL_AlgLayer_t*, sTIDL_ArgMaxParams_t*, sTIDL_DataParams_t*, sTIDL_DataParams_t*, unsigned short*, unsigned short*, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int):
    127, Generating copyin(inData[:imWidth+((inPitch*(imHeight-1))+((inChPitch*(params->numChannels-1))+(inChPitch*(params->numChannels*(numTotRoi-1)))))]) [if not already present]
         Generating implicit firstprivate(imHeight,numTotRoi,imWidth)
         Generating NVIDIA GPU code
        127, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        129,   /* blockIdx.x threadIdx.x collapsed */
        131,   /* blockIdx.x threadIdx.x collapsed */
        135, #pragma acc loop seq
    127, Generating implicit copyin(params) [if not already present]
         Generating copyout(outData[:imWidth+(((imHeight-1)*outPitch)+((numTotRoi-1)*outChPitch))]) [if not already present]
    131, Generating implicit firstprivate(outPitch,outChPitch,maxIdx)
    135, Loop carried scalar dependence for maxVal at line 145
         Generating implicit firstprivate(inChPitch,currVal,maxVal,inPitch)
         Loop carried scalar dependence for maxVal at line 145
    149, Accelerator restriction: induction variable live-out from loop: maxIdx
int TIDL_argmaxRefProcess<float, float>(sTIDL_AlgLayer_t*, sTIDL_ArgMaxParams_t*, sTIDL_DataParams_t*, sTIDL_DataParams_t*, float*, float*, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int):
    127, Generating copyin(inData[:imWidth+((inPitch*(imHeight-1))+((inChPitch*(params->numChannels-1))+(inChPitch*(params->numChannels*(numTotRoi-1)))))]) [if not already present]
         Generating implicit firstprivate(imHeight,numTotRoi,imWidth)
         Generating NVIDIA GPU code
        127, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        129,   /* blockIdx.x threadIdx.x collapsed */
        131,   /* blockIdx.x threadIdx.x collapsed */
        135, #pragma acc loop seq
    127, Generating implicit copyin(params) [if not already present]
         Generating copyout(outData[:imWidth+(((imHeight-1)*outPitch)+((numTotRoi-1)*outChPitch))]) [if not already present]
    131, Generating implicit firstprivate(outPitch,outChPitch,maxIdx)
    135, Loop carried scalar dependence for maxVal at line 145
         Generating implicit firstprivate(inChPitch,currVal,maxVal,inPitch)
         Loop carried scalar dependence for maxVal at line 145
    149, Accelerator restriction: induction variable live-out from loop: maxIdx
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvaccDKfexKywgT5i.gpu -computecap 86 -ptx /tmp/nvaccTKfehbgcwdcy.ptx -o /tmp/nvaccvKfe-jw99DYx.bin -ftz -cuda12020
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_argmax.c -sm 86 /tmp/nvaccvKfe-jw99DYx.bin -compute 86 /tmp/nvaccTKfehbgcwdcy.ptx -o /tmp/nvacc9Kfe3zNMemMr.fat
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++JHfePQ4N7KBJ.ll -S -o /tmp/nvc++7HfeX-DKbBmR.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++7HfeX-DKbBmR.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_argmax.obj
Unlinking /tmp/nvc++ZHfezB2aiwr6.il
Unlinking /tmp/nvc++lHfeH2jnP4pn.s
Unlinking /tmp/nvc++JHfePQ4N7KBJ.ll
Unlinking /tmp/nvc++7HfeX-DKbBmR.llvm
compiling src/tidl_batchNorm.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_batchNorm.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++0UfeCh8txGPW.il src/tidl_batchNorm.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_batchNorm.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_batchNorm.c -il /tmp/nvc++0UfeCh8txGPW.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++0UfeCh8txGPW.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_batchNorm.obj' -asm /tmp/nvc++0UfeCAfdvcnY.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++0UfeCAfdvcnY.ll -S -o /tmp/nvc++uUfe8TWfMFpY.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++uUfe8TWfMFpY.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_batchNorm.obj
Unlinking /tmp/nvc++0UfeCh8txGPW.il
Unlinking /tmp/nvc++uUfe8cD9_fkL.s
Unlinking /tmp/nvc++0UfeCAfdvcnY.ll
Unlinking /tmp/nvc++uUfe8TWfMFpY.llvm
compiling src/tidl_batchReshape.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_batchReshape.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++42feO3YM4OxS.il src/tidl_batchReshape.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_batchReshape.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_batchReshape.c -il /tmp/nvc++42feO3YM4OxS.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++42feO3YM4OxS.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_batchReshape.obj' -asm /tmp/nvc++42feOzy_5jgx.ll
void TIDL_refBatchReshape<float, float>(float const*, float*, int, int, int, int, int, int, int, int, int, int, int, int):
    119, Generating copyin(pIn[:width+((inLinePitch*(height-1))+((inChPitch*(numChs-1))+((inBatchPitch*(numBatches-1))+inPtrOffset)))]) [if not already present]
         Generating copy(pOut[:width+(((height-1)*outLinePitch)+(((numChs-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+outPtrOffset)))]) [if not already present]
         Generating implicit firstprivate(numChs,numBatches,height,width)
         Generating NVIDIA GPU code
        119, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        121,   /* blockIdx.x threadIdx.x collapsed */
        123,   /* blockIdx.x threadIdx.x collapsed */
        125,   /* blockIdx.x threadIdx.x collapsed */
    125, Generating implicit firstprivate(inChPitch,inBatchPitch,inLinePitch,outChPitch,outBatchPitch,outPtrOffset,outLinePitch,inPtrOffset)
void TIDL_refBatchReshape<unsigned char, unsigned char>(unsigned char const*, unsigned char*, int, int, int, int, int, int, int, int, int, int, int, int):
    119, Generating copyin(pIn[:width+((inLinePitch*(height-1))+((inChPitch*(numChs-1))+((inBatchPitch*(numBatches-1))+inPtrOffset)))]) [if not already present]
         Generating copy(pOut[:width+(((height-1)*outLinePitch)+(((numChs-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+outPtrOffset)))]) [if not already present]
         Generating implicit firstprivate(numChs,width,numBatches,height)
         Generating NVIDIA GPU code
        119, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        121,   /* blockIdx.x threadIdx.x collapsed */
        123,   /* blockIdx.x threadIdx.x collapsed */
        125,   /* blockIdx.x threadIdx.x collapsed */
    125, Generating implicit firstprivate(inChPitch,inBatchPitch,inLinePitch,outChPitch,outBatchPitch,outPtrOffset,outLinePitch,inPtrOffset)
void TIDL_refBatchReshape<unsigned short, unsigned short>(unsigned short const*, unsigned short*, int, int, int, int, int, int, int, int, int, int, int, int):
    119, Generating copyin(pIn[:width+((inLinePitch*(height-1))+((inChPitch*(numChs-1))+((inBatchPitch*(numBatches-1))+inPtrOffset)))]) [if not already present]
         Generating copy(pOut[:width+(((height-1)*outLinePitch)+(((numChs-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+outPtrOffset)))]) [if not already present]
         Generating implicit firstprivate(numChs,numBatches,height,width)
         Generating NVIDIA GPU code
        119, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        121,   /* blockIdx.x threadIdx.x collapsed */
        123,   /* blockIdx.x threadIdx.x collapsed */
        125,   /* blockIdx.x threadIdx.x collapsed */
    125, Generating implicit firstprivate(inChPitch,inBatchPitch,inLinePitch,outChPitch,outBatchPitch,outPtrOffset,outLinePitch,inPtrOffset)
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvacc05feC3FjxMFt.gpu -computecap 86 -ptx /tmp/nvacc05feCkv1xF-g.ptx -o /tmp/nvaccu5fe8D_j0z78.bin -ftz -cuda12020
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_batchReshape.c -sm 86 /tmp/nvaccu5fe8D_j0z78.bin -compute 86 /tmp/nvacc05feCkv1xF-g.ptx -o /tmp/nvacc05feCnYt1Pws.fat
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++42feOzy_5jgx.ll -S -o /tmp/nvc++42feOMBnRDCq.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++42feOMBnRDCq.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_batchReshape.obj
Unlinking /tmp/nvc++42feO3YM4OxS.il
Unlinking /tmp/nvc++42feO0j5Zys3.s
Unlinking /tmp/nvc++42feOzy_5jgx.ll
Unlinking /tmp/nvc++42feOMBnRDCq.llvm
compiling src/tidl_colorConversion.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_colorConversion.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++IdgeMRKpCyxR.il src/tidl_colorConversion.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_colorConversion.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_colorConversion.c -il /tmp/nvc++IdgeMRKpCyxR.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++IdgeMRKpCyxR.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_colorConversion.obj' -asm /tmp/nvc++cdgeglBVGWO8.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++cdgeglBVGWO8.ll -S -o /tmp/nvc++sdge2J0tIrL7.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++sdge2J0tIrL7.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_colorConversion.obj
Unlinking /tmp/nvc++IdgeMRKpCyxR.il
Unlinking /tmp/nvc++YdgewikL9iN3.s
Unlinking /tmp/nvc++cdgeglBVGWO8.ll
Unlinking /tmp/nvc++sdge2J0tIrL7.llvm
compiling src/tidl_commonUtils.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_commonUtils.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++-lge97wn0hEL.il src/tidl_commonUtils.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_commonUtils.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_commonUtils.c -il /tmp/nvc++-lge97wn0hEL.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++-lge97wn0hEL.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_commonUtils.obj' -asm /tmp/nvc++Vlgen4bPMJF8.ll
TIDL_getSaturationFloat(sTIDL_Layer_t*, float*, float*):
   2968, Generating implicit acc routine seq
         Generating acc routine seq
         Generating NVIDIA GPU code
TIDL_floatSat(float, sTIDL_Layer_t*):
   3013, Generating acc routine seq
         Generating NVIDIA GPU code
TIDL_checkPixelInPadRegion(int, int, int, int, int, int):
   3267, Generating acc routine seq
         Generating NVIDIA GPU code
TIDL_convertFloatToScaleAndShift(float, int*, int*, int):
   3390, Generating acc routine seq
         Generating NVIDIA GPU code
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvacc7ogeXQnwqa5U.gpu -computecap 86 -ptx /tmp/nvaccRogebitYayjx.ptx -o /tmp/nvaccdogejimgBy0u.bin -ftz -cuda12020
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_commonUtils.c -sm 86 /tmp/nvaccdogejimgBy0u.bin -compute 86 /tmp/nvaccRogebitYayjx.ptx -o /tmp/nvaccBogerjnfsA3J.fat
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++Vlgen4bPMJF8.ll -S -o /tmp/nvc++Nlge15Wen5OP.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++Nlge15Wen5OP.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_commonUtils.obj
Unlinking /tmp/nvc++-lge97wn0hEL.il
Unlinking /tmp/nvc++3lgeLBUZLzrw.s
Unlinking /tmp/nvc++Vlgen4bPMJF8.ll
Unlinking /tmp/nvc++Nlge15Wen5OP.llvm
compiling src/tidl_concat.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_concat.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++dygejyCDyszz.il src/tidl_concat.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_concat.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_concat.c -il /tmp/nvc++dygejyCDyszz.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++dygejyCDyszz.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_concat.obj' -asm /tmp/nvc++ZygezJ6dl7dc.ll
void TIDL_refConcat<signed char, int>(signed char*, int*, int, tidlConcatBuffParams_t*):
    156, Generating implicit copyin(concatBuffParams) [if not already present]
         Generating copyin(pIn[:concatBuffParams->inWidth+((concatBuffParams->inPitch*(concatBuffParams->inHeight-1))+(concatBuffParams->inChPitch*(concatBuffParams->numInChannels-1)))]) [if not already present]
         Generating present(pAcc[:concatBuffParams->inWidth+(((concatBuffParams->inHeight-1)*concatBuffParams->outPitch)+((concatBuffParams->numInChannels-1)*concatBuffParams->outChPitch))])
         Generating NVIDIA GPU code
        156, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        158,   /* blockIdx.x threadIdx.x collapsed */
        160,   /* blockIdx.x threadIdx.x collapsed */
    160, Generating implicit firstprivate(scale)
void TIDL_refConcat<unsigned char, int>(unsigned char*, int*, int, tidlConcatBuffParams_t*):
    156, Generating implicit copyin(concatBuffParams) [if not already present]
         Generating copyin(pIn[:concatBuffParams->inWidth+((concatBuffParams->inPitch*(concatBuffParams->inHeight-1))+(concatBuffParams->inChPitch*(concatBuffParams->numInChannels-1)))]) [if not already present]
         Generating present(pAcc[:concatBuffParams->inWidth+(((concatBuffParams->inHeight-1)*concatBuffParams->outPitch)+((concatBuffParams->numInChannels-1)*concatBuffParams->outChPitch))])
         Generating NVIDIA GPU code
        156, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        158,   /* blockIdx.x threadIdx.x collapsed */
        160,   /* blockIdx.x threadIdx.x collapsed */
    160, Generating implicit firstprivate(scale)
void TIDL_refConcat<short, int>(short*, int*, int, tidlConcatBuffParams_t*):
    156, Generating implicit copyin(concatBuffParams) [if not already present]
         Generating copyin(pIn[:concatBuffParams->inWidth+((concatBuffParams->inPitch*(concatBuffParams->inHeight-1))+(concatBuffParams->inChPitch*(concatBuffParams->numInChannels-1)))]) [if not already present]
         Generating present(pAcc[:concatBuffParams->inWidth+(((concatBuffParams->inHeight-1)*concatBuffParams->outPitch)+((concatBuffParams->numInChannels-1)*concatBuffParams->outChPitch))])
         Generating NVIDIA GPU code
        156, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        158,   /* blockIdx.x threadIdx.x collapsed */
        160,   /* blockIdx.x threadIdx.x collapsed */
    160, Generating implicit firstprivate(scale)
void TIDL_refConcat<unsigned short, int>(unsigned short*, int*, int, tidlConcatBuffParams_t*):
    156, Generating implicit copyin(concatBuffParams) [if not already present]
         Generating copyin(pIn[:concatBuffParams->inWidth+((concatBuffParams->inPitch*(concatBuffParams->inHeight-1))+(concatBuffParams->inChPitch*(concatBuffParams->numInChannels-1)))]) [if not already present]
         Generating present(pAcc[:concatBuffParams->inWidth+(((concatBuffParams->inHeight-1)*concatBuffParams->outPitch)+((concatBuffParams->numInChannels-1)*concatBuffParams->outChPitch))])
         Generating NVIDIA GPU code
        156, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        158,   /* blockIdx.x threadIdx.x collapsed */
        160,   /* blockIdx.x threadIdx.x collapsed */
    160, Generating implicit firstprivate(scale)
void TIDL_refConcat<float, float>(float*, float*, int, tidlConcatBuffParams_t*):
    156, Generating implicit copyin(concatBuffParams) [if not already present]
         Generating copyin(pIn[:concatBuffParams->inWidth+((concatBuffParams->inPitch*(concatBuffParams->inHeight-1))+(concatBuffParams->inChPitch*(concatBuffParams->numInChannels-1)))]) [if not already present]
         Generating present(pAcc[:concatBuffParams->inWidth+(((concatBuffParams->inHeight-1)*concatBuffParams->outPitch)+((concatBuffParams->numInChannels-1)*concatBuffParams->outChPitch))])
         Generating NVIDIA GPU code
        156, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        158,   /* blockIdx.x threadIdx.x collapsed */
        160,   /* blockIdx.x threadIdx.x collapsed */
    160, Generating implicit firstprivate(scale)
void TIDL_refConcatQuantize<int, signed char>(TIDL_Obj*, int, int*, signed char*, int, tidlConcatBuffParams_t*, int, int):
    210, Generating present(pAcc[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))])
         Generating NVIDIA GPU code
        210, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
        212,   /* blockIdx.x threadIdx.x collapsed */
        214,   /* blockIdx.x threadIdx.x collapsed */
    210, Generating implicit copy(max,min) [if not already present]
         Generating implicit copyin(concatBuffParams) [if not already present]
    214, Generating implicit firstprivate(outAcc)
    243, Generating present(pAcc[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))])
         Generating copy(pout[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))]) [if not already present]
         Generating implicit copy(net) [if not already present]
         Generating NVIDIA GPU code
        243, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        245,   /* blockIdx.x threadIdx.x collapsed */
        247,   /* blockIdx.x threadIdx.x collapsed */
    243, Generating implicit copyin(concatBuffParams) [if not already present]
    247, Generating implicit firstprivate(layerIdx,elementtype,outAcc,satLow,satHigh,roundBits)
void TIDL_refConcatQuantize<int, unsigned char>(TIDL_Obj*, int, int*, unsigned char*, int, tidlConcatBuffParams_t*, int, int):
    210, Generating present(pAcc[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))])
         Generating NVIDIA GPU code
        210, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
        212,   /* blockIdx.x threadIdx.x collapsed */
        214,   /* blockIdx.x threadIdx.x collapsed */
    210, Generating implicit copy(max,min) [if not already present]
         Generating implicit copyin(concatBuffParams) [if not already present]
    214, Generating implicit firstprivate(outAcc)
    243, Generating present(pAcc[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))])
         Generating copy(pout[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))]) [if not already present]
         Generating implicit copy(net) [if not already present]
         Generating NVIDIA GPU code
        243, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        245,   /* blockIdx.x threadIdx.x collapsed */
        247,   /* blockIdx.x threadIdx.x collapsed */
    243, Generating implicit copyin(concatBuffParams) [if not already present]
    247, Generating implicit firstprivate(layerIdx,elementtype,outAcc,satLow,satHigh,roundBits)
void TIDL_refConcatQuantize<int, short>(TIDL_Obj*, int, int*, short*, int, tidlConcatBuffParams_t*, int, int):
    210, Generating present(pAcc[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))])
         Generating NVIDIA GPU code
        210, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
        212,   /* blockIdx.x threadIdx.x collapsed */
        214,   /* blockIdx.x threadIdx.x collapsed */
    210, Generating implicit copy(max,min) [if not already present]
         Generating implicit copyin(concatBuffParams) [if not already present]
    214, Generating implicit firstprivate(outAcc)
    243, Generating present(pAcc[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))])
         Generating copy(pout[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))]) [if not already present]
         Generating implicit copy(net) [if not already present]
         Generating NVIDIA GPU code
        243, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        245,   /* blockIdx.x threadIdx.x collapsed */
        247,   /* blockIdx.x threadIdx.x collapsed */
    243, Generating implicit copyin(concatBuffParams) [if not already present]
    247, Generating implicit firstprivate(layerIdx,elementtype,outAcc,satLow,satHigh,roundBits)
void TIDL_refConcatQuantize<int, unsigned short>(TIDL_Obj*, int, int*, unsigned short*, int, tidlConcatBuffParams_t*, int, int):
    210, Generating present(pAcc[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))])
         Generating NVIDIA GPU code
        210, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
        212,   /* blockIdx.x threadIdx.x collapsed */
        214,   /* blockIdx.x threadIdx.x collapsed */
    210, Generating implicit copy(max,min) [if not already present]
         Generating implicit copyin(concatBuffParams) [if not already present]
    214, Generating implicit firstprivate(outAcc)
    243, Generating present(pAcc[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))])
         Generating copy(pout[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))]) [if not already present]
         Generating implicit copy(net) [if not already present]
         Generating NVIDIA GPU code
        243, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        245,   /* blockIdx.x threadIdx.x collapsed */
        247,   /* blockIdx.x threadIdx.x collapsed */
    243, Generating implicit copyin(concatBuffParams) [if not already present]
    247, Generating implicit firstprivate(layerIdx,elementtype,outAcc,satLow,satHigh,roundBits)
void TIDL_refConcatQuantize<float, float>(TIDL_Obj*, int, float*, float*, int, tidlConcatBuffParams_t*, int, int):
    210, Generating present(pAcc[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))])
         Generating NVIDIA GPU code
        210, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
        212,   /* blockIdx.x threadIdx.x collapsed */
        214,   /* blockIdx.x threadIdx.x collapsed */
    210, Generating implicit copy(max,min) [if not already present]
         Generating implicit copyin(concatBuffParams) [if not already present]
    214, Generating implicit firstprivate(outAcc)
    243, Generating present(pAcc[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))])
         Generating copy(pout[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))]) [if not already present]
         Generating implicit copy(net) [if not already present]
         Generating NVIDIA GPU code
        243, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        245,   /* blockIdx.x threadIdx.x collapsed */
        247,   /* blockIdx.x threadIdx.x collapsed */
    243, Generating implicit copyin(concatBuffParams) [if not already present]
    247, Generating implicit firstprivate(layerIdx,elementtype,outAcc,satLow,satHigh,roundBits)
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvaccKBgeS2A_hKqM.gpu -computecap 86 -ptx /tmp/nvaccKBgeSn-JhP9g.ptx -o /tmp/nvacceBgem74LKXQK.bin -ftz -cuda12020
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_concat.c -sm 86 /tmp/nvacceBgem74LKXQK.bin -compute 86 /tmp/nvaccKBgeSn-JhP9g.ptx -o /tmp/nvaccKBgeSuahL-aG.fat
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++ZygezJ6dl7dc.ll -S -o /tmp/nvc++lygeH_UOXbJH.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++lygeH_UOXbJH.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_concat.obj
Unlinking /tmp/nvc++dygejyCDyszz.il
Unlinking /tmp/nvc++BygerzFw5Asu.s
Unlinking /tmp/nvc++ZygezJ6dl7dc.ll
Unlinking /tmp/nvc++lygeH_UOXbJH.llvm
compiling src/tidl_const.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_const.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++bLged4lQmIW7.il src/tidl_const.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_const.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_const.c -il /tmp/nvc++bLged4lQmIW7.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++bLged4lQmIW7.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_const.obj' -asm /tmp/nvc++rLgeZ7bQAYw_.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++rLgeZ7bQAYw_.ll -S -o /tmp/nvc++zLgelAXMVKMt.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++zLgelAXMVKMt.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_const.obj
Unlinking /tmp/nvc++bLged4lQmIW7.il
Unlinking /tmp/nvc++jLgeB4kaDOZd.s
Unlinking /tmp/nvc++rLgeZ7bQAYw_.ll
Unlinking /tmp/nvc++zLgelAXMVKMt.llvm
compiling src/tidl_conv2d_base.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_conv2d_base.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++gTgesODz164r.il src/tidl_conv2d_base.c
"src/tidl_conv2d_base.c", line 94: warning: incompatible redefinition of macro "_POSIX_C_SOURCE" (declared at line 290 of "/usr/include/features.h") [bad_macro_redef]
  #define _POSIX_C_SOURCE 200112L
          ^

Remark: individual warnings can be suppressed with "--diag_suppress <warning-name>"


/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_conv2d_base.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_conv2d_base.c -il /tmp/nvc++gTgesODz164r.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++gTgesODz164r.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_conv2d_base.obj' -asm /tmp/nvc++MTgeY_F0xL-i.ll
long TIDL_openaccShiftRight<long>(long, int):
     78, include "tidl_alg_int.h"
        1432, Generating acc routine seq
              Generating NVIDIA GPU code
long TIDL_openaccShiftRightImpl<long>(long, int):
     78, include "tidl_alg_int.h"
        1426, Generating acc routine seq
              Generating NVIDIA GPU code
void TIDL_refConv2d<float, float, float, float, float>(float*, float*, float*, float*, float*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2dKernelFast<1, float, float, float, float>(float*, float*, float*, float*, float*, float*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernelFast<3, float, float, float, float>(float*, float*, float*, float*, float*, float*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernel<float, float, float, float>(float const*, float const*, float const*, float*, float*, float*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        335,   /* blockIdx.x threadIdx.x collapsed */
        337,   /* blockIdx.x threadIdx.x collapsed */
        339,   /* blockIdx.x threadIdx.x collapsed */
        341,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        355, #pragma acc loop seq
        360, #pragma acc loop seq
        363, #pragma acc loop seq
    341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
float TIDL_findMinMaxForChQuant<float>(float*, int, int, int, int, int, int, int, int, int, int, int, float*, float, float*, float*):
    178, Generating copyin(perChannelWeightScalePtr[:numGroups]) [if not already present]
         Generating implicit copyin(min) [if not already present]
         Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outBatchPitch*(numBatches-1))+(outChPitch*(numOutChannels*(numGroups-1)))))+1])
         Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numGroups,numBatches,height)
         Generating NVIDIA GPU code
        178, #pragma acc loop seq collapse(2)
        180,   collapsed */
        183, #pragma acc loop seq
        186, #pragma acc loop seq
        189, #pragma acc loop seq
    178, Generating implicit copyin(max) [if not already present]
    180, Complex loop carried dependence of max-> prevents parallelization
         Loop carried scalar dependence for absMax at line 242
         Complex loop carried dependence of min-> prevents parallelization
         Generating implicit firstprivate(absMax,maxChIdx)
         Complex loop carried dependence of max-> prevents parallelization
    189, Generating implicit firstprivate(inDataVal,outBatchPitch,outImPitch,tensorScale,outdataOffset,outChPitch,inDataFloat,accScale)
    205, Accelerator restriction: induction variable live-out from loop: maxChIdx
    210, Accelerator restriction: induction variable live-out from loop: maxChIdx
void TIDL_refConv2d<signed char, signed char, int, signed char, int>(signed char*, signed char*, int*, signed char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2dKernelFast<1, signed char, signed char, int, int>(signed char*, signed char*, int*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernelFast<3, signed char, signed char, int, int>(signed char*, signed char*, int*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernel<signed char, signed char, int, int>(signed char const*, signed char const*, int const*, int*, int*, int*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        335,   /* blockIdx.x threadIdx.x collapsed */
        337,   /* blockIdx.x threadIdx.x collapsed */
        339,   /* blockIdx.x threadIdx.x collapsed */
        341,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        355, #pragma acc loop seq
        360, #pragma acc loop seq
        363, #pragma acc loop seq
    341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
float TIDL_findMinMaxForChQuant<int>(int*, int, int, int, int, int, int, int, int, int, int, int, float*, float, float*, float*):
    178, Generating copyin(perChannelWeightScalePtr[:numGroups]) [if not already present]
         Generating implicit copyin(min) [if not already present]
         Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outBatchPitch*(numBatches-1))+(outChPitch*(numOutChannels*(numGroups-1)))))+1])
         Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numGroups,numBatches,height)
         Generating NVIDIA GPU code
        178, #pragma acc loop seq collapse(2)
        180,   collapsed */
        183, #pragma acc loop seq
        186, #pragma acc loop seq
        189, #pragma acc loop seq
    178, Generating implicit copyin(max) [if not already present]
    180, Complex loop carried dependence of max-> prevents parallelization
         Loop carried scalar dependence for absMax at line 242
         Complex loop carried dependence of min-> prevents parallelization
         Generating implicit firstprivate(absMax,maxChIdx)
         Complex loop carried dependence of max-> prevents parallelization
    189, Generating implicit firstprivate(inDataVal,outBatchPitch,outImPitch,tensorScale,outdataOffset,outChPitch,inDataFloat,accScale)
    205, Accelerator restriction: induction variable live-out from loop: maxChIdx
    210, Accelerator restriction: induction variable live-out from loop: maxChIdx
void TIDL_refConv2d<signed char, signed char, int, unsigned char, int>(signed char*, signed char*, int*, unsigned char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<signed char, signed char, int, short, int>(signed char*, signed char*, int*, short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<signed char, signed char, int, unsigned short, int>(signed char*, signed char*, int*, unsigned short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<unsigned char, signed char, int, signed char, int>(unsigned char*, signed char*, int*, signed char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2dKernelFast<1, unsigned char, signed char, int, int>(unsigned char*, signed char*, int*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernelFast<3, unsigned char, signed char, int, int>(unsigned char*, signed char*, int*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernel<unsigned char, signed char, int, int>(unsigned char const*, signed char const*, int const*, int*, int*, int*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        335,   /* blockIdx.x threadIdx.x collapsed */
        337,   /* blockIdx.x threadIdx.x collapsed */
        339,   /* blockIdx.x threadIdx.x collapsed */
        341,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        355, #pragma acc loop seq
        360, #pragma acc loop seq
        363, #pragma acc loop seq
    341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2d<unsigned char, signed char, int, unsigned char, int>(unsigned char*, signed char*, int*, unsigned char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<unsigned char, signed char, int, short, int>(unsigned char*, signed char*, int*, short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<unsigned char, signed char, int, unsigned short, int>(unsigned char*, signed char*, int*, unsigned short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<short, signed char, int, signed char, int>(short*, signed char*, int*, signed char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2dKernelFast<1, short, signed char, int, int>(short*, signed char*, int*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernelFast<3, short, signed char, int, int>(short*, signed char*, int*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernel<short, signed char, int, int>(short const*, signed char const*, int const*, int*, int*, int*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        335,   /* blockIdx.x threadIdx.x collapsed */
        337,   /* blockIdx.x threadIdx.x collapsed */
        339,   /* blockIdx.x threadIdx.x collapsed */
        341,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        355, #pragma acc loop seq
        360, #pragma acc loop seq
        363, #pragma acc loop seq
    341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2d<short, signed char, int, unsigned char, int>(short*, signed char*, int*, unsigned char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<short, signed char, int, short, int>(short*, signed char*, int*, short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<short, signed char, int, unsigned short, int>(short*, signed char*, int*, unsigned short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<unsigned short, signed char, int, signed char, int>(unsigned short*, signed char*, int*, signed char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2dKernelFast<1, unsigned short, signed char, int, int>(unsigned short*, signed char*, int*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernelFast<3, unsigned short, signed char, int, int>(unsigned short*, signed char*, int*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernel<unsigned short, signed char, int, int>(unsigned short const*, signed char const*, int const*, int*, int*, int*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        335,   /* blockIdx.x threadIdx.x collapsed */
        337,   /* blockIdx.x threadIdx.x collapsed */
        339,   /* blockIdx.x threadIdx.x collapsed */
        341,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        355, #pragma acc loop seq
        360, #pragma acc loop seq
        363, #pragma acc loop seq
    341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2d<unsigned short, signed char, int, unsigned char, int>(unsigned short*, signed char*, int*, unsigned char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<unsigned short, signed char, int, short, int>(unsigned short*, signed char*, int*, short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<unsigned short, signed char, int, unsigned short, int>(unsigned short*, signed char*, int*, unsigned short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<signed char, signed char, short, signed char, int>(signed char*, signed char*, short*, signed char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2dKernelFast<1, signed char, signed char, short, int>(signed char*, signed char*, short*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernelFast<3, signed char, signed char, short, int>(signed char*, signed char*, short*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernel<signed char, signed char, short, int>(signed char const*, signed char const*, short const*, int*, int*, int*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        335,   /* blockIdx.x threadIdx.x collapsed */
        337,   /* blockIdx.x threadIdx.x collapsed */
        339,   /* blockIdx.x threadIdx.x collapsed */
        341,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        355, #pragma acc loop seq
        360, #pragma acc loop seq
        363, #pragma acc loop seq
    341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2d<signed char, signed char, short, unsigned char, int>(signed char*, signed char*, short*, unsigned char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<signed char, signed char, short, short, int>(signed char*, signed char*, short*, short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<signed char, signed char, short, unsigned short, int>(signed char*, signed char*, short*, unsigned short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<unsigned char, signed char, short, signed char, int>(unsigned char*, signed char*, short*, signed char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2dKernelFast<1, unsigned char, signed char, short, int>(unsigned char*, signed char*, short*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernelFast<3, unsigned char, signed char, short, int>(unsigned char*, signed char*, short*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernel<unsigned char, signed char, short, int>(unsigned char const*, signed char const*, short const*, int*, int*, int*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        335,   /* blockIdx.x threadIdx.x collapsed */
        337,   /* blockIdx.x threadIdx.x collapsed */
        339,   /* blockIdx.x threadIdx.x collapsed */
        341,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        355, #pragma acc loop seq
        360, #pragma acc loop seq
        363, #pragma acc loop seq
    341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2d<unsigned char, signed char, short, unsigned char, int>(unsigned char*, signed char*, short*, unsigned char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<unsigned char, signed char, short, short, int>(unsigned char*, signed char*, short*, short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<unsigned char, signed char, short, unsigned short, int>(unsigned char*, signed char*, short*, unsigned short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<short, signed char, short, signed char, int>(short*, signed char*, short*, signed char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2dKernelFast<1, short, signed char, short, int>(short*, signed char*, short*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernelFast<3, short, signed char, short, int>(short*, signed char*, short*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernel<short, signed char, short, int>(short const*, signed char const*, short const*, int*, int*, int*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        335,   /* blockIdx.x threadIdx.x collapsed */
        337,   /* blockIdx.x threadIdx.x collapsed */
        339,   /* blockIdx.x threadIdx.x collapsed */
        341,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        355, #pragma acc loop seq
        360, #pragma acc loop seq
        363, #pragma acc loop seq
    341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2d<short, signed char, short, unsigned char, int>(short*, signed char*, short*, unsigned char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<short, signed char, short, short, int>(short*, signed char*, short*, short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<short, signed char, short, unsigned short, int>(short*, signed char*, short*, unsigned short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<unsigned short, signed char, short, signed char, int>(unsigned short*, signed char*, short*, signed char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2dKernelFast<1, unsigned short, signed char, short, int>(unsigned short*, signed char*, short*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernelFast<3, unsigned short, signed char, short, int>(unsigned short*, signed char*, short*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernel<unsigned short, signed char, short, int>(unsigned short const*, signed char const*, short const*, int*, int*, int*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        335,   /* blockIdx.x threadIdx.x collapsed */
        337,   /* blockIdx.x threadIdx.x collapsed */
        339,   /* blockIdx.x threadIdx.x collapsed */
        341,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        355, #pragma acc loop seq
        360, #pragma acc loop seq
        363, #pragma acc loop seq
    341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2d<unsigned short, signed char, short, unsigned char, int>(unsigned short*, signed char*, short*, unsigned char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<unsigned short, signed char, short, short, int>(unsigned short*, signed char*, short*, short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<unsigned short, signed char, short, unsigned short, int>(unsigned short*, signed char*, short*, unsigned short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<signed char, short, long, signed char, long>(signed char*, short*, long*, signed char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2dKernelFast<1, signed char, short, long, long>(signed char*, short*, long*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernelFast<3, signed char, short, long, long>(signed char*, short*, long*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernel<signed char, short, long, long>(signed char const*, short const*, long const*, long*, long*, long*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        335,   /* blockIdx.x threadIdx.x collapsed */
        337,   /* blockIdx.x threadIdx.x collapsed */
        339,   /* blockIdx.x threadIdx.x collapsed */
        341,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        355, #pragma acc loop seq
        360, #pragma acc loop seq
        363, #pragma acc loop seq
    341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
float TIDL_findMinMaxForChQuant<long>(long*, int, int, int, int, int, int, int, int, int, int, int, float*, float, float*, float*):
    178, Generating copyin(perChannelWeightScalePtr[:numGroups]) [if not already present]
         Generating implicit copyin(min) [if not already present]
         Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outBatchPitch*(numBatches-1))+(outChPitch*(numOutChannels*(numGroups-1)))))+1])
         Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numGroups,numBatches,height)
         Generating NVIDIA GPU code
        178, #pragma acc loop seq collapse(2)
        180,   collapsed */
        183, #pragma acc loop seq
        186, #pragma acc loop seq
        189, #pragma acc loop seq
    178, Generating implicit copyin(max) [if not already present]
    180, Complex loop carried dependence of max-> prevents parallelization
         Loop carried scalar dependence for absMax at line 242
         Complex loop carried dependence of min-> prevents parallelization
         Generating implicit firstprivate(absMax,maxChIdx)
         Complex loop carried dependence of max-> prevents parallelization
    189, Generating implicit firstprivate(inDataVal,outBatchPitch,outImPitch,tensorScale,outdataOffset,outChPitch,inDataFloat,accScale)
    205, Accelerator restriction: induction variable live-out from loop: maxChIdx
    210, Accelerator restriction: induction variable live-out from loop: maxChIdx
void TIDL_refConv2d<signed char, short, long, unsigned char, long>(signed char*, short*, long*, unsigned char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<signed char, short, long, short, long>(signed char*, short*, long*, short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<signed char, short, long, unsigned short, long>(signed char*, short*, long*, unsigned short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<unsigned char, short, long, signed char, long>(unsigned char*, short*, long*, signed char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2dKernelFast<1, unsigned char, short, long, long>(unsigned char*, short*, long*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernelFast<3, unsigned char, short, long, long>(unsigned char*, short*, long*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernel<unsigned char, short, long, long>(unsigned char const*, short const*, long const*, long*, long*, long*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        335,   /* blockIdx.x threadIdx.x collapsed */
        337,   /* blockIdx.x threadIdx.x collapsed */
        339,   /* blockIdx.x threadIdx.x collapsed */
        341,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        355, #pragma acc loop seq
        360, #pragma acc loop seq
        363, #pragma acc loop seq
    341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2d<unsigned char, short, long, unsigned char, long>(unsigned char*, short*, long*, unsigned char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<unsigned char, short, long, short, long>(unsigned char*, short*, long*, short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<unsigned char, short, long, unsigned short, long>(unsigned char*, short*, long*, unsigned short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<short, short, long, signed char, long>(short*, short*, long*, signed char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2dKernelFast<1, short, short, long, long>(short*, short*, long*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernelFast<3, short, short, long, long>(short*, short*, long*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernel<short, short, long, long>(short const*, short const*, long const*, long*, long*, long*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        335,   /* blockIdx.x threadIdx.x collapsed */
        337,   /* blockIdx.x threadIdx.x collapsed */
        339,   /* blockIdx.x threadIdx.x collapsed */
        341,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        355, #pragma acc loop seq
        360, #pragma acc loop seq
        363, #pragma acc loop seq
    341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2d<short, short, long, unsigned char, long>(short*, short*, long*, unsigned char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<short, short, long, short, long>(short*, short*, long*, short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<short, short, long, unsigned short, long>(short*, short*, long*, unsigned short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<unsigned short, short, long, signed char, long>(unsigned short*, short*, long*, signed char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2dKernelFast<1, unsigned short, short, long, long>(unsigned short*, short*, long*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernelFast<3, unsigned short, short, long, long>(unsigned short*, short*, long*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernel<unsigned short, short, long, long>(unsigned short const*, short const*, long const*, long*, long*, long*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        335,   /* blockIdx.x threadIdx.x collapsed */
        337,   /* blockIdx.x threadIdx.x collapsed */
        339,   /* blockIdx.x threadIdx.x collapsed */
        341,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        355, #pragma acc loop seq
        360, #pragma acc loop seq
        363, #pragma acc loop seq
    341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2d<unsigned short, short, long, unsigned char, long>(unsigned short*, short*, long*, unsigned char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<unsigned short, short, long, short, long>(unsigned short*, short*, long*, short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<unsigned short, short, long, unsigned short, long>(unsigned short*, short*, long*, unsigned short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<signed char, short, int, signed char, long>(signed char*, short*, int*, signed char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2dKernelFast<1, signed char, short, int, long>(signed char*, short*, int*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernelFast<3, signed char, short, int, long>(signed char*, short*, int*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernel<signed char, short, int, long>(signed char const*, short const*, int const*, long*, long*, long*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        335,   /* blockIdx.x threadIdx.x collapsed */
        337,   /* blockIdx.x threadIdx.x collapsed */
        339,   /* blockIdx.x threadIdx.x collapsed */
        341,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        355, #pragma acc loop seq
        360, #pragma acc loop seq
        363, #pragma acc loop seq
    341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2d<signed char, short, int, unsigned char, long>(signed char*, short*, int*, unsigned char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<signed char, short, int, short, long>(signed char*, short*, int*, short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<signed char, short, int, unsigned short, long>(signed char*, short*, int*, unsigned short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<unsigned char, short, int, signed char, long>(unsigned char*, short*, int*, signed char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2dKernelFast<1, unsigned char, short, int, long>(unsigned char*, short*, int*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernelFast<3, unsigned char, short, int, long>(unsigned char*, short*, int*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernel<unsigned char, short, int, long>(unsigned char const*, short const*, int const*, long*, long*, long*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        335,   /* blockIdx.x threadIdx.x collapsed */
        337,   /* blockIdx.x threadIdx.x collapsed */
        339,   /* blockIdx.x threadIdx.x collapsed */
        341,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        355, #pragma acc loop seq
        360, #pragma acc loop seq
        363, #pragma acc loop seq
    341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2d<unsigned char, short, int, unsigned char, long>(unsigned char*, short*, int*, unsigned char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<unsigned char, short, int, short, long>(unsigned char*, short*, int*, short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<unsigned char, short, int, unsigned short, long>(unsigned char*, short*, int*, unsigned short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<short, short, int, signed char, long>(short*, short*, int*, signed char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2dKernelFast<1, short, short, int, long>(short*, short*, int*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernelFast<3, short, short, int, long>(short*, short*, int*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernel<short, short, int, long>(short const*, short const*, int const*, long*, long*, long*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        335,   /* blockIdx.x threadIdx.x collapsed */
        337,   /* blockIdx.x threadIdx.x collapsed */
        339,   /* blockIdx.x threadIdx.x collapsed */
        341,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        355, #pragma acc loop seq
        360, #pragma acc loop seq
        363, #pragma acc loop seq
    341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2d<short, short, int, unsigned char, long>(short*, short*, int*, unsigned char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<short, short, int, short, long>(short*, short*, int*, short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<short, short, int, unsigned short, long>(short*, short*, int*, unsigned short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<unsigned short, short, int, signed char, long>(unsigned short*, short*, int*, signed char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2dKernelFast<1, unsigned short, short, int, long>(unsigned short*, short*, int*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernelFast<3, unsigned short, short, int, long>(unsigned short*, short*, int*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        498,   /* blockIdx.x threadIdx.x collapsed */
        500,   /* blockIdx.x threadIdx.x collapsed */
        502,   /* blockIdx.x threadIdx.x collapsed */
        504,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        519, #pragma acc loop seq
        524, #pragma acc loop seq
        527, #pragma acc loop seq
    504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    519, Generating implicit firstprivate(coeffsHeight,coeffsWidth)
    527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2dKernel<unsigned short, short, int, long>(unsigned short const*, short const*, int const*, long*, long*, long*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1])
         Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels)
         Generating NVIDIA GPU code
        333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */
        335,   /* blockIdx.x threadIdx.x collapsed */
        337,   /* blockIdx.x threadIdx.x collapsed */
        339,   /* blockIdx.x threadIdx.x collapsed */
        341,   /* blockIdx.x threadIdx.x collapsed */
             Generating reduction(min:_min)
             Generating reduction(max:_max)
        355, #pragma acc loop seq
        360, #pragma acc loop seq
        363, #pragma acc loop seq
    341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch)
    363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth)
void TIDL_refConv2d<unsigned short, short, int, unsigned char, long>(unsigned short*, short*, int*, unsigned char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<unsigned short, short, int, short, long>(unsigned short*, short*, int*, short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
void TIDL_refConv2d<unsigned short, short, int, unsigned short, long>(unsigned short*, short*, int*, unsigned short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*):
    733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present]
         Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch])
    906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1])
         Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present]
    911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        913,   /* blockIdx.x threadIdx.x collapsed */
        915,   /* blockIdx.x threadIdx.x collapsed */
        917,   /* blockIdx.x threadIdx.x collapsed */
    911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift)
    961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        963,   /* blockIdx.x threadIdx.x collapsed */
        965,   /* blockIdx.x threadIdx.x collapsed */
        967,   /* blockIdx.x threadIdx.x collapsed */
    961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present]
    967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift)
std::fabs(float):
     78, include "tidl_alg_int.h"
          88, include "mmalib_cnn.h"
               26, include "MMALIB_CNN_convolve_col_smallNo_ixX_ixX_oxX.h"
                    25, include "MMALIB_types.h"
                         35, include "c7x.h"
                              67, include "vector.h"
                                   51, include "c7x_c_funcs.h"
                                        41, include "cmath"
                                             15, include "cmath"
                                                 242, Generating implicit acc routine seq
                                                      Generating acc routine seq
                                                      Generating NVIDIA GPU code
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvacceWgemkX4NEtn.gpu -computecap 86 -ptx /tmp/nvacceWgem5n6NRPb.ptx -o /tmp/nvaccKWgeSc99ei3K.bin -ftz -cuda12020
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_conv2d_base.c -sm 86 /tmp/nvaccKWgeSc99ei3K.bin -compute 86 /tmp/nvacceWgem5n6NRPb.ptx -o /tmp/nvacceWgemyLVfjWr.fat
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++MTgeY_F0xL-i.ll -S -o /tmp/nvc++wTgec1FJu987.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++wTgec1FJu987.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_conv2d_base.obj
Unlinking /tmp/nvc++gTgesODz164r.il
Unlinking /tmp/nvc++2TgeIQkaSM-P.s
Unlinking /tmp/nvc++MTgeY_F0xL-i.ll
Unlinking /tmp/nvc++wTgec1FJu987.llvm
compiling src/tidl_crop.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_crop.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++F6geDTGXueWP.il src/tidl_crop.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_crop.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_crop.c -il /tmp/nvc++F6geDTGXueWP.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++F6geDTGXueWP.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_crop.obj' -asm /tmp/nvc++p6geTHz5hMEf.ll
void TIDL_refCrop<float, float>(float const*, float*, int, int, int, int, int, int, int, int, int, int, int, int):
    118, Generating copyin(pIn[:outWidth+((inLinePitch*(outHeight-1))+((inChPitch*(numChs-1))+((inROIPitch*(numROIs-1))+inPtrOffset)))]) [if not already present]
         Generating copy(pOut[:outWidth+(((outHeight-1)*outLinePitch)+(((numChs-1)*outChPitch)+(((numROIs-1)*outROIPitch)+outPtrOffset)))]) [if not already present]
         Generating implicit firstprivate(outHeight,numROIs,numChs,outWidth)
         Generating NVIDIA GPU code
        118, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        120,   /* blockIdx.x threadIdx.x collapsed */
        122,   /* blockIdx.x threadIdx.x collapsed */
        124,   /* blockIdx.x threadIdx.x collapsed */
    124, Generating implicit firstprivate(inChPitch,inLinePitch,inPtrOffset,outChPitch,inROIPitch,outLinePitch,outROIPitch,outPtrOffset)
void TIDL_refCrop<unsigned char, unsigned char>(unsigned char const*, unsigned char*, int, int, int, int, int, int, int, int, int, int, int, int):
    118, Generating copy(pOut[:outWidth+(((outHeight-1)*outLinePitch)+(((numChs-1)*outChPitch)+(((numROIs-1)*outROIPitch)+outPtrOffset)))]) [if not already present]
         Generating copyin(pIn[:outWidth+((inLinePitch*(outHeight-1))+((inChPitch*(numChs-1))+((inROIPitch*(numROIs-1))+inPtrOffset)))]) [if not already present]
         Generating implicit firstprivate(outWidth,outHeight,numROIs,numChs)
         Generating NVIDIA GPU code
        118, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        120,   /* blockIdx.x threadIdx.x collapsed */
        122,   /* blockIdx.x threadIdx.x collapsed */
        124,   /* blockIdx.x threadIdx.x collapsed */
    124, Generating implicit firstprivate(inChPitch,inLinePitch,inPtrOffset,outChPitch,inROIPitch,outLinePitch,outROIPitch,outPtrOffset)
void TIDL_refCrop<unsigned short, unsigned short>(unsigned short const*, unsigned short*, int, int, int, int, int, int, int, int, int, int, int, int):
    118, Generating copyin(pIn[:outWidth+((inLinePitch*(outHeight-1))+((inChPitch*(numChs-1))+((inROIPitch*(numROIs-1))+inPtrOffset)))]) [if not already present]
         Generating copy(pOut[:outWidth+(((outHeight-1)*outLinePitch)+(((numChs-1)*outChPitch)+(((numROIs-1)*outROIPitch)+outPtrOffset)))]) [if not already present]
         Generating implicit firstprivate(outHeight,numROIs,numChs,outWidth)
         Generating NVIDIA GPU code
        118, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        120,   /* blockIdx.x threadIdx.x collapsed */
        122,   /* blockIdx.x threadIdx.x collapsed */
        124,   /* blockIdx.x threadIdx.x collapsed */
    124, Generating implicit firstprivate(inChPitch,inLinePitch,inPtrOffset,outChPitch,inROIPitch,outLinePitch,outROIPitch,outPtrOffset)
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvacc29geIvlkLac3.gpu -computecap 86 -ptx /tmp/nvaccw9gecqwofWIR.ptx -o /tmp/nvaccg9gesqie2W4q.bin -ftz -cuda12020
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_crop.c -sm 86 /tmp/nvaccg9gesqie2W4q.bin -compute 86 /tmp/nvaccw9gecqwofWIR.ptx -o /tmp/nvacc29geIskah2_O.fat
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++p6geTHz5hMEf.ll -S -o /tmp/nvc++h6gevVfOTHKV.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++h6gevVfOTHKV.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_crop.obj
Unlinking /tmp/nvc++F6geDTGXueWP.il
Unlinking /tmp/nvc++x6gefzgjfiHU.s
Unlinking /tmp/nvc++p6geTHz5hMEf.ll
Unlinking /tmp/nvc++h6gevVfOTHKV.llvm
compiling src/tidl_custom_int.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_custom_int.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++GhheGPv7J_Wj.il src/tidl_custom_int.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_custom_int.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_custom_int.c -il /tmp/nvc++GhheGPv7J_Wj.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++GhheGPv7J_Wj.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_custom_int.obj' -asm /tmp/nvc++GhheGHzMH98i.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++GhheGHzMH98i.ll -S -o /tmp/nvc++GhheGTLPYQcx.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++GhheGTLPYQcx.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_custom_int.obj
Unlinking /tmp/nvc++GhheGPv7J_Wj.il
Unlinking /tmp/nvc++GhheG1cMQ54j.s
Unlinking /tmp/nvc++GhheGHzMH98i.ll
Unlinking /tmp/nvc++GhheGTLPYQcx.llvm
compiling src/tidl_dataConvert.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_dataConvert.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++HpheJvgdQqMM.il src/tidl_dataConvert.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_dataConvert.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_dataConvert.c -il /tmp/nvc++HpheJvgdQqMM.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++HpheJvgdQqMM.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_dataConvert.obj' -asm /tmp/nvc++XphetrbS4KwC.ll
void TIDL_TensorMinMaxinFloat<float>(float const*, sTIDL_DataParams_t const*, float, float, float*, float*):
    442, Generating implicit copyin(min) [if not already present]
         Generating copyin(ptr[:dataPrms->dimValues+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+padOffset)))))]) [if not already present]
         Generating NVIDIA GPU code
        442, #pragma acc loop gang, vector(128) collapse(6) /* blockIdx.x threadIdx.x */
        444,   /* blockIdx.x threadIdx.x collapsed */
        446,   /* blockIdx.x threadIdx.x collapsed */
        448,   /* blockIdx.x threadIdx.x collapsed */
        450,   /* blockIdx.x threadIdx.x collapsed */
        452,   /* blockIdx.x threadIdx.x collapsed */
    442, Generating implicit copyin(dataPrms,max) [if not already present]
    452, Generating implicit firstprivate(in_zf,val,padOffset,in_scale)
void TIDL_TensorMinMaxinFloat<signed char>(signed char const*, sTIDL_DataParams_t const*, float, float, float*, float*):
    442, Generating implicit copyin(min) [if not already present]
         Generating copyin(ptr[:dataPrms->dimValues+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+padOffset)))))]) [if not already present]
         Generating NVIDIA GPU code
        442, #pragma acc loop gang, vector(128) collapse(6) /* blockIdx.x threadIdx.x */
        444,   /* blockIdx.x threadIdx.x collapsed */
        446,   /* blockIdx.x threadIdx.x collapsed */
        448,   /* blockIdx.x threadIdx.x collapsed */
        450,   /* blockIdx.x threadIdx.x collapsed */
        452,   /* blockIdx.x threadIdx.x collapsed */
    442, Generating implicit copyin(dataPrms,max) [if not already present]
    452, Generating implicit firstprivate(in_zf,val,padOffset,in_scale)
void TIDL_TensorMinMaxinFloat<unsigned char>(unsigned char const*, sTIDL_DataParams_t const*, float, float, float*, float*):
    442, Generating implicit copyin(min) [if not already present]
         Generating copyin(ptr[:dataPrms->dimValues+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+padOffset)))))]) [if not already present]
         Generating NVIDIA GPU code
        442, #pragma acc loop gang, vector(128) collapse(6) /* blockIdx.x threadIdx.x */
        444,   /* blockIdx.x threadIdx.x collapsed */
        446,   /* blockIdx.x threadIdx.x collapsed */
        448,   /* blockIdx.x threadIdx.x collapsed */
        450,   /* blockIdx.x threadIdx.x collapsed */
        452,   /* blockIdx.x threadIdx.x collapsed */
    442, Generating implicit copyin(dataPrms,max) [if not already present]
    452, Generating implicit firstprivate(in_zf,val,padOffset,in_scale)
void TIDL_TensorMinMaxinFloat<short>(short const*, sTIDL_DataParams_t const*, float, float, float*, float*):
    442, Generating implicit copyin(min) [if not already present]
         Generating copyin(ptr[:dataPrms->dimValues+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+padOffset)))))]) [if not already present]
         Generating NVIDIA GPU code
        442, #pragma acc loop gang, vector(128) collapse(6) /* blockIdx.x threadIdx.x */
        444,   /* blockIdx.x threadIdx.x collapsed */
        446,   /* blockIdx.x threadIdx.x collapsed */
        448,   /* blockIdx.x threadIdx.x collapsed */
        450,   /* blockIdx.x threadIdx.x collapsed */
        452,   /* blockIdx.x threadIdx.x collapsed */
    442, Generating implicit copyin(dataPrms,max) [if not already present]
    452, Generating implicit firstprivate(in_zf,val,padOffset,in_scale)
void TIDL_TensorMinMaxinFloat<unsigned short>(unsigned short const*, sTIDL_DataParams_t const*, float, float, float*, float*):
    442, Generating implicit copyin(min) [if not already present]
         Generating copyin(ptr[:dataPrms->dimValues+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+padOffset)))))]) [if not already present]
         Generating NVIDIA GPU code
        442, #pragma acc loop gang, vector(128) collapse(6) /* blockIdx.x threadIdx.x */
        444,   /* blockIdx.x threadIdx.x collapsed */
        446,   /* blockIdx.x threadIdx.x collapsed */
        448,   /* blockIdx.x threadIdx.x collapsed */
        450,   /* blockIdx.x threadIdx.x collapsed */
        452,   /* blockIdx.x threadIdx.x collapsed */
    442, Generating implicit copyin(dataPrms,max) [if not already present]
    452, Generating implicit firstprivate(in_zf,val,padOffset,in_scale)
void TIDL_TensorMinMaxinFloat<int>(int const*, sTIDL_DataParams_t const*, float, float, float*, float*):
    442, Generating implicit copyin(min) [if not already present]
         Generating copyin(ptr[:dataPrms->dimValues+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+padOffset)))))]) [if not already present]
         Generating NVIDIA GPU code
        442, #pragma acc loop gang, vector(128) collapse(6) /* blockIdx.x threadIdx.x */
        444,   /* blockIdx.x threadIdx.x collapsed */
        446,   /* blockIdx.x threadIdx.x collapsed */
        448,   /* blockIdx.x threadIdx.x collapsed */
        450,   /* blockIdx.x threadIdx.x collapsed */
        452,   /* blockIdx.x threadIdx.x collapsed */
    442, Generating implicit copyin(dataPrms,max) [if not already present]
    452, Generating implicit firstprivate(in_zf,val,padOffset,in_scale)
void TIDL_TensorMinMaxinFloat<unsigned int>(unsigned int const*, sTIDL_DataParams_t const*, float, float, float*, float*):
    442, Generating implicit copyin(min) [if not already present]
         Generating copyin(ptr[:dataPrms->dimValues+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+padOffset)))))]) [if not already present]
         Generating NVIDIA GPU code
        442, #pragma acc loop gang, vector(128) collapse(6) /* blockIdx.x threadIdx.x */
        444,   /* blockIdx.x threadIdx.x collapsed */
        446,   /* blockIdx.x threadIdx.x collapsed */
        448,   /* blockIdx.x threadIdx.x collapsed */
        450,   /* blockIdx.x threadIdx.x collapsed */
        452,   /* blockIdx.x threadIdx.x collapsed */
    442, Generating implicit copyin(dataPrms,max) [if not already present]
    452, Generating implicit firstprivate(in_zf,val,padOffset,in_scale)
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvaccjsheBr6js0E0.gpu -computecap 86 -ptx /tmp/nvacczsheldW7IjaM.ptx -o /tmp/nvaccHsheJCqkPvHN.bin -ftz -cuda12020
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_dataConvert.c -sm 86 /tmp/nvaccHsheJCqkPvHN.bin -compute 86 /tmp/nvacczsheldW7IjaM.ptx -o /tmp/nvaccPshe7a0qqdmb.fat
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++XphetrbS4KwC.ll -S -o /tmp/nvc++5pheRBRitJwq.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++5pheRBRitJwq.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_dataConvert.obj
Unlinking /tmp/nvc++HpheJvgdQqMM.il
Unlinking /tmp/nvc++Pphe7bzW7sER.s
Unlinking /tmp/nvc++XphetrbS4KwC.ll
Unlinking /tmp/nvc++5pheRBRitJwq.llvm
compiling src/tidl_deconv2d.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_deconv2d.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++RChebMdJaQgj.il src/tidl_deconv2d.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_deconv2d.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_deconv2d.c -il /tmp/nvc++RChebMdJaQgj.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++RChebMdJaQgj.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_deconv2d.obj' -asm /tmp/nvc++BCherhJYZkLl.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++BCherhJYZkLl.ll -S -o /tmp/nvc++ZChezbCaz87x.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++ZChezbCaz87x.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_deconv2d.obj
Unlinking /tmp/nvc++RChebMdJaQgj.il
Unlinking /tmp/nvc++dChejwa9rDnz.s
Unlinking /tmp/nvc++BCherhJYZkLl.ll
Unlinking /tmp/nvc++ZChezbCaz87x.llvm
compiling src/tidl_depthToSpace.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_depthToSpace.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++XKhetojn52pn.il src/tidl_depthToSpace.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_depthToSpace.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_depthToSpace.c -il /tmp/nvc++XKhetojn52pn.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++XKhetojn52pn.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_depthToSpace.obj' -asm /tmp/nvc++bKhedgRPib1C.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++bKhedgRPib1C.ll -S -o /tmp/nvc++jKheB2J7cFCy.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++jKheB2J7cFCy.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_depthToSpace.obj
Unlinking /tmp/nvc++XKhetojn52pn.il
Unlinking /tmp/nvc++5KheRaZskrnc.s
Unlinking /tmp/nvc++bKhedgRPib1C.ll
Unlinking /tmp/nvc++jKheB2J7cFCy.llvm
compiling src/tidl_detectionOutput.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_detectionOutput.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++RShebT00TLNV.il src/tidl_detectionOutput.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_detectionOutput.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_detectionOutput.c -il /tmp/nvc++RShebT00TLNV.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++RShebT00TLNV.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_detectionOutput.obj' -asm /tmp/nvc++BSher_SrbdFy.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++BSher_SrbdFy.ll -S -o /tmp/nvc++ZShezW04ltNX.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++ZShezW04ltNX.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_detectionOutput.obj
Unlinking /tmp/nvc++RShebT00TLNV.il
Unlinking /tmp/nvc++dShejv68Kg3j.s
Unlinking /tmp/nvc++BSher_SrbdFy.ll
Unlinking /tmp/nvc++ZShezW04ltNX.llvm
compiling src/tidl_detectionOutput_score.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_detectionOutput_score.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++P0he7a5SFwdx.il src/tidl_detectionOutput_score.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_detectionOutput_score.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_detectionOutput_score.c -il /tmp/nvc++P0he7a5SFwdx.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++P0he7a5SFwdx.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_detectionOutput_score.obj' -asm /tmp/nvc++50heRkkTZlvW.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++50heRkkTZlvW.ll -S -o /tmp/nvc++b0hedG7wju5h.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++b0hedG7wju5h.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_detectionOutput_score.obj
Unlinking /tmp/nvc++P0he7a5SFwdx.il
Unlinking /tmp/nvc++X0hetiHSgZii.s
Unlinking /tmp/nvc++50heRkkTZlvW.ll
Unlinking /tmp/nvc++b0hedG7wju5h.llvm
compiling src/tidl_device_functions.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_device_functions.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++M8heYWtG4_cV.il src/tidl_device_functions.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_device_functions.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_device_functions.c -il /tmp/nvc++M8heYWtG4_cV.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++M8heYWtG4_cV.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_device_functions.obj' -asm /tmp/nvc++g8hesVa-eU7Z.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++g8hesVa-eU7Z.ll -S -o /tmp/nvc++28heIPgYMWQS.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++28heIPgYMWQS.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_device_functions.obj
Unlinking /tmp/nvc++M8heYWtG4_cV.il
Unlinking /tmp/nvc++w8hec3XT5gS7.s
Unlinking /tmp/nvc++g8hesVa-eU7Z.ll
Unlinking /tmp/nvc++28heIPgYMWQS.llvm
compiling src/tidl_eltWise.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_eltWise.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++7eieXIWqqMaA.il src/tidl_eltWise.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_eltWise.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_eltWise.c -il /tmp/nvc++7eieXIWqqMaA.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++7eieXIWqqMaA.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_eltWise.obj' -asm /tmp/nvc++ReiebaC4dffT.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++ReiebaC4dffT.ll -S -o /tmp/nvc++deiejaC9jUe_.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++deiejaC9jUe_.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_eltWise.obj
Unlinking /tmp/nvc++7eieXIWqqMaA.il
Unlinking /tmp/nvc++teie54uGHVVR.s
Unlinking /tmp/nvc++ReiebaC4dffT.ll
Unlinking /tmp/nvc++deiejaC9jUe_.llvm
compiling src/tidl_flatten.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_flatten.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++8mie0U19FG9L.il src/tidl_flatten.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_flatten.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_flatten.c -il /tmp/nvc++8mie0U19FG9L.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++8mie0U19FG9L.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_flatten.obj' -asm /tmp/nvc++8mie0panCmCN.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++8mie0panCmCN.ll -S -o /tmp/nvc++CmieuocJUMAn.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++CmieuocJUMAn.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_flatten.obj
Unlinking /tmp/nvc++8mie0U19FG9L.il
Unlinking /tmp/nvc++CmieuLJ62m2h.s
Unlinking /tmp/nvc++8mie0panCmCN.ll
Unlinking /tmp/nvc++CmieuocJUMAn.llvm
compiling src/tidl_function_mapping.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_function_mapping.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++GuieGS2SohYm.il src/tidl_function_mapping.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_function_mapping.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_function_mapping.c -il /tmp/nvc++GuieGS2SohYm.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++GuieGS2SohYm.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_function_mapping.obj' -asm /tmp/nvc++GuieGuorsK85.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++GuieGuorsK85.ll -S -o /tmp/nvc++GuieG4alGlBR.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++GuieG4alGlBR.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_function_mapping.obj
Unlinking /tmp/nvc++GuieGS2SohYm.il
Unlinking /tmp/nvc++GuieGqSdFOsr.s
Unlinking /tmp/nvc++GuieGuorsK85.ll
Unlinking /tmp/nvc++GuieG4alGlBR.llvm
compiling src/tidl_gatherLayer.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_gatherLayer.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++aCieaB0_dRUO.il src/tidl_gatherLayer.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_gatherLayer.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_gatherLayer.c -il /tmp/nvc++aCieaB0_dRUO.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++aCieaB0_dRUO.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_gatherLayer.obj' -asm /tmp/nvc++aCiealwGbqXv.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++aCiealwGbqXv.ll -S -o /tmp/nvc++aCieaJhdsZL9.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++aCieaJhdsZL9.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_gatherLayer.obj
Unlinking /tmp/nvc++aCieaB0_dRUO.il
Unlinking /tmp/nvc++aCieaZxvkO00.s
Unlinking /tmp/nvc++aCiealwGbqXv.ll
Unlinking /tmp/nvc++aCieaJhdsZL9.llvm
compiling src/tidl_innerProduct.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_innerProduct.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++bKiedFzxj5P3.il src/tidl_innerProduct.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_innerProduct.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_innerProduct.c -il /tmp/nvc++bKiedFzxj5P3.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++bKiedFzxj5P3.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_innerProduct.obj' -asm /tmp/nvc++rKieZsq4yd1u.ll
long TIDL_openaccShiftRight<long>(long, int):
     71, include "tidl_innerProduct.h"
          73, include "tidl_alg_int.h"
             1432, Generating acc routine seq
                   Generating NVIDIA GPU code
long TIDL_openaccShiftRightImpl<long>(long, int):
     71, include "tidl_innerProduct.h"
          73, include "tidl_alg_int.h"
             1426, Generating acc routine seq
                   Generating NVIDIA GPU code
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvaccrNieZoyCAS4b.gpu -computecap 86 -ptx /tmp/nvaccHNieJUQBQlW7.ptx -o /tmp/nvaccPNie7IGTXOr3.bin -ftz -cuda12020
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_innerProduct.c -sm 86 /tmp/nvaccPNie7IGTXOr3.bin -compute 86 /tmp/nvaccHNieJUQBQlW7.ptx -o /tmp/nvaccXNiet6eYyV_6.fat
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++rKieZsq4yd1u.ll -S -o /tmp/nvc++zKielPYgY-sM.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++zKielPYgY-sM.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_innerProduct.obj
Unlinking /tmp/nvc++bKiedFzxj5P3.il
Unlinking /tmp/nvc++jKieBxf3A9vD.s
Unlinking /tmp/nvc++rKieZsq4yd1u.ll
Unlinking /tmp/nvc++zKielPYgY-sM.llvm
compiling src/tidl_layerNorm.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_layerNorm.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++uXie83VR3NXb.il src/tidl_layerNorm.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_layerNorm.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_layerNorm.c -il /tmp/nvc++uXie83VR3NXb.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++uXie83VR3NXb.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_layerNorm.obj' -asm /tmp/nvc++uXie8YLR1FpY.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++uXie8YLR1FpY.ll -S -o /tmp/nvc++0XieCkuIg-9o.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++0XieCkuIg-9o.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_layerNorm.obj
Unlinking /tmp/nvc++uXie83VR3NXb.il
Unlinking /tmp/nvc++0XieC3UdEHYb.s
Unlinking /tmp/nvc++uXie8YLR1FpY.ll
Unlinking /tmp/nvc++0XieCkuIg-9o.llvm
compiling src/tidl_odOutputReformat.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_odOutputReformat.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++j5ieBysi-ttD.il src/tidl_odOutputReformat.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_odOutputReformat.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_odOutputReformat.c -il /tmp/nvc++j5ieBysi-ttD.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++j5ieBysi-ttD.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_odOutputReformat.obj' -asm /tmp/nvc++z5iel6q4t-wj.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++z5iel6q4t-wj.ll -S -o /tmp/nvc++H5ieJl4kPyVB.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++H5ieJl4kPyVB.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_odOutputReformat.obj
Unlinking /tmp/nvc++j5ieBysi-ttD.il
Unlinking /tmp/nvc++r5ieZBfZMc0u.s
Unlinking /tmp/nvc++z5iel6q4t-wj.ll
Unlinking /tmp/nvc++H5ieJl4kPyVB.llvm
compiling src/tidl_pad.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_pad.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++zbjelSHyIg6n.il src/tidl_pad.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_pad.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_pad.c -il /tmp/nvc++zbjelSHyIg6n.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++zbjelSHyIg6n.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_pad.obj' -asm /tmp/nvc++Pbje75ORX0fG.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++Pbje75ORX0fG.ll -S -o /tmp/nvc++XbjetjqbR5oz.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++XbjetjqbR5oz.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_pad.obj
Unlinking /tmp/nvc++zbjelSHyIg6n.il
Unlinking /tmp/nvc++HbjeJnpTJXYH.s
Unlinking /tmp/nvc++Pbje75ORX0fG.ll
Unlinking /tmp/nvc++XbjetjqbR5oz.llvm
compiling src/tidl_pooling.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_pooling.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++Bjjer0-tWQnd.il src/tidl_pooling.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_pooling.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_pooling.c -il /tmp/nvc++Bjjer0-tWQnd.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++Bjjer0-tWQnd.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_pooling.obj' -asm /tmp/nvc++ljjeHjXEJrH3.ll
void _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalMaxPooling<unsigned char>(unsigned char*, int, int, int, int, int, int, unsigned char*, sTIDL_Layer_t const*, sTIDL_DataParams_t*, sTIDL_Network_t*, sTIDL_AlgLayer_t const*):
    180, Generating copyout(outData[:((numOutChannels-1)*outChPitch)+((numBatches-1)*outBatchPitch)+1]) [if not already present]
         Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present]
    185, Generating implicit firstprivate(width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        185, #pragma acc loop gang collapse(2) /* blockIdx.x */
        187,   /* blockIdx.x collapsed */
        191, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
             Generating reduction(max:maxValue)
        193,   /* threadIdx.x collapsed */
    187, Generating implicit firstprivate(initValue,outChPitch,outBatchPitch,maxValue)
    191, Loop is parallelizable
    193, Loop is parallelizable
         Generating implicit firstprivate(inChPitch,inBatchPitch,input,inPitch)
void _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalMaxPooling<signed char>(signed char*, int, int, int, int, int, int, signed char*, sTIDL_Layer_t const*, sTIDL_DataParams_t*, sTIDL_Network_t*, sTIDL_AlgLayer_t const*):
    180, Generating copyout(outData[:((numOutChannels-1)*outChPitch)+((numBatches-1)*outBatchPitch)+1]) [if not already present]
         Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present]
    185, Generating implicit firstprivate(width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        185, #pragma acc loop gang collapse(2) /* blockIdx.x */
        187,   /* blockIdx.x collapsed */
        191, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
             Generating reduction(max:maxValue)
        193,   /* threadIdx.x collapsed */
    187, Generating implicit firstprivate(initValue,outChPitch,outBatchPitch,maxValue)
    191, Loop is parallelizable
    193, Loop is parallelizable
         Generating implicit firstprivate(inChPitch,inBatchPitch,input,inPitch)
void _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalMaxPooling<unsigned short>(unsigned short*, int, int, int, int, int, int, unsigned short*, sTIDL_Layer_t const*, sTIDL_DataParams_t*, sTIDL_Network_t*, sTIDL_AlgLayer_t const*):
    180, Generating copyout(outData[:((numOutChannels-1)*outChPitch)+((numBatches-1)*outBatchPitch)+1]) [if not already present]
         Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present]
    185, Generating implicit firstprivate(width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        185, #pragma acc loop gang collapse(2) /* blockIdx.x */
        187,   /* blockIdx.x collapsed */
        191, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
             Generating reduction(max:maxValue)
        193,   /* threadIdx.x collapsed */
    187, Generating implicit firstprivate(initValue,outChPitch,outBatchPitch,maxValue)
    191, Loop is parallelizable
    193, Loop is parallelizable
         Generating implicit firstprivate(inChPitch,inBatchPitch,input,inPitch)
void _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalMaxPooling<short>(short*, int, int, int, int, int, int, short*, sTIDL_Layer_t const*, sTIDL_DataParams_t*, sTIDL_Network_t*, sTIDL_AlgLayer_t const*):
    180, Generating copyout(outData[:((numOutChannels-1)*outChPitch)+((numBatches-1)*outBatchPitch)+1]) [if not already present]
         Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present]
    185, Generating implicit firstprivate(width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        185, #pragma acc loop gang collapse(2) /* blockIdx.x */
        187,   /* blockIdx.x collapsed */
        191, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
             Generating reduction(max:maxValue)
        193,   /* threadIdx.x collapsed */
    187, Generating implicit firstprivate(initValue,outChPitch,outBatchPitch,maxValue)
    191, Loop is parallelizable
    193, Loop is parallelizable
         Generating implicit firstprivate(inChPitch,inBatchPitch,input,inPitch)
void _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalMaxPooling<float>(float*, int, int, int, int, int, int, float*, sTIDL_Layer_t const*, sTIDL_DataParams_t*, sTIDL_Network_t*, sTIDL_AlgLayer_t const*):
    180, Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present]
         Generating copyout(outData[:((numOutChannels-1)*outChPitch)+((numBatches-1)*outBatchPitch)+1]) [if not already present]
    185, Generating implicit firstprivate(width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        185, #pragma acc loop gang collapse(2) /* blockIdx.x */
        187,   /* blockIdx.x collapsed */
        191, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
             Generating reduction(max:maxValue)
        193,   /* threadIdx.x collapsed */
    187, Generating implicit firstprivate(initValue,outChPitch,outBatchPitch,maxValue)
    191, Loop is parallelizable
    193, Loop is parallelizable
         Generating implicit firstprivate(inChPitch,inBatchPitch,input,inPitch)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPoolingv2<signed char, signed char, int, unsigned char>(sTIDL_Network_t*, signed char*, int, int, int, int, int, int, signed char*, int*, unsigned char, int, unsigned char, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
    261, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1])
         Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present]
         Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present]
    268, Generating implicit firstprivate(numOutChannels,width,numBatches,height)
         Generating NVIDIA GPU code
        268, #pragma acc loop gang collapse(2) /* blockIdx.x */
        270,   /* blockIdx.x collapsed */
        274, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
             Generating reduction(+:sumBlock)
        276,   /* threadIdx.x collapsed */
    270, Generating implicit firstprivate(sumBlock,scaleValue,result,outChPitch,max,biasTerm,outBatchPitch,min)
    274, Loop is parallelizable
    276, Loop is parallelizable
         Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch)
    326, Generating implicit firstprivate(numOutChannels,numBatches)
         Generating NVIDIA GPU code
        326, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
        328,   /* blockIdx.x threadIdx.x collapsed */
    328, Generating implicit firstprivate(outBatchPitch,mmaShift,outChPitch,tempAcc,satLow,mixedPrecision,satHigh,result)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling<unsigned char, unsigned char, unsigned int, unsigned int>(sTIDL_Network_t*, unsigned char*, int, int, int, int, int, int, unsigned char*, unsigned int*, unsigned int, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
    435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1])
         Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present]
         Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present]
    442, Generating implicit firstprivate(width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        442, #pragma acc loop gang collapse(2) /* blockIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
        444,   /* blockIdx.x collapsed */
        448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
             Generating reduction(+:sumBlock)
        450,   /* threadIdx.x collapsed */
    442, Generating implicit copy(max,min) [if not already present]
    444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch)
    448, Loop is parallelizable
    450, Loop is parallelizable
         Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch)
    513, Generating implicit firstprivate(numOutChannels,numBatches)
         Generating NVIDIA GPU code
        513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
        515,   /* blockIdx.x threadIdx.x collapsed */
    515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling<unsigned char, signed char, int, int>(sTIDL_Network_t*, unsigned char*, int, int, int, int, int, int, signed char*, int*, int, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
    435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1])
         Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present]
         Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present]
    442, Generating implicit firstprivate(width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        442, #pragma acc loop gang collapse(2) /* blockIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
        444,   /* blockIdx.x collapsed */
        448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
             Generating reduction(+:sumBlock)
        450,   /* threadIdx.x collapsed */
    442, Generating implicit copy(max,min) [if not already present]
    444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch)
    448, Loop is parallelizable
    450, Loop is parallelizable
         Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch)
    513, Generating implicit firstprivate(numOutChannels,numBatches)
         Generating NVIDIA GPU code
        513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
        515,   /* blockIdx.x threadIdx.x collapsed */
    515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling<unsigned char, unsigned short, unsigned long, unsigned long>(sTIDL_Network_t*, unsigned char*, int, int, int, int, int, int, unsigned short*, unsigned long*, unsigned long, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
    435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1])
         Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present]
         Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present]
    442, Generating implicit firstprivate(width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        442, #pragma acc loop gang collapse(2) /* blockIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
        444,   /* blockIdx.x collapsed */
        448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
             Generating reduction(+:sumBlock)
        450,   /* threadIdx.x collapsed */
    442, Generating implicit copy(max,min) [if not already present]
    444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch)
    448, Loop is parallelizable
    450, Loop is parallelizable
         Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch)
    513, Generating implicit firstprivate(numOutChannels,numBatches)
         Generating NVIDIA GPU code
        513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
        515,   /* blockIdx.x threadIdx.x collapsed */
    515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling<unsigned char, short, long, long>(sTIDL_Network_t*, unsigned char*, int, int, int, int, int, int, short*, long*, long, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
    435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1])
         Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present]
         Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present]
    442, Generating implicit firstprivate(width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        442, #pragma acc loop gang collapse(2) /* blockIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
        444,   /* blockIdx.x collapsed */
        448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
             Generating reduction(+:sumBlock)
        450,   /* threadIdx.x collapsed */
    442, Generating implicit copy(max,min) [if not already present]
    444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch)
    448, Loop is parallelizable
    450, Loop is parallelizable
         Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch)
    513, Generating implicit firstprivate(numOutChannels,numBatches)
         Generating NVIDIA GPU code
        513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
        515,   /* blockIdx.x threadIdx.x collapsed */
    515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling<signed char, unsigned char, unsigned int, unsigned int>(sTIDL_Network_t*, signed char*, int, int, int, int, int, int, unsigned char*, unsigned int*, unsigned int, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
    435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1])
         Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present]
         Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present]
    442, Generating implicit firstprivate(width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        442, #pragma acc loop gang collapse(2) /* blockIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
        444,   /* blockIdx.x collapsed */
        448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
             Generating reduction(+:sumBlock)
        450,   /* threadIdx.x collapsed */
    442, Generating implicit copy(max,min) [if not already present]
    444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch)
    448, Loop is parallelizable
    450, Loop is parallelizable
         Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch)
    513, Generating implicit firstprivate(numOutChannels,numBatches)
         Generating NVIDIA GPU code
        513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
        515,   /* blockIdx.x threadIdx.x collapsed */
    515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling<signed char, signed char, int, int>(sTIDL_Network_t*, signed char*, int, int, int, int, int, int, signed char*, int*, int, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
    435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1])
         Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present]
         Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present]
    442, Generating implicit firstprivate(width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        442, #pragma acc loop gang collapse(2) /* blockIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
        444,   /* blockIdx.x collapsed */
        448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
             Generating reduction(+:sumBlock)
        450,   /* threadIdx.x collapsed */
    442, Generating implicit copy(max,min) [if not already present]
    444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch)
    448, Loop is parallelizable
    450, Loop is parallelizable
         Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch)
    513, Generating implicit firstprivate(numOutChannels,numBatches)
         Generating NVIDIA GPU code
        513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
        515,   /* blockIdx.x threadIdx.x collapsed */
    515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling<signed char, unsigned short, unsigned long, unsigned long>(sTIDL_Network_t*, signed char*, int, int, int, int, int, int, unsigned short*, unsigned long*, unsigned long, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
    435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1])
         Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present]
         Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present]
    442, Generating implicit firstprivate(width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        442, #pragma acc loop gang collapse(2) /* blockIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
        444,   /* blockIdx.x collapsed */
        448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
             Generating reduction(+:sumBlock)
        450,   /* threadIdx.x collapsed */
    442, Generating implicit copy(max,min) [if not already present]
    444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch)
    448, Loop is parallelizable
    450, Loop is parallelizable
         Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch)
    513, Generating implicit firstprivate(numOutChannels,numBatches)
         Generating NVIDIA GPU code
        513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
        515,   /* blockIdx.x threadIdx.x collapsed */
    515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling<signed char, short, long, long>(sTIDL_Network_t*, signed char*, int, int, int, int, int, int, short*, long*, long, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
    435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1])
         Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present]
         Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present]
    442, Generating implicit firstprivate(width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        442, #pragma acc loop gang collapse(2) /* blockIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
        444,   /* blockIdx.x collapsed */
        448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
             Generating reduction(+:sumBlock)
        450,   /* threadIdx.x collapsed */
    442, Generating implicit copy(max,min) [if not already present]
    444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch)
    448, Loop is parallelizable
    450, Loop is parallelizable
         Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch)
    513, Generating implicit firstprivate(numOutChannels,numBatches)
         Generating NVIDIA GPU code
        513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
        515,   /* blockIdx.x threadIdx.x collapsed */
    515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling<unsigned short, unsigned char, unsigned int, unsigned int>(sTIDL_Network_t*, unsigned short*, int, int, int, int, int, int, unsigned char*, unsigned int*, unsigned int, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
    435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1])
         Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present]
         Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present]
    442, Generating implicit firstprivate(width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        442, #pragma acc loop gang collapse(2) /* blockIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
        444,   /* blockIdx.x collapsed */
        448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
             Generating reduction(+:sumBlock)
        450,   /* threadIdx.x collapsed */
    442, Generating implicit copy(max,min) [if not already present]
    444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch)
    448, Loop is parallelizable
    450, Loop is parallelizable
         Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch)
    513, Generating implicit firstprivate(numOutChannels,numBatches)
         Generating NVIDIA GPU code
        513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
        515,   /* blockIdx.x threadIdx.x collapsed */
    515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling<unsigned short, signed char, int, int>(sTIDL_Network_t*, unsigned short*, int, int, int, int, int, int, signed char*, int*, int, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
    435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1])
         Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present]
         Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present]
    442, Generating implicit firstprivate(width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        442, #pragma acc loop gang collapse(2) /* blockIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
        444,   /* blockIdx.x collapsed */
        448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
             Generating reduction(+:sumBlock)
        450,   /* threadIdx.x collapsed */
    442, Generating implicit copy(max,min) [if not already present]
    444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch)
    448, Loop is parallelizable
    450, Loop is parallelizable
         Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch)
    513, Generating implicit firstprivate(numOutChannels,numBatches)
         Generating NVIDIA GPU code
        513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
        515,   /* blockIdx.x threadIdx.x collapsed */
    515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling<unsigned short, unsigned short, unsigned long, unsigned long>(sTIDL_Network_t*, unsigned short*, int, int, int, int, int, int, unsigned short*, unsigned long*, unsigned long, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
    435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1])
         Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present]
         Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present]
    442, Generating implicit firstprivate(width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        442, #pragma acc loop gang collapse(2) /* blockIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
        444,   /* blockIdx.x collapsed */
        448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
             Generating reduction(+:sumBlock)
        450,   /* threadIdx.x collapsed */
    442, Generating implicit copy(max,min) [if not already present]
    444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch)
    448, Loop is parallelizable
    450, Loop is parallelizable
         Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch)
    513, Generating implicit firstprivate(numOutChannels,numBatches)
         Generating NVIDIA GPU code
        513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
        515,   /* blockIdx.x threadIdx.x collapsed */
    515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling<unsigned short, short, long, long>(sTIDL_Network_t*, unsigned short*, int, int, int, int, int, int, short*, long*, long, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
    435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1])
         Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present]
         Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present]
    442, Generating implicit firstprivate(width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        442, #pragma acc loop gang collapse(2) /* blockIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
        444,   /* blockIdx.x collapsed */
        448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
             Generating reduction(+:sumBlock)
        450,   /* threadIdx.x collapsed */
    442, Generating implicit copy(max,min) [if not already present]
    444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch)
    448, Loop is parallelizable
    450, Loop is parallelizable
         Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch)
    513, Generating implicit firstprivate(numOutChannels,numBatches)
         Generating NVIDIA GPU code
        513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
        515,   /* blockIdx.x threadIdx.x collapsed */
    515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling<short, unsigned char, unsigned int, unsigned int>(sTIDL_Network_t*, short*, int, int, int, int, int, int, unsigned char*, unsigned int*, unsigned int, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
    435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1])
         Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present]
         Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present]
    442, Generating implicit firstprivate(width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        442, #pragma acc loop gang collapse(2) /* blockIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
        444,   /* blockIdx.x collapsed */
        448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
             Generating reduction(+:sumBlock)
        450,   /* threadIdx.x collapsed */
    442, Generating implicit copy(max,min) [if not already present]
    444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch)
    448, Loop is parallelizable
    450, Loop is parallelizable
         Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch)
    513, Generating implicit firstprivate(numOutChannels,numBatches)
         Generating NVIDIA GPU code
        513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
        515,   /* blockIdx.x threadIdx.x collapsed */
    515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling<short, signed char, int, int>(sTIDL_Network_t*, short*, int, int, int, int, int, int, signed char*, int*, int, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
    435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1])
         Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present]
         Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present]
    442, Generating implicit firstprivate(width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        442, #pragma acc loop gang collapse(2) /* blockIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
        444,   /* blockIdx.x collapsed */
        448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
             Generating reduction(+:sumBlock)
        450,   /* threadIdx.x collapsed */
    442, Generating implicit copy(max,min) [if not already present]
    444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch)
    448, Loop is parallelizable
    450, Loop is parallelizable
         Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch)
    513, Generating implicit firstprivate(numOutChannels,numBatches)
         Generating NVIDIA GPU code
        513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
        515,   /* blockIdx.x threadIdx.x collapsed */
    515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling<short, unsigned short, unsigned long, unsigned long>(sTIDL_Network_t*, short*, int, int, int, int, int, int, unsigned short*, unsigned long*, unsigned long, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
    435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1])
         Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present]
         Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present]
    442, Generating implicit firstprivate(width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        442, #pragma acc loop gang collapse(2) /* blockIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
        444,   /* blockIdx.x collapsed */
        448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
             Generating reduction(+:sumBlock)
        450,   /* threadIdx.x collapsed */
    442, Generating implicit copy(max,min) [if not already present]
    444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch)
    448, Loop is parallelizable
    450, Loop is parallelizable
         Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch)
    513, Generating implicit firstprivate(numOutChannels,numBatches)
         Generating NVIDIA GPU code
        513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
        515,   /* blockIdx.x threadIdx.x collapsed */
    515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling<short, short, long, long>(sTIDL_Network_t*, short*, int, int, int, int, int, int, short*, long*, long, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
    435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1])
         Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present]
         Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present]
    442, Generating implicit firstprivate(width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        442, #pragma acc loop gang collapse(2) /* blockIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
        444,   /* blockIdx.x collapsed */
        448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
             Generating reduction(+:sumBlock)
        450,   /* threadIdx.x collapsed */
    442, Generating implicit copy(max,min) [if not already present]
    444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch)
    448, Loop is parallelizable
    450, Loop is parallelizable
         Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch)
    513, Generating implicit firstprivate(numOutChannels,numBatches)
         Generating NVIDIA GPU code
        513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
        515,   /* blockIdx.x threadIdx.x collapsed */
    515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling<float, float, float, float>(sTIDL_Network_t*, float*, int, int, int, int, int, int, float*, float*, float, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
    435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1])
         Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present]
         Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present]
    442, Generating implicit firstprivate(width,numOutChannels,numBatches,height)
         Generating NVIDIA GPU code
        442, #pragma acc loop gang collapse(2) /* blockIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
        444,   /* blockIdx.x collapsed */
        448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
             Generating reduction(+:sumBlock)
        450,   /* threadIdx.x collapsed */
    442, Generating implicit copy(max,min) [if not already present]
    444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch)
    448, Loop is parallelizable
    450, Loop is parallelizable
         Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch)
    513, Generating implicit firstprivate(numBatches,numOutChannels)
         Generating NVIDIA GPU code
        513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
        515,   /* blockIdx.x threadIdx.x collapsed */
    513, Generating implicit copy(net) [if not already present]
    515, Generating implicit firstprivate(outBatchPitch,layerIdx,result,outChPitch)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling<unsigned char, unsigned char, int>(sTIDL_Network_t*, unsigned char*, int, int, int, int, int, int, int, int, int, int, int, int, unsigned char*, int*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
   1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))])
         Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present]
         Generating copyin(inData[:]) [if not already present]
   1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols)
         Generating NVIDIA GPU code
       1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1237,   /* blockIdx.x threadIdx.x collapsed */
       1239,   /* blockIdx.x threadIdx.x collapsed */
       1241,   /* blockIdx.x threadIdx.x collapsed */
       1269, #pragma acc loop seq
       1272, #pragma acc loop seq
   1235, Generating implicit copy(min,max) [if not already present]
   1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width)
   1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch)
   1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols)
         Generating NVIDIA GPU code
       1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1359,   /* blockIdx.x threadIdx.x collapsed */
       1361,   /* blockIdx.x threadIdx.x collapsed */
       1363,   /* blockIdx.x threadIdx.x collapsed */
   1357, Generating implicit copy(min,max) [if not already present]
   1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling<unsigned char, signed char, int>(sTIDL_Network_t*, unsigned char*, int, int, int, int, int, int, int, int, int, int, int, int, signed char*, int*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
   1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))])
         Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present]
         Generating copyin(inData[:]) [if not already present]
   1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols)
         Generating NVIDIA GPU code
       1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1237,   /* blockIdx.x threadIdx.x collapsed */
       1239,   /* blockIdx.x threadIdx.x collapsed */
       1241,   /* blockIdx.x threadIdx.x collapsed */
       1269, #pragma acc loop seq
       1272, #pragma acc loop seq
   1235, Generating implicit copy(min,max) [if not already present]
   1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width)
   1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch)
   1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols)
         Generating NVIDIA GPU code
       1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1359,   /* blockIdx.x threadIdx.x collapsed */
       1361,   /* blockIdx.x threadIdx.x collapsed */
       1363,   /* blockIdx.x threadIdx.x collapsed */
   1357, Generating implicit copy(min,max) [if not already present]
   1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling<unsigned char, unsigned short, long>(sTIDL_Network_t*, unsigned char*, int, int, int, int, int, int, int, int, int, int, int, int, unsigned short*, long*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
   1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))])
         Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present]
         Generating copyin(inData[:]) [if not already present]
   1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols)
         Generating NVIDIA GPU code
       1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1237,   /* blockIdx.x threadIdx.x collapsed */
       1239,   /* blockIdx.x threadIdx.x collapsed */
       1241,   /* blockIdx.x threadIdx.x collapsed */
       1269, #pragma acc loop seq
       1272, #pragma acc loop seq
   1235, Generating implicit copy(min,max) [if not already present]
   1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width)
   1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch)
   1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols)
         Generating NVIDIA GPU code
       1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1359,   /* blockIdx.x threadIdx.x collapsed */
       1361,   /* blockIdx.x threadIdx.x collapsed */
       1363,   /* blockIdx.x threadIdx.x collapsed */
   1357, Generating implicit copy(min,max) [if not already present]
   1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling<unsigned char, short, long>(sTIDL_Network_t*, unsigned char*, int, int, int, int, int, int, int, int, int, int, int, int, short*, long*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
   1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))])
         Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present]
         Generating copyin(inData[:]) [if not already present]
   1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols)
         Generating NVIDIA GPU code
       1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1237,   /* blockIdx.x threadIdx.x collapsed */
       1239,   /* blockIdx.x threadIdx.x collapsed */
       1241,   /* blockIdx.x threadIdx.x collapsed */
       1269, #pragma acc loop seq
       1272, #pragma acc loop seq
   1235, Generating implicit copy(min,max) [if not already present]
   1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width)
   1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch)
   1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols)
         Generating NVIDIA GPU code
       1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1359,   /* blockIdx.x threadIdx.x collapsed */
       1361,   /* blockIdx.x threadIdx.x collapsed */
       1363,   /* blockIdx.x threadIdx.x collapsed */
   1357, Generating implicit copy(min,max) [if not already present]
   1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling<signed char, unsigned char, int>(sTIDL_Network_t*, signed char*, int, int, int, int, int, int, int, int, int, int, int, int, unsigned char*, int*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
   1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))])
         Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present]
         Generating copyin(inData[:]) [if not already present]
   1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols)
         Generating NVIDIA GPU code
       1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1237,   /* blockIdx.x threadIdx.x collapsed */
       1239,   /* blockIdx.x threadIdx.x collapsed */
       1241,   /* blockIdx.x threadIdx.x collapsed */
       1269, #pragma acc loop seq
       1272, #pragma acc loop seq
   1235, Generating implicit copy(min,max) [if not already present]
   1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width)
   1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch)
   1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols)
         Generating NVIDIA GPU code
       1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1359,   /* blockIdx.x threadIdx.x collapsed */
       1361,   /* blockIdx.x threadIdx.x collapsed */
       1363,   /* blockIdx.x threadIdx.x collapsed */
   1357, Generating implicit copy(min,max) [if not already present]
   1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling<signed char, signed char, int>(sTIDL_Network_t*, signed char*, int, int, int, int, int, int, int, int, int, int, int, int, signed char*, int*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
   1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))])
         Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present]
         Generating copyin(inData[:]) [if not already present]
   1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols)
         Generating NVIDIA GPU code
       1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1237,   /* blockIdx.x threadIdx.x collapsed */
       1239,   /* blockIdx.x threadIdx.x collapsed */
       1241,   /* blockIdx.x threadIdx.x collapsed */
       1269, #pragma acc loop seq
       1272, #pragma acc loop seq
   1235, Generating implicit copy(min,max) [if not already present]
   1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width)
   1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch)
   1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols)
         Generating NVIDIA GPU code
       1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1359,   /* blockIdx.x threadIdx.x collapsed */
       1361,   /* blockIdx.x threadIdx.x collapsed */
       1363,   /* blockIdx.x threadIdx.x collapsed */
   1357, Generating implicit copy(min,max) [if not already present]
   1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling<signed char, unsigned short, long>(sTIDL_Network_t*, signed char*, int, int, int, int, int, int, int, int, int, int, int, int, unsigned short*, long*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
   1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))])
         Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present]
         Generating copyin(inData[:]) [if not already present]
   1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols)
         Generating NVIDIA GPU code
       1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1237,   /* blockIdx.x threadIdx.x collapsed */
       1239,   /* blockIdx.x threadIdx.x collapsed */
       1241,   /* blockIdx.x threadIdx.x collapsed */
       1269, #pragma acc loop seq
       1272, #pragma acc loop seq
   1235, Generating implicit copy(min,max) [if not already present]
   1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width)
   1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch)
   1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols)
         Generating NVIDIA GPU code
       1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1359,   /* blockIdx.x threadIdx.x collapsed */
       1361,   /* blockIdx.x threadIdx.x collapsed */
       1363,   /* blockIdx.x threadIdx.x collapsed */
   1357, Generating implicit copy(min,max) [if not already present]
   1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling<signed char, short, long>(sTIDL_Network_t*, signed char*, int, int, int, int, int, int, int, int, int, int, int, int, short*, long*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
   1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))])
         Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present]
         Generating copyin(inData[:]) [if not already present]
   1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols)
         Generating NVIDIA GPU code
       1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1237,   /* blockIdx.x threadIdx.x collapsed */
       1239,   /* blockIdx.x threadIdx.x collapsed */
       1241,   /* blockIdx.x threadIdx.x collapsed */
       1269, #pragma acc loop seq
       1272, #pragma acc loop seq
   1235, Generating implicit copy(min,max) [if not already present]
   1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width)
   1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch)
   1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols)
         Generating NVIDIA GPU code
       1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1359,   /* blockIdx.x threadIdx.x collapsed */
       1361,   /* blockIdx.x threadIdx.x collapsed */
       1363,   /* blockIdx.x threadIdx.x collapsed */
   1357, Generating implicit copy(min,max) [if not already present]
   1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling<unsigned short, unsigned char, int>(sTIDL_Network_t*, unsigned short*, int, int, int, int, int, int, int, int, int, int, int, int, unsigned char*, int*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
   1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))])
         Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present]
         Generating copyin(inData[:]) [if not already present]
   1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols)
         Generating NVIDIA GPU code
       1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1237,   /* blockIdx.x threadIdx.x collapsed */
       1239,   /* blockIdx.x threadIdx.x collapsed */
       1241,   /* blockIdx.x threadIdx.x collapsed */
       1269, #pragma acc loop seq
       1272, #pragma acc loop seq
   1235, Generating implicit copy(min,max) [if not already present]
   1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width)
   1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch)
   1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols)
         Generating NVIDIA GPU code
       1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1359,   /* blockIdx.x threadIdx.x collapsed */
       1361,   /* blockIdx.x threadIdx.x collapsed */
       1363,   /* blockIdx.x threadIdx.x collapsed */
   1357, Generating implicit copy(min,max) [if not already present]
   1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling<unsigned short, signed char, int>(sTIDL_Network_t*, unsigned short*, int, int, int, int, int, int, int, int, int, int, int, int, signed char*, int*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
   1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))])
         Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present]
         Generating copyin(inData[:]) [if not already present]
   1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols)
         Generating NVIDIA GPU code
       1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1237,   /* blockIdx.x threadIdx.x collapsed */
       1239,   /* blockIdx.x threadIdx.x collapsed */
       1241,   /* blockIdx.x threadIdx.x collapsed */
       1269, #pragma acc loop seq
       1272, #pragma acc loop seq
   1235, Generating implicit copy(min,max) [if not already present]
   1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width)
   1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch)
   1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols)
         Generating NVIDIA GPU code
       1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1359,   /* blockIdx.x threadIdx.x collapsed */
       1361,   /* blockIdx.x threadIdx.x collapsed */
       1363,   /* blockIdx.x threadIdx.x collapsed */
   1357, Generating implicit copy(min,max) [if not already present]
   1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling<unsigned short, unsigned short, long>(sTIDL_Network_t*, unsigned short*, int, int, int, int, int, int, int, int, int, int, int, int, unsigned short*, long*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
   1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))])
         Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present]
         Generating copyin(inData[:]) [if not already present]
   1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols)
         Generating NVIDIA GPU code
       1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1237,   /* blockIdx.x threadIdx.x collapsed */
       1239,   /* blockIdx.x threadIdx.x collapsed */
       1241,   /* blockIdx.x threadIdx.x collapsed */
       1269, #pragma acc loop seq
       1272, #pragma acc loop seq
   1235, Generating implicit copy(min,max) [if not already present]
   1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width)
   1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch)
   1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols)
         Generating NVIDIA GPU code
       1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1359,   /* blockIdx.x threadIdx.x collapsed */
       1361,   /* blockIdx.x threadIdx.x collapsed */
       1363,   /* blockIdx.x threadIdx.x collapsed */
   1357, Generating implicit copy(min,max) [if not already present]
   1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling<unsigned short, short, long>(sTIDL_Network_t*, unsigned short*, int, int, int, int, int, int, int, int, int, int, int, int, short*, long*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
   1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))])
         Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present]
         Generating copyin(inData[:]) [if not already present]
   1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols)
         Generating NVIDIA GPU code
       1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1237,   /* blockIdx.x threadIdx.x collapsed */
       1239,   /* blockIdx.x threadIdx.x collapsed */
       1241,   /* blockIdx.x threadIdx.x collapsed */
       1269, #pragma acc loop seq
       1272, #pragma acc loop seq
   1235, Generating implicit copy(min,max) [if not already present]
   1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width)
   1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch)
   1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols)
         Generating NVIDIA GPU code
       1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1359,   /* blockIdx.x threadIdx.x collapsed */
       1361,   /* blockIdx.x threadIdx.x collapsed */
       1363,   /* blockIdx.x threadIdx.x collapsed */
   1357, Generating implicit copy(min,max) [if not already present]
   1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling<short, unsigned char, int>(sTIDL_Network_t*, short*, int, int, int, int, int, int, int, int, int, int, int, int, unsigned char*, int*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
   1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))])
         Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present]
         Generating copyin(inData[:]) [if not already present]
   1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols)
         Generating NVIDIA GPU code
       1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1237,   /* blockIdx.x threadIdx.x collapsed */
       1239,   /* blockIdx.x threadIdx.x collapsed */
       1241,   /* blockIdx.x threadIdx.x collapsed */
       1269, #pragma acc loop seq
       1272, #pragma acc loop seq
   1235, Generating implicit copy(min,max) [if not already present]
   1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width)
   1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch)
   1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols)
         Generating NVIDIA GPU code
       1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1359,   /* blockIdx.x threadIdx.x collapsed */
       1361,   /* blockIdx.x threadIdx.x collapsed */
       1363,   /* blockIdx.x threadIdx.x collapsed */
   1357, Generating implicit copy(min,max) [if not already present]
   1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling<short, signed char, int>(sTIDL_Network_t*, short*, int, int, int, int, int, int, int, int, int, int, int, int, signed char*, int*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
   1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))])
         Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present]
         Generating copyin(inData[:]) [if not already present]
   1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols)
         Generating NVIDIA GPU code
       1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1237,   /* blockIdx.x threadIdx.x collapsed */
       1239,   /* blockIdx.x threadIdx.x collapsed */
       1241,   /* blockIdx.x threadIdx.x collapsed */
       1269, #pragma acc loop seq
       1272, #pragma acc loop seq
   1235, Generating implicit copy(min,max) [if not already present]
   1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width)
   1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch)
   1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols)
         Generating NVIDIA GPU code
       1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1359,   /* blockIdx.x threadIdx.x collapsed */
       1361,   /* blockIdx.x threadIdx.x collapsed */
       1363,   /* blockIdx.x threadIdx.x collapsed */
   1357, Generating implicit copy(min,max) [if not already present]
   1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling<short, unsigned short, long>(sTIDL_Network_t*, short*, int, int, int, int, int, int, int, int, int, int, int, int, unsigned short*, long*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
   1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))])
         Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present]
         Generating copyin(inData[:]) [if not already present]
   1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols)
         Generating NVIDIA GPU code
       1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1237,   /* blockIdx.x threadIdx.x collapsed */
       1239,   /* blockIdx.x threadIdx.x collapsed */
       1241,   /* blockIdx.x threadIdx.x collapsed */
       1269, #pragma acc loop seq
       1272, #pragma acc loop seq
   1235, Generating implicit copy(min,max) [if not already present]
   1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width)
   1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch)
   1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols)
         Generating NVIDIA GPU code
       1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1359,   /* blockIdx.x threadIdx.x collapsed */
       1361,   /* blockIdx.x threadIdx.x collapsed */
       1363,   /* blockIdx.x threadIdx.x collapsed */
   1357, Generating implicit copy(min,max) [if not already present]
   1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling<short, short, long>(sTIDL_Network_t*, short*, int, int, int, int, int, int, int, int, int, int, int, int, short*, long*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
   1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))])
         Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present]
         Generating copyin(inData[:]) [if not already present]
   1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols)
         Generating NVIDIA GPU code
       1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1237,   /* blockIdx.x threadIdx.x collapsed */
       1239,   /* blockIdx.x threadIdx.x collapsed */
       1241,   /* blockIdx.x threadIdx.x collapsed */
       1269, #pragma acc loop seq
       1272, #pragma acc loop seq
   1235, Generating implicit copy(min,max) [if not already present]
   1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width)
   1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch)
   1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols)
         Generating NVIDIA GPU code
       1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1359,   /* blockIdx.x threadIdx.x collapsed */
       1361,   /* blockIdx.x threadIdx.x collapsed */
       1363,   /* blockIdx.x threadIdx.x collapsed */
   1357, Generating implicit copy(min,max) [if not already present]
   1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits)
int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling<float, float, float>(sTIDL_Network_t*, float*, int, int, int, int, int, int, int, int, int, int, int, int, float*, float*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*):
   1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))])
         Generating copyin(inData[:]) [if not already present]
         Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present]
   1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols)
         Generating NVIDIA GPU code
       1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1237,   /* blockIdx.x threadIdx.x collapsed */
       1239,   /* blockIdx.x threadIdx.x collapsed */
       1241,   /* blockIdx.x threadIdx.x collapsed */
       1269, #pragma acc loop seq
       1272, #pragma acc loop seq
   1235, Generating implicit copy(max,min) [if not already present]
   1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,padW,outChPitch,outBatchPitch,kernelW,width)
   1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch)
   1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols)
         Generating NVIDIA GPU code
       1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
             Generating reduction(min:min)
             Generating reduction(max:max)
       1359,   /* blockIdx.x threadIdx.x collapsed */
       1361,   /* blockIdx.x threadIdx.x collapsed */
       1363,   /* blockIdx.x threadIdx.x collapsed */
   1357, Generating implicit copy(net,min,max) [if not already present]
   1363, Generating implicit firstprivate(outChPitch,outBatchPitch,layerIdx,result,outPitch)
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvaccjmjeBVZUsold.gpu -computecap 86 -ptx /tmp/nvacczmjelbFOIdnm.ptx -o /tmp/nvaccHmjeJk3CPFv-.bin -ftz -cuda12020
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_pooling.c -sm 86 /tmp/nvaccHmjeJk3CPFv-.bin -compute 86 /tmp/nvacczmjelbFOIdnm.ptx -o /tmp/nvaccPmje7Eq4qDHx.fat
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++ljjeHjXEJrH3.ll -S -o /tmp/nvc++JjjePvAMPwDo.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++JjjePvAMPwDo.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_pooling.obj
Unlinking /tmp/nvc++Bjjer0-tWQnd.il
Unlinking /tmp/nvc++ZjjezwEnboPn.s
Unlinking /tmp/nvc++ljjeHjXEJrH3.ll
Unlinking /tmp/nvc++JjjePvAMPwDo.llvm
compiling src/tidl_preEmption.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_preEmption.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++IwjeM0Q9XRsg.il src/tidl_preEmption.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_preEmption.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_preEmption.c -il /tmp/nvc++IwjeM0Q9XRsg.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++IwjeM0Q9XRsg.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_preEmption.obj' -asm /tmp/nvc++cwjegKmftH3v.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++cwjegKmftH3v.ll -S -o /tmp/nvc++swje2eX3WybW.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++swje2eX3WybW.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_preEmption.obj
Unlinking /tmp/nvc++IwjeM0Q9XRsg.il
Unlinking /tmp/nvc++YwjewzbKibwf.s
Unlinking /tmp/nvc++cwjegKmftH3v.ll
Unlinking /tmp/nvc++swje2eX3WybW.llvm
compiling src/tidl_reduce.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_reduce.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++LEjeVCJeoSiB.il src/tidl_reduce.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_reduce.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_reduce.c -il /tmp/nvc++LEjeVCJeoSiB.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++LEjeVCJeoSiB.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_reduce.obj' -asm /tmp/nvc++1EjeFx3LD3eR.ll
TIDL_reduceProcessNew(TIDL_NetworkCommonParams*, sTIDL_AlgLayer_t*, sTIDL_Layer_t*, void**, void**, int):
    361, Generating copyout(outLastLinePtr[:algLayer->scratchSize-1]) [if not already present]
         Generating implicit copyin(algLayer) [if not already present]
         Generating NVIDIA GPU code
        365, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    372, Generating copyout(outLastLinePtr[:algLayer->scratchSize-1]) [if not already present]
         Generating implicit copyin(algLayer) [if not already present]
         Generating NVIDIA GPU code
        376, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
int _INTERNAL_17_src_tidl_reduce_c_46bfe33c::TIDL_refReduceCore<signed char, signed char>(signed char*, signed char*, TIDL_Obj*, int, sTIDL_ReduceParams_t*, sTIDL_AlgLayer_t*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*):
    173, Generating copyin(inData[:numCols+((inPitch*(numRows-1))+((inChPitch*(numInChannels-1))+((numTotRoi-1)*(inChPitch*numInChannels))))]) [if not already present]
         Generating implicit firstprivate(numInChannels,numCols,numTotRoi,numRows)
         Generating NVIDIA GPU code
        173, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        175,   /* blockIdx.x threadIdx.x collapsed */
        177,   /* blockIdx.x threadIdx.x collapsed */
        183, #pragma acc loop seq
        194, #pragma acc loop seq
    173, Generating implicit copyin(params) [if not already present]
         Generating copyout(outData[:numCols+(((numInChannels-1)*outPitch)+((numTotRoi-1)*(numRows*outPitch)))]) [if not already present]
    177, Generating implicit firstprivate(outPitch,targetVal,inChPitch)
    183, Scalar last value needed after loop for targetVal at line 202,196
         Generating implicit firstprivate(inPitch)
         Scalar last value needed after loop for targetVal at line 202
    194, Scalar last value needed after loop for targetVal at line 202
int _INTERNAL_17_src_tidl_reduce_c_46bfe33c::TIDL_refReduceCore<unsigned char, unsigned char>(unsigned char*, unsigned char*, TIDL_Obj*, int, sTIDL_ReduceParams_t*, sTIDL_AlgLayer_t*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*):
    173, Generating copyin(inData[:numCols+((inPitch*(numRows-1))+((inChPitch*(numInChannels-1))+((numTotRoi-1)*(inChPitch*numInChannels))))]) [if not already present]
         Generating implicit firstprivate(numInChannels,numCols,numTotRoi,numRows)
         Generating NVIDIA GPU code
        173, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        175,   /* blockIdx.x threadIdx.x collapsed */
        177,   /* blockIdx.x threadIdx.x collapsed */
        183, #pragma acc loop seq
        194, #pragma acc loop seq
    173, Generating implicit copyin(params) [if not already present]
         Generating copyout(outData[:numCols+(((numInChannels-1)*outPitch)+((numTotRoi-1)*(numRows*outPitch)))]) [if not already present]
    177, Generating implicit firstprivate(outPitch,targetVal,inChPitch)
    183, Scalar last value needed after loop for targetVal at line 202,196
         Generating implicit firstprivate(inPitch)
         Scalar last value needed after loop for targetVal at line 202
    194, Scalar last value needed after loop for targetVal at line 202
int _INTERNAL_17_src_tidl_reduce_c_46bfe33c::TIDL_refReduceCore<short, short>(short*, short*, TIDL_Obj*, int, sTIDL_ReduceParams_t*, sTIDL_AlgLayer_t*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*):
    173, Generating copyin(inData[:numCols+((inPitch*(numRows-1))+((inChPitch*(numInChannels-1))+((numTotRoi-1)*(inChPitch*numInChannels))))]) [if not already present]
         Generating implicit firstprivate(numInChannels,numCols,numTotRoi,numRows)
         Generating NVIDIA GPU code
        173, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        175,   /* blockIdx.x threadIdx.x collapsed */
        177,   /* blockIdx.x threadIdx.x collapsed */
        183, #pragma acc loop seq
        194, #pragma acc loop seq
    173, Generating implicit copyin(params) [if not already present]
         Generating copyout(outData[:numCols+(((numInChannels-1)*outPitch)+((numTotRoi-1)*(numRows*outPitch)))]) [if not already present]
    177, Generating implicit firstprivate(outPitch,targetVal,inChPitch)
    183, Scalar last value needed after loop for targetVal at line 202,196
         Generating implicit firstprivate(inPitch)
         Scalar last value needed after loop for targetVal at line 202
    194, Scalar last value needed after loop for targetVal at line 202
int _INTERNAL_17_src_tidl_reduce_c_46bfe33c::TIDL_refReduceCore<unsigned short, unsigned short>(unsigned short*, unsigned short*, TIDL_Obj*, int, sTIDL_ReduceParams_t*, sTIDL_AlgLayer_t*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*):
    173, Generating copyin(inData[:numCols+((inPitch*(numRows-1))+((inChPitch*(numInChannels-1))+((numTotRoi-1)*(inChPitch*numInChannels))))]) [if not already present]
         Generating implicit firstprivate(numInChannels,numCols,numTotRoi,numRows)
         Generating NVIDIA GPU code
        173, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        175,   /* blockIdx.x threadIdx.x collapsed */
        177,   /* blockIdx.x threadIdx.x collapsed */
        183, #pragma acc loop seq
        194, #pragma acc loop seq
    173, Generating implicit copyin(params) [if not already present]
         Generating copyout(outData[:numCols+(((numInChannels-1)*outPitch)+((numTotRoi-1)*(numRows*outPitch)))]) [if not already present]
    177, Generating implicit firstprivate(outPitch,targetVal,inChPitch)
    183, Scalar last value needed after loop for targetVal at line 202,196
         Generating implicit firstprivate(inPitch)
         Scalar last value needed after loop for targetVal at line 202
    194, Scalar last value needed after loop for targetVal at line 202
int _INTERNAL_17_src_tidl_reduce_c_46bfe33c::TIDL_refReduceCore<float, float>(float*, float*, TIDL_Obj*, int, sTIDL_ReduceParams_t*, sTIDL_AlgLayer_t*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*):
    173, Generating copyin(inData[:numCols+((inPitch*(numRows-1))+((inChPitch*(numInChannels-1))+((numTotRoi-1)*(inChPitch*numInChannels))))]) [if not already present]
         Generating implicit firstprivate(numInChannels,numCols,numTotRoi,numRows)
         Generating NVIDIA GPU code
        173, #pragma acc loop gang collapse(3) /* blockIdx.x */
        175,   /* blockIdx.x collapsed */
        177,   /* blockIdx.x collapsed */
        183, #pragma acc loop vector(128) /* threadIdx.x */
             Generating implicit reduction(min:targetVal)
        194, #pragma acc loop vector(128) /* threadIdx.x */
             Generating implicit reduction(max:targetVal)
    173, Generating implicit copyin(params) [if not already present]
         Generating copyout(outData[:numCols+(((numInChannels-1)*outPitch)+((numTotRoi-1)*(numRows*outPitch)))]) [if not already present]
    177, Generating implicit firstprivate(outPitch,targetVal,inChPitch)
    183, Loop is parallelizable
         Generating implicit firstprivate(inPitch)
    194, Loop is parallelizable
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvaccQHje_ctQ5izU.gpu -computecap 86 -ptx /tmp/nvacckHjeEWgSzr0l.ptx -o /tmp/nvaccAHjeo085QCwX.bin -ftz -cuda12020
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_reduce.c -sm 86 /tmp/nvaccAHjeo085QCwX.bin -compute 86 /tmp/nvacckHjeEWgSzr0l.ptx -o /tmp/nvaccQHje_vJEBak7.fat
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++1EjeFx3LD3eR.ll -S -o /tmp/nvc++DEjextsJxG0D.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++DEjextsJxG0D.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_reduce.obj
Unlinking /tmp/nvc++LEjeVCJeoSiB.il
Unlinking /tmp/nvc++nEjeNcFV-2F6.s
Unlinking /tmp/nvc++1EjeFx3LD3eR.ll
Unlinking /tmp/nvc++DEjextsJxG0D.llvm
compiling src/tidl_reshape.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_reshape.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++NRje1GVACMhE.il src/tidl_reshape.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_reshape.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_reshape.c -il /tmp/nvc++NRje1GVACMhE.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++NRje1GVACMhE.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_reshape.obj' -asm /tmp/nvc++xRjefJinpYWB.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++xRjefJinpYWB.ll -S -o /tmp/nvc++pRjeT7SDvxox.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++pRjeT7SDvxox.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_reshape.obj
Unlinking /tmp/nvc++NRje1GVACMhE.il
Unlinking /tmp/nvc++FRjeDNlaDTR0.s
Unlinking /tmp/nvc++xRjefJinpYWB.ll
Unlinking /tmp/nvc++pRjeT7SDvxox.llvm
compiling src/tidl_resize.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_resize.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++PZje7yYPYAFo.il src/tidl_resize.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_resize.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_resize.c -il /tmp/nvc++PZje7yYPYAFo.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++PZje7yYPYAFo.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_resize.obj' -asm /tmp/nvc++5ZjeRIXXb-Cc.ll
TIDL_resizeProcessSP(sTIDL_Layer_t*, void**, void*, sTIDL_DataParams_t**, sTIDL_DataParams_t*, unsigned char*):
    422, Generating copyout(out[:(((numInChannels-1)*outChPitch)+((numBatches-1)*outBatchPitch))+(outWidth+((outHeight-1)*outPitch))]) [if not already present]
         Generating copyin(in[:((inChPitch*(numInChannels-1))+(inBatchPitch*(numBatches-1)))+(inWidth+(inPitch*(inHeight-1)))]) [if not already present]
    433, Generating implicit firstprivate(numBatches,outWidth,outHeight,numInChannels)
         Generating NVIDIA GPU code
        433, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        435,   /* blockIdx.x threadIdx.x collapsed */
        437,   /* blockIdx.x threadIdx.x collapsed */
        439,   /* blockIdx.x threadIdx.x collapsed */
    439, Generating implicit firstprivate(hLoc,hIdx,inChPitch,wLoc,wRatio,wIdx,outPitch,inPitch,inOffset,inHeight,outOffset,outChPitch,inBatchPitch,hRatio,outBatchPitch,inWidth)
    472, Generating implicit firstprivate(numBatches,outWidth,outHeight,numInChannels)
         Generating NVIDIA GPU code
        472, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        474,   /* blockIdx.x threadIdx.x collapsed */
        476,   /* blockIdx.x threadIdx.x collapsed */
        478,   /* blockIdx.x threadIdx.x collapsed */
    478, Generating implicit firstprivate(hNext,hIdx,inChPitch,hLoc,w10,w01,wNext,wRatio,wIdx,w11,w00,outPitch,inHeight,inOffset,outChPitch,outOffset,inPitch,wLoc,hRatio,inBatchPitch,outBatchPitch,inWidth)
void TIDL_refResize<signed char>(signed char*, signed char*, sTIDL_AlgLayer_t*, sTIDL_Layer_t*, sTIDL_ResizeLayerParams_t*, sTIDL_DataParams_t*, TIDL_CreateParams const*):
    205, Generating copyin(pIn[:((inChPitch*(numInChannels-1))+(inBatchPitch*(numBatches-1)))+(inWidth+((inHeight-1)*inPitch))]) [if not already present]
         Generating copy(pOut[:(((numInChannels-1)*outChPitch)+((numBatches-1)*outBatchPitch))+(outWidth+((outHeight-1)*outPitch))]) [if not already present]
    216, Generating implicit firstprivate(outWidth,outHeight,numInChannels,numBatches)
         Generating NVIDIA GPU code
        216, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        218,   /* blockIdx.x threadIdx.x collapsed */
        220,   /* blockIdx.x threadIdx.x collapsed */
        222,   /* blockIdx.x threadIdx.x collapsed */
    222, Generating implicit firstprivate(hLoc,hIdx,inChPitch,wLoc,wRatio,wIdx,outPitch,inPitch,inHeight,outChPitch,inBatchPitch,hRatio,outBatchPitch,inWidth)
    265, Generating implicit firstprivate(outWidth,outHeight,numInChannels,numBatches)
         Generating NVIDIA GPU code
        265, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        267,   /* blockIdx.x threadIdx.x collapsed */
        269,   /* blockIdx.x threadIdx.x collapsed */
        271,   /* blockIdx.x threadIdx.x collapsed */
    265, Generating implicit copyin(params) [if not already present]
    271, Generating implicit firstprivate(hNext,hIdx,enableHClip,hLoc,w10,w01,wNext,wIdx,w11,w00,outPitch,inChPitch,outChPitch,inPitch,wLoc,inBatchPitch,heightResizeRatio,hRatio,widthResizeRatio,wRatio,outBatchPitch,inWidth)
void TIDL_refResize<unsigned char>(unsigned char*, unsigned char*, sTIDL_AlgLayer_t*, sTIDL_Layer_t*, sTIDL_ResizeLayerParams_t*, sTIDL_DataParams_t*, TIDL_CreateParams const*):
    205, Generating copyin(pIn[:((inChPitch*(numInChannels-1))+(inBatchPitch*(numBatches-1)))+(inWidth+((inHeight-1)*inPitch))]) [if not already present]
         Generating copy(pOut[:(((numInChannels-1)*outChPitch)+((numBatches-1)*outBatchPitch))+(outWidth+((outHeight-1)*outPitch))]) [if not already present]
    216, Generating implicit firstprivate(outWidth,outHeight,numInChannels,numBatches)
         Generating NVIDIA GPU code
        216, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        218,   /* blockIdx.x threadIdx.x collapsed */
        220,   /* blockIdx.x threadIdx.x collapsed */
        222,   /* blockIdx.x threadIdx.x collapsed */
    222, Generating implicit firstprivate(hLoc,hIdx,inChPitch,wLoc,wRatio,wIdx,outPitch,inPitch,inHeight,outChPitch,inBatchPitch,hRatio,outBatchPitch,inWidth)
    265, Generating implicit firstprivate(outWidth,outHeight,numInChannels,numBatches)
         Generating NVIDIA GPU code
        265, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        267,   /* blockIdx.x threadIdx.x collapsed */
        269,   /* blockIdx.x threadIdx.x collapsed */
        271,   /* blockIdx.x threadIdx.x collapsed */
    265, Generating implicit copyin(params) [if not already present]
    271, Generating implicit firstprivate(hNext,hIdx,enableHClip,hLoc,w10,w01,wNext,wIdx,w11,w00,outPitch,inChPitch,outChPitch,inPitch,wLoc,inBatchPitch,heightResizeRatio,hRatio,widthResizeRatio,wRatio,outBatchPitch,inWidth)
void TIDL_refResize<short>(short*, short*, sTIDL_AlgLayer_t*, sTIDL_Layer_t*, sTIDL_ResizeLayerParams_t*, sTIDL_DataParams_t*, TIDL_CreateParams const*):
    205, Generating copyin(pIn[:((inChPitch*(numInChannels-1))+(inBatchPitch*(numBatches-1)))+(inWidth+((inHeight-1)*inPitch))]) [if not already present]
         Generating copy(pOut[:(((numInChannels-1)*outChPitch)+((numBatches-1)*outBatchPitch))+(outWidth+((outHeight-1)*outPitch))]) [if not already present]
    216, Generating implicit firstprivate(outWidth,outHeight,numInChannels,numBatches)
         Generating NVIDIA GPU code
        216, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        218,   /* blockIdx.x threadIdx.x collapsed */
        220,   /* blockIdx.x threadIdx.x collapsed */
        222,   /* blockIdx.x threadIdx.x collapsed */
    222, Generating implicit firstprivate(hLoc,hIdx,inChPitch,wLoc,wRatio,wIdx,outPitch,inPitch,inHeight,outChPitch,inBatchPitch,hRatio,outBatchPitch,inWidth)
    265, Generating implicit firstprivate(outWidth,outHeight,numInChannels,numBatches)
         Generating NVIDIA GPU code
        265, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        267,   /* blockIdx.x threadIdx.x collapsed */
        269,   /* blockIdx.x threadIdx.x collapsed */
        271,   /* blockIdx.x threadIdx.x collapsed */
    265, Generating implicit copyin(params) [if not already present]
    271, Generating implicit firstprivate(hNext,hIdx,enableHClip,hLoc,w10,w01,wNext,wIdx,w11,w00,outPitch,inChPitch,outChPitch,inPitch,wLoc,heightResizeRatio,inBatchPitch,hRatio,widthResizeRatio,wRatio,outBatchPitch,inWidth)
void TIDL_refResize<unsigned short>(unsigned short*, unsigned short*, sTIDL_AlgLayer_t*, sTIDL_Layer_t*, sTIDL_ResizeLayerParams_t*, sTIDL_DataParams_t*, TIDL_CreateParams const*):
    205, Generating copyin(pIn[:((inChPitch*(numInChannels-1))+(inBatchPitch*(numBatches-1)))+(inWidth+((inHeight-1)*inPitch))]) [if not already present]
         Generating copy(pOut[:(((numInChannels-1)*outChPitch)+((numBatches-1)*outBatchPitch))+(outWidth+((outHeight-1)*outPitch))]) [if not already present]
    216, Generating implicit firstprivate(outWidth,outHeight,numInChannels,numBatches)
         Generating NVIDIA GPU code
        216, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        218,   /* blockIdx.x threadIdx.x collapsed */
        220,   /* blockIdx.x threadIdx.x collapsed */
        222,   /* blockIdx.x threadIdx.x collapsed */
    222, Generating implicit firstprivate(hLoc,hIdx,inChPitch,wLoc,wRatio,wIdx,outPitch,inPitch,inHeight,outChPitch,inBatchPitch,hRatio,outBatchPitch,inWidth)
    265, Generating implicit firstprivate(outWidth,outHeight,numInChannels,numBatches)
         Generating NVIDIA GPU code
        265, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        267,   /* blockIdx.x threadIdx.x collapsed */
        269,   /* blockIdx.x threadIdx.x collapsed */
        271,   /* blockIdx.x threadIdx.x collapsed */
    265, Generating implicit copyin(params) [if not already present]
    271, Generating implicit firstprivate(hNext,hIdx,enableHClip,hLoc,w10,w01,wNext,wIdx,w11,w00,outPitch,inChPitch,outChPitch,inPitch,wLoc,heightResizeRatio,inBatchPitch,hRatio,widthResizeRatio,wRatio,outBatchPitch,inWidth)
std::floor(float):
     71, include "math.h"
          15, include "math.h"
               36, include "cmath"
                    15, include "cmath"
                        261, Generating implicit acc routine seq
                             Generating acc routine seq
                             Generating NVIDIA GPU code
_INTERNAL_17_src_tidl_resize_c_33050338::TIDL_refResizeProcess(TIDL_CreateParams const*, sTIDL_AlgLayer_t*, sTIDL_Layer_t*, void**, void**, int):
    582, Generating copy(inPtrOrig[:resizeInWidthBytes+((inPitchBytes*(inputHeight+1))+((inElmtSize*leftPadResize)+((resizeInChPitchBytes*(resizeNumChannels-1))+(inBatchPitch*(numBatches-1)))))]) [if not already present]
         Generating implicit firstprivate(numBatches,resizeNumChannels,resizeInWidthBytes)
         Generating NVIDIA GPU code
        587, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        589,   /* blockIdx.x threadIdx.x collapsed */
        591,   /* blockIdx.x threadIdx.x collapsed */
    591, Generating implicit firstprivate(copyBottomLine,inBatchPitch,inElmtSize,inputHeight,resizeInChPitchBytes,leftPadResize,copyTopLine,inPitch,inPitchBytes)
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvaccT2jehZBMwzdL.gpu -computecap 86 -ptx /tmp/nvacc92je3RlMMdtW.ptx -o /tmp/nvaccL2jeVkjknGlk.bin -ftz -cuda12020
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_resize.c -sm 86 /tmp/nvaccL2jeVkjknGlk.bin -compute 86 /tmp/nvacc92je3RlMMdtW.ptx -o /tmp/nvaccn2jeNGYHuGhV.fat
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++5ZjeRIXXb-Cc.ll -S -o /tmp/nvc++bZjed4DaBysa.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++bZjed4DaBysa.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_resize.obj
Unlinking /tmp/nvc++PZje7yYPYAFo.il
Unlinking /tmp/nvc++XZjetGLqZ340.s
Unlinking /tmp/nvc++5ZjeRIXXb-Cc.ll
Unlinking /tmp/nvc++bZjed4DaBysa.llvm
compiling src/tidl_roiPooling.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_roiPooling.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++RakebfOcarEz.il src/tidl_roiPooling.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_roiPooling.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_roiPooling.c -il /tmp/nvc++RakebfOcarEz.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++RakebfOcarEz.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_roiPooling.obj' -asm /tmp/nvc++BakerwUVYxnY.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++BakerwUVYxnY.ll -S -o /tmp/nvc++ZakezibhzNkA.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++ZakezibhzNkA.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_roiPooling.obj
Unlinking /tmp/nvc++RakebfOcarEz.il
Unlinking /tmp/nvc++dakejTi5rs88.s
Unlinking /tmp/nvc++BakerwUVYxnY.ll
Unlinking /tmp/nvc++ZakezibhzNkA.llvm
compiling src/tidl_scatterElements.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_scatterElements.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++ZikezUpqLNVx.il src/tidl_scatterElements.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_scatterElements.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_scatterElements.c -il /tmp/nvc++ZikezUpqLNVx.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++ZikezUpqLNVx.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_scatterElements.obj' -asm /tmp/nvc++JikePT5gjRTV.ll
void TIDL_refScatterElementsOutputUpdation<signed char, short>(short*, signed char*, int, int, int, short, short):
    119, Generating implicit acc routine seq
         Generating acc routine seq
         Generating NVIDIA GPU code
void TIDL_refScatterElementsOutputUpdation<signed char, signed char>(signed char*, signed char*, int, int, int, signed char, signed char):
    119, Generating implicit acc routine seq
         Generating acc routine seq
         Generating NVIDIA GPU code
void TIDL_refScatterElementsOutputUpdation<unsigned char, unsigned short>(unsigned short*, unsigned char*, int, int, int, unsigned short, unsigned short):
    119, Generating implicit acc routine seq
         Generating acc routine seq
         Generating NVIDIA GPU code
void TIDL_refScatterElementsOutputUpdation<unsigned char, unsigned char>(unsigned char*, unsigned char*, int, int, int, unsigned char, unsigned char):
    119, Generating implicit acc routine seq
         Generating acc routine seq
         Generating NVIDIA GPU code
void TIDL_refScatterElementsOutputUpdation<short, int>(int*, short*, int, int, int, int, int):
    119, Generating implicit acc routine seq
         Generating acc routine seq
         Generating NVIDIA GPU code
void TIDL_refScatterElementsOutputUpdation<short, short>(short*, short*, int, int, int, short, short):
    119, Generating implicit acc routine seq
         Generating acc routine seq
         Generating NVIDIA GPU code
void TIDL_refScatterElementsOutputUpdation<unsigned short, unsigned int>(unsigned int*, unsigned short*, int, int, int, unsigned int, unsigned int):
    119, Generating implicit acc routine seq
         Generating acc routine seq
         Generating NVIDIA GPU code
void TIDL_refScatterElementsOutputUpdation<unsigned short, unsigned short>(unsigned short*, unsigned short*, int, int, int, unsigned short, unsigned short):
    119, Generating implicit acc routine seq
         Generating acc routine seq
         Generating NVIDIA GPU code
void TIDL_refScatterElementsOutputUpdation<float, float>(float*, float*, int, int, int, float, float):
    119, Generating implicit acc routine seq
         Generating acc routine seq
         Generating NVIDIA GPU code
int _INTERNAL_26_src_tidl_scatterElements_c_9c1616d6::TIDL_refScatterElements<signed char, int, signed char, short>(signed char*, int*, signed char*, signed char*, short*, TIDL_Obj*, int, sTIDL_ScatterElementsParams_t*, sTIDL_AlgLayer_t*, sTIDL_Layer_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*):
    250, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,h,minValueAcc,c,w,targetIndex,n,outChPitch,outNumCols,outChs,outNumRows,outBatches,outPitch,outRoiPitch,maxValueAcc,inIndicesNumRows)
         Generating NVIDIA GPU code
        250, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    250, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+4]) [if not already present]
         Generating implicit copyin(params,update) [if not already present]
    275, Generating copyout(accPtr[:cpSize]) [if not already present]
         Generating copyin(data[:cpSize]) [if not already present]
         Generating implicit firstprivate(cpSize)
         Generating NVIDIA GPU code
        279, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    285, Generating copyin(data[:cpSize]) [if not already present]
         Generating copyout(accPtr[:cpSize]) [if not already present]
         Generating implicit firstprivate(cpSize)
         Generating NVIDIA GPU code
        288, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    297, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,h,indDim,dataDim,n,outChPitch,outChs,outNumRows,outBatches,outPitch,outRoiPitch,updateIndex,targetIndex)
         Generating NVIDIA GPU code
        297, #pragma acc loop gang /* blockIdx.x */
        328, #pragma acc loop vector(128) /* threadIdx.x */
    297, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+3]) [if not already present]
         Generating implicit copyin(update,params,outDataParams) [if not already present]
    328, Loop is parallelizable
         Generating implicit firstprivate(minValueAcc,maxValueAcc)
    341, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,n,outChPitch,outBatches,outChs,outRoiPitch,updateIndex,targetIndex)
         Generating NVIDIA GPU code
        341, #pragma acc loop gang /* blockIdx.x */
        356, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
        358,   /* threadIdx.x collapsed */
    341, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+2]) [if not already present]
         Generating implicit copyin(update,params,outDataParams) [if not already present]
    356, Loop is parallelizable
    358, Loop is parallelizable
         Generating implicit firstprivate(minValueAcc,maxValueAcc)
    369, Generating copyin(accPtr[:outSize]) [if not already present]
         Generating copyout(output[:outSize]) [if not already present]
    381, Generating implicit firstprivate(outSize)
         Generating NVIDIA GPU code
        385, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    381, Generating implicit copyin(tidlLayer,outDataParams) [if not already present]
    385, Generating implicit firstprivate(minValueOutput,shift,outAcc,maxValueOutput,cScale)
    403, Generating implicit firstprivate(outSize)
         Generating NVIDIA GPU code
        405, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    429, Generating copyout(data[:inIndicesNumCols+((inDataPitch*(inIndicesNumRows-1))+(((numInChannels-1)*inDataChPitch)+(numOutChannels*((numTotRoi-1)*inDataChPitch))))]) [if not already present]
         Generating copyin(update[:inIndicesNumCols+((inUpdatePitch*(inIndicesNumRows-1))+((inUpdateChPitch*(numInChannels-1))+(numInChannels*((numTotRoi-1)*inUpdateChPitch))))],indices[:inIndicesNumCols+((inIndicesPitch*(inIndicesNumRows-1))+(((inIndicesNumRows-1)*inIndicesChPitch)+(numInChannels*(inIndicesChPitch*(numTotRoi-1)))))]) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,inIndicesNumRows,numTotRoi,numInChannels)
         Generating NVIDIA GPU code
        429, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        431,   /* blockIdx.x threadIdx.x collapsed */
        433,   /* blockIdx.x threadIdx.x collapsed */
        435,   /* blockIdx.x threadIdx.x collapsed */
    435, Generating implicit firstprivate(inIndicesChPitch,inDataPitch,inUpdatePitch,inUpdateChPitch,inIndicesPitch,updateVal,status,inDataChPitch,axis,index,numOutChannels)
    483, Generating copyout(output[:(outPitch*(outNumRows-1))+((outChPitch*(numOutChannels-1))+(numOutChannels*(outChPitch*(numTotRoi-1))))+1]) [if not already present]
         Generating copyin(data[:(outNumCols*(outNumRows-1))+((outNumRows*(outNumCols*(numOutChannels-1)))+(numOutChannels*(outNumRows*(outNumCols*(numTotRoi-1)))))+1]) [if not already present]
         Generating implicit firstprivate(numTotRoi,numOutChannels,outNumRows)
         Generating NVIDIA GPU code
        483, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        485,   /* blockIdx.x threadIdx.x collapsed */
        487,   /* blockIdx.x threadIdx.x collapsed */
    487, Generating implicit firstprivate(outPitch,outNumCols,outChPitch)
int _INTERNAL_26_src_tidl_scatterElements_c_9c1616d6::TIDL_refScatterElements<signed char, int, signed char, signed char>(signed char*, int*, signed char*, signed char*, signed char*, TIDL_Obj*, int, sTIDL_ScatterElementsParams_t*, sTIDL_AlgLayer_t*, sTIDL_Layer_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*):
    250, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,h,minValueAcc,c,targetIndex,w,n,outChPitch,outNumCols,outChs,outNumRows,outBatches,outPitch,outRoiPitch,maxValueAcc,inIndicesNumRows)
         Generating NVIDIA GPU code
        250, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    250, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+4]) [if not already present]
         Generating implicit copyin(params,update) [if not already present]
    275, Generating copyin(data[:cpSize]) [if not already present]
         Generating copyout(accPtr[:cpSize]) [if not already present]
         Generating implicit firstprivate(cpSize)
         Generating NVIDIA GPU code
        279, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    285, Generating copyin(data[:cpSize]) [if not already present]
         Generating copyout(accPtr[:cpSize]) [if not already present]
         Generating implicit firstprivate(cpSize)
         Generating NVIDIA GPU code
        288, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    297, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,h,indDim,dataDim,n,outChPitch,outChs,outNumRows,outBatches,outPitch,outRoiPitch,updateIndex,targetIndex)
         Generating NVIDIA GPU code
        297, #pragma acc loop gang /* blockIdx.x */
        328, #pragma acc loop vector(128) /* threadIdx.x */
    297, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+3]) [if not already present]
         Generating implicit copyin(update,params,outDataParams) [if not already present]
    328, Loop is parallelizable
         Generating implicit firstprivate(maxValueAcc,minValueAcc)
    341, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,n,outChPitch,outBatches,outChs,outRoiPitch,updateIndex,targetIndex)
         Generating NVIDIA GPU code
        341, #pragma acc loop gang /* blockIdx.x */
        356, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
        358,   /* threadIdx.x collapsed */
    341, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+2]) [if not already present]
         Generating implicit copyin(update,params,outDataParams) [if not already present]
    356, Loop is parallelizable
    358, Loop is parallelizable
         Generating implicit firstprivate(minValueAcc,maxValueAcc)
    369, Generating copyin(accPtr[:outSize]) [if not already present]
         Generating copyout(output[:outSize]) [if not already present]
    381, Generating implicit firstprivate(outSize)
         Generating NVIDIA GPU code
        385, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    381, Generating implicit copyin(tidlLayer,outDataParams) [if not already present]
    385, Generating implicit firstprivate(minValueOutput,shift,outAcc,maxValueOutput,cScale)
    403, Generating implicit firstprivate(outSize)
         Generating NVIDIA GPU code
        405, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    429, Generating copyout(data[:inIndicesNumCols+((inDataPitch*(inIndicesNumRows-1))+(((numInChannels-1)*inDataChPitch)+(numOutChannels*((numTotRoi-1)*inDataChPitch))))]) [if not already present]
         Generating copyin(update[:inIndicesNumCols+((inUpdatePitch*(inIndicesNumRows-1))+((inUpdateChPitch*(numInChannels-1))+(numInChannels*((numTotRoi-1)*inUpdateChPitch))))],indices[:inIndicesNumCols+((inIndicesPitch*(inIndicesNumRows-1))+(((inIndicesNumRows-1)*inIndicesChPitch)+(numInChannels*(inIndicesChPitch*(numTotRoi-1)))))]) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,inIndicesNumRows,numTotRoi,numInChannels)
         Generating NVIDIA GPU code
        429, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        431,   /* blockIdx.x threadIdx.x collapsed */
        433,   /* blockIdx.x threadIdx.x collapsed */
        435,   /* blockIdx.x threadIdx.x collapsed */
    435, Generating implicit firstprivate(inIndicesChPitch,inDataPitch,inUpdatePitch,inUpdateChPitch,inIndicesPitch,updateVal,status,inDataChPitch,axis,index,numOutChannels)
    483, Generating copyout(output[:(outPitch*(outNumRows-1))+((outChPitch*(numOutChannels-1))+(numOutChannels*(outChPitch*(numTotRoi-1))))+1]) [if not already present]
         Generating copyin(data[:(outNumCols*(outNumRows-1))+((outNumRows*(outNumCols*(numOutChannels-1)))+(numOutChannels*(outNumRows*(outNumCols*(numTotRoi-1)))))+1]) [if not already present]
         Generating implicit firstprivate(numTotRoi,numOutChannels,outNumRows)
         Generating NVIDIA GPU code
        483, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        485,   /* blockIdx.x threadIdx.x collapsed */
        487,   /* blockIdx.x threadIdx.x collapsed */
    487, Generating implicit firstprivate(outPitch,outNumCols,outChPitch)
int _INTERNAL_26_src_tidl_scatterElements_c_9c1616d6::TIDL_refScatterElements<unsigned char, int, unsigned char, unsigned short>(unsigned char*, int*, unsigned char*, unsigned char*, unsigned short*, TIDL_Obj*, int, sTIDL_ScatterElementsParams_t*, sTIDL_AlgLayer_t*, sTIDL_Layer_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*):
    250, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,h,minValueAcc,c,targetIndex,w,n,outChPitch,outNumCols,outChs,outNumRows,outBatches,outPitch,outRoiPitch,maxValueAcc,inIndicesNumRows)
         Generating NVIDIA GPU code
        250, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    250, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+4]) [if not already present]
         Generating implicit copyin(params,update) [if not already present]
    275, Generating copyout(accPtr[:cpSize]) [if not already present]
         Generating copyin(data[:cpSize]) [if not already present]
         Generating implicit firstprivate(cpSize)
         Generating NVIDIA GPU code
        279, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    285, Generating copyin(data[:cpSize]) [if not already present]
         Generating copyout(accPtr[:cpSize]) [if not already present]
         Generating implicit firstprivate(cpSize)
         Generating NVIDIA GPU code
        288, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    297, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,h,indDim,dataDim,n,outChPitch,outChs,outNumRows,outBatches,outPitch,outRoiPitch,updateIndex,targetIndex)
         Generating NVIDIA GPU code
        297, #pragma acc loop gang /* blockIdx.x */
        328, #pragma acc loop vector(128) /* threadIdx.x */
    297, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+3]) [if not already present]
         Generating implicit copyin(update,params,outDataParams) [if not already present]
    328, Loop is parallelizable
         Generating implicit firstprivate(minValueAcc,maxValueAcc)
    341, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,n,outChPitch,outBatches,outChs,outRoiPitch,updateIndex,targetIndex)
         Generating NVIDIA GPU code
        341, #pragma acc loop gang /* blockIdx.x */
        356, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
        358,   /* threadIdx.x collapsed */
    341, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+2]) [if not already present]
         Generating implicit copyin(update,params,outDataParams) [if not already present]
    356, Loop is parallelizable
    358, Loop is parallelizable
         Generating implicit firstprivate(minValueAcc,maxValueAcc)
    369, Generating copyin(accPtr[:outSize]) [if not already present]
         Generating copyout(output[:outSize]) [if not already present]
    381, Generating implicit firstprivate(outSize)
         Generating NVIDIA GPU code
        385, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    381, Generating implicit copyin(tidlLayer,outDataParams) [if not already present]
    385, Generating implicit firstprivate(minValueOutput,shift,outAcc,maxValueOutput,cScale)
    403, Generating implicit firstprivate(outSize)
         Generating NVIDIA GPU code
        405, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    429, Generating copyout(data[:inIndicesNumCols+((inDataPitch*(inIndicesNumRows-1))+(((numInChannels-1)*inDataChPitch)+(numOutChannels*((numTotRoi-1)*inDataChPitch))))]) [if not already present]
         Generating copyin(update[:inIndicesNumCols+((inUpdatePitch*(inIndicesNumRows-1))+((inUpdateChPitch*(numInChannels-1))+(numInChannels*((numTotRoi-1)*inUpdateChPitch))))],indices[:inIndicesNumCols+((inIndicesPitch*(inIndicesNumRows-1))+(((inIndicesNumRows-1)*inIndicesChPitch)+(numInChannels*(inIndicesChPitch*(numTotRoi-1)))))]) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,inIndicesNumRows,numTotRoi,numInChannels)
         Generating NVIDIA GPU code
        429, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        431,   /* blockIdx.x threadIdx.x collapsed */
        433,   /* blockIdx.x threadIdx.x collapsed */
        435,   /* blockIdx.x threadIdx.x collapsed */
    435, Generating implicit firstprivate(inIndicesChPitch,inDataPitch,inUpdatePitch,inUpdateChPitch,inIndicesPitch,status,updateVal,inDataChPitch,axis,index,numOutChannels)
    483, Generating copyout(output[:(outPitch*(outNumRows-1))+((outChPitch*(numOutChannels-1))+(numOutChannels*(outChPitch*(numTotRoi-1))))+1]) [if not already present]
         Generating copyin(data[:(outNumCols*(outNumRows-1))+((outNumRows*(outNumCols*(numOutChannels-1)))+(numOutChannels*(outNumRows*(outNumCols*(numTotRoi-1)))))+1]) [if not already present]
         Generating implicit firstprivate(numTotRoi,numOutChannels,outNumRows)
         Generating NVIDIA GPU code
        483, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        485,   /* blockIdx.x threadIdx.x collapsed */
        487,   /* blockIdx.x threadIdx.x collapsed */
    487, Generating implicit firstprivate(outPitch,outNumCols,outChPitch)
int _INTERNAL_26_src_tidl_scatterElements_c_9c1616d6::TIDL_refScatterElements<unsigned char, int, unsigned char, unsigned char>(unsigned char*, int*, unsigned char*, unsigned char*, unsigned char*, TIDL_Obj*, int, sTIDL_ScatterElementsParams_t*, sTIDL_AlgLayer_t*, sTIDL_Layer_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*):
    250, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,h,minValueAcc,c,targetIndex,w,n,outChPitch,outNumCols,outChs,outNumRows,outBatches,outPitch,outRoiPitch,maxValueAcc,inIndicesNumRows)
         Generating NVIDIA GPU code
        250, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    250, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+4]) [if not already present]
         Generating implicit copyin(params,update) [if not already present]
    275, Generating copyin(data[:cpSize]) [if not already present]
         Generating copyout(accPtr[:cpSize]) [if not already present]
         Generating implicit firstprivate(cpSize)
         Generating NVIDIA GPU code
        279, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    285, Generating copyout(accPtr[:cpSize]) [if not already present]
         Generating copyin(data[:cpSize]) [if not already present]
         Generating implicit firstprivate(cpSize)
         Generating NVIDIA GPU code
        288, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    297, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,h,indDim,dataDim,n,outChPitch,outChs,outNumRows,outBatches,outPitch,outRoiPitch,updateIndex,targetIndex)
         Generating NVIDIA GPU code
        297, #pragma acc loop gang /* blockIdx.x */
        328, #pragma acc loop vector(128) /* threadIdx.x */
    297, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+3]) [if not already present]
         Generating implicit copyin(update,params,outDataParams) [if not already present]
    328, Loop is parallelizable
         Generating implicit firstprivate(maxValueAcc,minValueAcc)
    341, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,n,outChPitch,outBatches,outChs,outRoiPitch,updateIndex,targetIndex)
         Generating NVIDIA GPU code
        341, #pragma acc loop gang /* blockIdx.x */
        356, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
        358,   /* threadIdx.x collapsed */
    341, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+2]) [if not already present]
         Generating implicit copyin(update,params,outDataParams) [if not already present]
    356, Loop is parallelizable
    358, Loop is parallelizable
         Generating implicit firstprivate(minValueAcc,maxValueAcc)
    369, Generating copyin(accPtr[:outSize]) [if not already present]
         Generating copyout(output[:outSize]) [if not already present]
    381, Generating implicit firstprivate(outSize)
         Generating NVIDIA GPU code
        385, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    381, Generating implicit copyin(tidlLayer,outDataParams) [if not already present]
    385, Generating implicit firstprivate(minValueOutput,shift,outAcc,maxValueOutput,cScale)
    403, Generating implicit firstprivate(outSize)
         Generating NVIDIA GPU code
        405, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    429, Generating copyout(data[:inIndicesNumCols+((inDataPitch*(inIndicesNumRows-1))+(((numInChannels-1)*inDataChPitch)+(numOutChannels*((numTotRoi-1)*inDataChPitch))))]) [if not already present]
         Generating copyin(update[:inIndicesNumCols+((inUpdatePitch*(inIndicesNumRows-1))+((inUpdateChPitch*(numInChannels-1))+(numInChannels*((numTotRoi-1)*inUpdateChPitch))))],indices[:inIndicesNumCols+((inIndicesPitch*(inIndicesNumRows-1))+(((inIndicesNumRows-1)*inIndicesChPitch)+(numInChannels*(inIndicesChPitch*(numTotRoi-1)))))]) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,inIndicesNumRows,numTotRoi,numInChannels)
         Generating NVIDIA GPU code
        429, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        431,   /* blockIdx.x threadIdx.x collapsed */
        433,   /* blockIdx.x threadIdx.x collapsed */
        435,   /* blockIdx.x threadIdx.x collapsed */
    435, Generating implicit firstprivate(inIndicesChPitch,inDataPitch,inUpdatePitch,inUpdateChPitch,inIndicesPitch,updateVal,status,inDataChPitch,axis,index,numOutChannels)
    483, Generating copyout(output[:(outPitch*(outNumRows-1))+((outChPitch*(numOutChannels-1))+(numOutChannels*(outChPitch*(numTotRoi-1))))+1]) [if not already present]
         Generating copyin(data[:(outNumCols*(outNumRows-1))+((outNumRows*(outNumCols*(numOutChannels-1)))+(numOutChannels*(outNumRows*(outNumCols*(numTotRoi-1)))))+1]) [if not already present]
         Generating implicit firstprivate(numTotRoi,numOutChannels,outNumRows)
         Generating NVIDIA GPU code
        483, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        485,   /* blockIdx.x threadIdx.x collapsed */
        487,   /* blockIdx.x threadIdx.x collapsed */
    487, Generating implicit firstprivate(outPitch,outNumCols,outChPitch)
int _INTERNAL_26_src_tidl_scatterElements_c_9c1616d6::TIDL_refScatterElements<short, int, short, int>(short*, int*, short*, short*, int*, TIDL_Obj*, int, sTIDL_ScatterElementsParams_t*, sTIDL_AlgLayer_t*, sTIDL_Layer_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*):
    250, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,h,minValueAcc,c,w,targetIndex,n,outChPitch,outNumCols,outChs,outNumRows,outBatches,outPitch,outRoiPitch,maxValueAcc,inIndicesNumRows)
         Generating NVIDIA GPU code
        250, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    250, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+4]) [if not already present]
         Generating implicit copyin(params,update) [if not already present]
    275, Generating copyout(accPtr[:cpSize]) [if not already present]
         Generating copyin(data[:cpSize]) [if not already present]
         Generating implicit firstprivate(cpSize)
         Generating NVIDIA GPU code
        279, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    285, Generating copyin(data[:cpSize]) [if not already present]
         Generating copyout(accPtr[:cpSize]) [if not already present]
         Generating implicit firstprivate(cpSize)
         Generating NVIDIA GPU code
        288, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    297, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,h,indDim,dataDim,n,outChPitch,outChs,outNumRows,outBatches,outPitch,outRoiPitch,updateIndex,targetIndex)
         Generating NVIDIA GPU code
        297, #pragma acc loop gang /* blockIdx.x */
        328, #pragma acc loop vector(128) /* threadIdx.x */
    297, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+3]) [if not already present]
         Generating implicit copyin(update,params,outDataParams) [if not already present]
    328, Loop is parallelizable
         Generating implicit firstprivate(minValueAcc,maxValueAcc)
    341, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,n,outChPitch,outBatches,outChs,outRoiPitch,updateIndex,targetIndex)
         Generating NVIDIA GPU code
        341, #pragma acc loop gang /* blockIdx.x */
        356, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
        358,   /* threadIdx.x collapsed */
    341, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+2]) [if not already present]
         Generating implicit copyin(update,params,outDataParams) [if not already present]
    356, Loop is parallelizable
    358, Loop is parallelizable
         Generating implicit firstprivate(minValueAcc,maxValueAcc)
    369, Generating copyin(accPtr[:outSize]) [if not already present]
         Generating copyout(output[:outSize]) [if not already present]
    381, Generating implicit firstprivate(outSize)
         Generating NVIDIA GPU code
        385, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    381, Generating implicit copyin(tidlLayer,outDataParams) [if not already present]
    385, Generating implicit firstprivate(minValueOutput,shift,outAcc,maxValueOutput,cScale)
    403, Generating implicit firstprivate(outSize)
         Generating NVIDIA GPU code
        405, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    429, Generating copyout(data[:inIndicesNumCols+((inDataPitch*(inIndicesNumRows-1))+(((numInChannels-1)*inDataChPitch)+(numOutChannels*((numTotRoi-1)*inDataChPitch))))]) [if not already present]
         Generating copyin(update[:inIndicesNumCols+((inUpdatePitch*(inIndicesNumRows-1))+((inUpdateChPitch*(numInChannels-1))+(numInChannels*((numTotRoi-1)*inUpdateChPitch))))],indices[:inIndicesNumCols+((inIndicesPitch*(inIndicesNumRows-1))+(((inIndicesNumRows-1)*inIndicesChPitch)+(numInChannels*(inIndicesChPitch*(numTotRoi-1)))))]) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,inIndicesNumRows,numTotRoi,numInChannels)
         Generating NVIDIA GPU code
        429, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        431,   /* blockIdx.x threadIdx.x collapsed */
        433,   /* blockIdx.x threadIdx.x collapsed */
        435,   /* blockIdx.x threadIdx.x collapsed */
    435, Generating implicit firstprivate(inIndicesChPitch,inDataPitch,inUpdatePitch,inUpdateChPitch,inIndicesPitch,updateVal,status,inDataChPitch,axis,index,numOutChannels)
    483, Generating copyout(output[:(outPitch*(outNumRows-1))+((outChPitch*(numOutChannels-1))+(numOutChannels*(outChPitch*(numTotRoi-1))))+1]) [if not already present]
         Generating copyin(data[:(outNumCols*(outNumRows-1))+((outNumRows*(outNumCols*(numOutChannels-1)))+(numOutChannels*(outNumRows*(outNumCols*(numTotRoi-1)))))+1]) [if not already present]
         Generating implicit firstprivate(numTotRoi,numOutChannels,outNumRows)
         Generating NVIDIA GPU code
        483, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        485,   /* blockIdx.x threadIdx.x collapsed */
        487,   /* blockIdx.x threadIdx.x collapsed */
    487, Generating implicit firstprivate(outPitch,outNumCols,outChPitch)
int _INTERNAL_26_src_tidl_scatterElements_c_9c1616d6::TIDL_refScatterElements<short, int, short, short>(short*, int*, short*, short*, short*, TIDL_Obj*, int, sTIDL_ScatterElementsParams_t*, sTIDL_AlgLayer_t*, sTIDL_Layer_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*):
    250, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,h,minValueAcc,c,w,targetIndex,n,outChPitch,outNumCols,outChs,outNumRows,outBatches,outPitch,outRoiPitch,maxValueAcc,inIndicesNumRows)
         Generating NVIDIA GPU code
        250, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    250, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+4]) [if not already present]
         Generating implicit copyin(params,update) [if not already present]
    275, Generating copyin(data[:cpSize]) [if not already present]
         Generating copyout(accPtr[:cpSize]) [if not already present]
         Generating implicit firstprivate(cpSize)
         Generating NVIDIA GPU code
        279, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    285, Generating copyin(data[:cpSize]) [if not already present]
         Generating copyout(accPtr[:cpSize]) [if not already present]
         Generating implicit firstprivate(cpSize)
         Generating NVIDIA GPU code
        288, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    297, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,h,indDim,dataDim,n,outChPitch,outChs,outNumRows,outBatches,outPitch,outRoiPitch,updateIndex,targetIndex)
         Generating NVIDIA GPU code
        297, #pragma acc loop gang /* blockIdx.x */
        328, #pragma acc loop vector(128) /* threadIdx.x */
    297, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+3]) [if not already present]
         Generating implicit copyin(update,params,outDataParams) [if not already present]
    328, Loop is parallelizable
         Generating implicit firstprivate(minValueAcc,maxValueAcc)
    341, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,n,outChPitch,outBatches,outChs,outRoiPitch,updateIndex,targetIndex)
         Generating NVIDIA GPU code
        341, #pragma acc loop gang /* blockIdx.x */
        356, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
        358,   /* threadIdx.x collapsed */
    341, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+2]) [if not already present]
         Generating implicit copyin(update,params,outDataParams) [if not already present]
    356, Loop is parallelizable
    358, Loop is parallelizable
         Generating implicit firstprivate(minValueAcc,maxValueAcc)
    369, Generating copyin(accPtr[:outSize]) [if not already present]
         Generating copyout(output[:outSize]) [if not already present]
    381, Generating implicit firstprivate(outSize)
         Generating NVIDIA GPU code
        385, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    381, Generating implicit copyin(tidlLayer,outDataParams) [if not already present]
    385, Generating implicit firstprivate(minValueOutput,shift,outAcc,maxValueOutput,cScale)
    403, Generating implicit firstprivate(outSize)
         Generating NVIDIA GPU code
        405, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    429, Generating copyout(data[:inIndicesNumCols+((inDataPitch*(inIndicesNumRows-1))+(((numInChannels-1)*inDataChPitch)+(numOutChannels*((numTotRoi-1)*inDataChPitch))))]) [if not already present]
         Generating copyin(update[:inIndicesNumCols+((inUpdatePitch*(inIndicesNumRows-1))+((inUpdateChPitch*(numInChannels-1))+(numInChannels*((numTotRoi-1)*inUpdateChPitch))))],indices[:inIndicesNumCols+((inIndicesPitch*(inIndicesNumRows-1))+(((inIndicesNumRows-1)*inIndicesChPitch)+(numInChannels*(inIndicesChPitch*(numTotRoi-1)))))]) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,inIndicesNumRows,numTotRoi,numInChannels)
         Generating NVIDIA GPU code
        429, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        431,   /* blockIdx.x threadIdx.x collapsed */
        433,   /* blockIdx.x threadIdx.x collapsed */
        435,   /* blockIdx.x threadIdx.x collapsed */
    435, Generating implicit firstprivate(inIndicesChPitch,inDataPitch,inUpdatePitch,inUpdateChPitch,inIndicesPitch,updateVal,status,inDataChPitch,axis,index,numOutChannels)
    483, Generating copyout(output[:(outPitch*(outNumRows-1))+((outChPitch*(numOutChannels-1))+(numOutChannels*(outChPitch*(numTotRoi-1))))+1]) [if not already present]
         Generating copyin(data[:(outNumCols*(outNumRows-1))+((outNumRows*(outNumCols*(numOutChannels-1)))+(numOutChannels*(outNumRows*(outNumCols*(numTotRoi-1)))))+1]) [if not already present]
         Generating implicit firstprivate(numTotRoi,numOutChannels,outNumRows)
         Generating NVIDIA GPU code
        483, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        485,   /* blockIdx.x threadIdx.x collapsed */
        487,   /* blockIdx.x threadIdx.x collapsed */
    487, Generating implicit firstprivate(outPitch,outNumCols,outChPitch)
int _INTERNAL_26_src_tidl_scatterElements_c_9c1616d6::TIDL_refScatterElements<unsigned short, int, unsigned short, unsigned int>(unsigned short*, int*, unsigned short*, unsigned short*, unsigned int*, TIDL_Obj*, int, sTIDL_ScatterElementsParams_t*, sTIDL_AlgLayer_t*, sTIDL_Layer_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*):
    250, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,h,minValueAcc,c,w,targetIndex,n,outChPitch,outNumCols,outChs,outNumRows,outBatches,outPitch,outRoiPitch,maxValueAcc,inIndicesNumRows)
         Generating NVIDIA GPU code
        250, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    250, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+4]) [if not already present]
         Generating implicit copyin(params,update) [if not already present]
    275, Generating copyin(data[:cpSize]) [if not already present]
         Generating copyout(accPtr[:cpSize]) [if not already present]
         Generating implicit firstprivate(cpSize)
         Generating NVIDIA GPU code
        279, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    285, Generating copyout(accPtr[:cpSize]) [if not already present]
         Generating copyin(data[:cpSize]) [if not already present]
         Generating implicit firstprivate(cpSize)
         Generating NVIDIA GPU code
        288, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    297, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,h,indDim,dataDim,n,outChPitch,outChs,outNumRows,outBatches,outPitch,outRoiPitch,updateIndex,targetIndex)
         Generating NVIDIA GPU code
        297, #pragma acc loop gang /* blockIdx.x */
        328, #pragma acc loop vector(128) /* threadIdx.x */
    297, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+3]) [if not already present]
         Generating implicit copyin(update,params,outDataParams) [if not already present]
    328, Loop is parallelizable
         Generating implicit firstprivate(minValueAcc,maxValueAcc)
    341, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,n,outChPitch,outBatches,outChs,outRoiPitch,updateIndex,targetIndex)
         Generating NVIDIA GPU code
        341, #pragma acc loop gang /* blockIdx.x */
        356, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
        358,   /* threadIdx.x collapsed */
    341, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+2]) [if not already present]
         Generating implicit copyin(update,params,outDataParams) [if not already present]
    356, Loop is parallelizable
    358, Loop is parallelizable
         Generating implicit firstprivate(minValueAcc,maxValueAcc)
    369, Generating copyin(accPtr[:outSize]) [if not already present]
         Generating copyout(output[:outSize]) [if not already present]
    381, Generating implicit firstprivate(outSize)
         Generating NVIDIA GPU code
        385, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    381, Generating implicit copyin(tidlLayer,outDataParams) [if not already present]
    385, Generating implicit firstprivate(minValueOutput,shift,outAcc,maxValueOutput,cScale)
    403, Generating implicit firstprivate(outSize)
         Generating NVIDIA GPU code
        405, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    429, Generating copyout(data[:inIndicesNumCols+((inDataPitch*(inIndicesNumRows-1))+(((numInChannels-1)*inDataChPitch)+(numOutChannels*((numTotRoi-1)*inDataChPitch))))]) [if not already present]
         Generating copyin(update[:inIndicesNumCols+((inUpdatePitch*(inIndicesNumRows-1))+((inUpdateChPitch*(numInChannels-1))+(numInChannels*((numTotRoi-1)*inUpdateChPitch))))],indices[:inIndicesNumCols+((inIndicesPitch*(inIndicesNumRows-1))+(((inIndicesNumRows-1)*inIndicesChPitch)+(numInChannels*(inIndicesChPitch*(numTotRoi-1)))))]) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,inIndicesNumRows,numTotRoi,numInChannels)
         Generating NVIDIA GPU code
        429, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        431,   /* blockIdx.x threadIdx.x collapsed */
        433,   /* blockIdx.x threadIdx.x collapsed */
        435,   /* blockIdx.x threadIdx.x collapsed */
    435, Generating implicit firstprivate(inIndicesChPitch,inDataPitch,inUpdatePitch,inUpdateChPitch,inIndicesPitch,updateVal,status,inDataChPitch,axis,index,numOutChannels)
    483, Generating copyout(output[:(outPitch*(outNumRows-1))+((outChPitch*(numOutChannels-1))+(numOutChannels*(outChPitch*(numTotRoi-1))))+1]) [if not already present]
         Generating copyin(data[:(outNumCols*(outNumRows-1))+((outNumRows*(outNumCols*(numOutChannels-1)))+(numOutChannels*(outNumRows*(outNumCols*(numTotRoi-1)))))+1]) [if not already present]
         Generating implicit firstprivate(numTotRoi,numOutChannels,outNumRows)
         Generating NVIDIA GPU code
        483, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        485,   /* blockIdx.x threadIdx.x collapsed */
        487,   /* blockIdx.x threadIdx.x collapsed */
    487, Generating implicit firstprivate(outPitch,outNumCols,outChPitch)
int _INTERNAL_26_src_tidl_scatterElements_c_9c1616d6::TIDL_refScatterElements<unsigned short, int, unsigned short, unsigned short>(unsigned short*, int*, unsigned short*, unsigned short*, unsigned short*, TIDL_Obj*, int, sTIDL_ScatterElementsParams_t*, sTIDL_AlgLayer_t*, sTIDL_Layer_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*):
    250, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,h,minValueAcc,c,w,targetIndex,n,outChPitch,outNumCols,outChs,outNumRows,outBatches,outPitch,outRoiPitch,maxValueAcc,inIndicesNumRows)
         Generating NVIDIA GPU code
        250, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    250, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+4]) [if not already present]
         Generating implicit copyin(params,update) [if not already present]
    275, Generating copyin(data[:cpSize]) [if not already present]
         Generating copyout(accPtr[:cpSize]) [if not already present]
         Generating implicit firstprivate(cpSize)
         Generating NVIDIA GPU code
        279, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    285, Generating copyin(data[:cpSize]) [if not already present]
         Generating copyout(accPtr[:cpSize]) [if not already present]
         Generating implicit firstprivate(cpSize)
         Generating NVIDIA GPU code
        288, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    297, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,h,indDim,dataDim,n,outChPitch,outChs,outNumRows,outBatches,outPitch,outRoiPitch,updateIndex,targetIndex)
         Generating NVIDIA GPU code
        297, #pragma acc loop gang /* blockIdx.x */
        328, #pragma acc loop vector(128) /* threadIdx.x */
    297, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+3]) [if not already present]
         Generating implicit copyin(update,params,outDataParams) [if not already present]
    328, Loop is parallelizable
         Generating implicit firstprivate(minValueAcc,maxValueAcc)
    341, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,n,outChPitch,outBatches,outChs,outRoiPitch,updateIndex,targetIndex)
         Generating NVIDIA GPU code
        341, #pragma acc loop gang /* blockIdx.x */
        356, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
        358,   /* threadIdx.x collapsed */
    341, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+2]) [if not already present]
         Generating implicit copyin(update,params,outDataParams) [if not already present]
    356, Loop is parallelizable
    358, Loop is parallelizable
         Generating implicit firstprivate(minValueAcc,maxValueAcc)
    369, Generating copyin(accPtr[:outSize]) [if not already present]
         Generating copyout(output[:outSize]) [if not already present]
    381, Generating implicit firstprivate(outSize)
         Generating NVIDIA GPU code
        385, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    381, Generating implicit copyin(tidlLayer,outDataParams) [if not already present]
    385, Generating implicit firstprivate(minValueOutput,shift,outAcc,maxValueOutput,cScale)
    403, Generating implicit firstprivate(outSize)
         Generating NVIDIA GPU code
        405, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    429, Generating copyout(data[:inIndicesNumCols+((inDataPitch*(inIndicesNumRows-1))+(((numInChannels-1)*inDataChPitch)+(numOutChannels*((numTotRoi-1)*inDataChPitch))))]) [if not already present]
         Generating copyin(update[:inIndicesNumCols+((inUpdatePitch*(inIndicesNumRows-1))+((inUpdateChPitch*(numInChannels-1))+(numInChannels*((numTotRoi-1)*inUpdateChPitch))))],indices[:inIndicesNumCols+((inIndicesPitch*(inIndicesNumRows-1))+(((inIndicesNumRows-1)*inIndicesChPitch)+(numInChannels*(inIndicesChPitch*(numTotRoi-1)))))]) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,inIndicesNumRows,numTotRoi,numInChannels)
         Generating NVIDIA GPU code
        429, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        431,   /* blockIdx.x threadIdx.x collapsed */
        433,   /* blockIdx.x threadIdx.x collapsed */
        435,   /* blockIdx.x threadIdx.x collapsed */
    435, Generating implicit firstprivate(inIndicesChPitch,inDataPitch,inUpdatePitch,inUpdateChPitch,inIndicesPitch,updateVal,status,inDataChPitch,axis,index,numOutChannels)
    483, Generating copyout(output[:(outPitch*(outNumRows-1))+((outChPitch*(numOutChannels-1))+(numOutChannels*(outChPitch*(numTotRoi-1))))+1]) [if not already present]
         Generating copyin(data[:(outNumCols*(outNumRows-1))+((outNumRows*(outNumCols*(numOutChannels-1)))+(numOutChannels*(outNumRows*(outNumCols*(numTotRoi-1)))))+1]) [if not already present]
         Generating implicit firstprivate(numTotRoi,numOutChannels,outNumRows)
         Generating NVIDIA GPU code
        483, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        485,   /* blockIdx.x threadIdx.x collapsed */
        487,   /* blockIdx.x threadIdx.x collapsed */
    487, Generating implicit firstprivate(outPitch,outNumCols,outChPitch)
int _INTERNAL_26_src_tidl_scatterElements_c_9c1616d6::TIDL_refScatterElements<float, float, float, float>(float*, float*, float*, float*, float*, TIDL_Obj*, int, sTIDL_ScatterElementsParams_t*, sTIDL_AlgLayer_t*, sTIDL_Layer_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*):
    250, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,h,minValueAcc,c,w,targetIndex,n,outChPitch,outNumCols,outChs,outNumRows,outBatches,outPitch,outRoiPitch,maxValueAcc,inIndicesNumRows)
         Generating NVIDIA GPU code
        250, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    250, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+4]) [if not already present]
         Generating implicit copyin(params,update) [if not already present]
    275, Generating copyin(data[:cpSize]) [if not already present]
         Generating copyout(accPtr[:cpSize]) [if not already present]
         Generating implicit firstprivate(cpSize)
         Generating NVIDIA GPU code
        279, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    285, Generating copyin(data[:cpSize]) [if not already present]
         Generating copyout(accPtr[:cpSize]) [if not already present]
         Generating implicit firstprivate(cpSize)
         Generating NVIDIA GPU code
        288, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    297, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,h,indDim,dataDim,n,outChPitch,outChs,outNumRows,outBatches,outPitch,outRoiPitch,updateIndex,targetIndex)
         Generating NVIDIA GPU code
        297, #pragma acc loop gang /* blockIdx.x */
        328, #pragma acc loop vector(128) /* threadIdx.x */
    297, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+3]) [if not already present]
         Generating implicit copyin(update,params,outDataParams) [if not already present]
    328, Loop is parallelizable
         Generating implicit firstprivate(minValueAcc,maxValueAcc)
    341, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,n,outChPitch,outBatches,outChs,outRoiPitch,updateIndex,targetIndex)
         Generating NVIDIA GPU code
        341, #pragma acc loop gang /* blockIdx.x */
        356, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
        358,   /* threadIdx.x collapsed */
    341, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+2]) [if not already present]
         Generating implicit copyin(update,params,outDataParams) [if not already present]
    356, Loop is parallelizable
    358, Loop is parallelizable
         Generating implicit firstprivate(minValueAcc,maxValueAcc)
    369, Generating copyin(accPtr[:outSize]) [if not already present]
         Generating copyout(output[:outSize]) [if not already present]
    381, Generating implicit firstprivate(outSize)
         Generating NVIDIA GPU code
        385, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    381, Generating implicit copyin(tidlLayer,outDataParams) [if not already present]
    385, Generating implicit firstprivate(minValueOutput,shift,outAcc,maxValueOutput,cScale)
    403, Generating implicit firstprivate(outSize)
         Generating NVIDIA GPU code
        405, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    429, Generating copyout(data[:inIndicesNumCols+((inDataPitch*(inIndicesNumRows-1))+(((numInChannels-1)*inDataChPitch)+(numOutChannels*((numTotRoi-1)*inDataChPitch))))]) [if not already present]
         Generating copyin(update[:inIndicesNumCols+((inUpdatePitch*(inIndicesNumRows-1))+((inUpdateChPitch*(numInChannels-1))+(numInChannels*((numTotRoi-1)*inUpdateChPitch))))],indices[:inIndicesNumCols+((inIndicesPitch*(inIndicesNumRows-1))+(((inIndicesNumRows-1)*inIndicesChPitch)+(numInChannels*(inIndicesChPitch*(numTotRoi-1)))))]) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,inIndicesNumRows,numTotRoi,numInChannels)
         Generating NVIDIA GPU code
        429, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        431,   /* blockIdx.x threadIdx.x collapsed */
        433,   /* blockIdx.x threadIdx.x collapsed */
        435,   /* blockIdx.x threadIdx.x collapsed */
    435, Generating implicit firstprivate(inIndicesChPitch,inDataPitch,inUpdatePitch,inUpdateChPitch,inIndicesPitch,updateVal,status,inDataChPitch,axis,index,numOutChannels)
    483, Generating copyout(output[:(outPitch*(outNumRows-1))+((outChPitch*(numOutChannels-1))+(numOutChannels*(outChPitch*(numTotRoi-1))))+1]) [if not already present]
         Generating copyin(data[:(outNumCols*(outNumRows-1))+((outNumRows*(outNumCols*(numOutChannels-1)))+(numOutChannels*(outNumRows*(outNumCols*(numTotRoi-1)))))+1]) [if not already present]
         Generating implicit firstprivate(numTotRoi,numOutChannels,outNumRows)
         Generating NVIDIA GPU code
        483, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        485,   /* blockIdx.x threadIdx.x collapsed */
        487,   /* blockIdx.x threadIdx.x collapsed */
    487, Generating implicit firstprivate(outPitch,outNumCols,outChPitch)
int _INTERNAL_26_src_tidl_scatterElements_c_9c1616d6::TIDL_refScatterElements<float, int, float, float>(float*, int*, float*, float*, float*, TIDL_Obj*, int, sTIDL_ScatterElementsParams_t*, sTIDL_AlgLayer_t*, sTIDL_Layer_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*):
    250, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,h,minValueAcc,c,w,targetIndex,n,outChPitch,outNumCols,outChs,outNumRows,outBatches,outPitch,outRoiPitch,maxValueAcc,inIndicesNumRows)
         Generating NVIDIA GPU code
        250, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    250, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+4]) [if not already present]
         Generating implicit copyin(params,update) [if not already present]
    275, Generating copyin(data[:cpSize]) [if not already present]
         Generating copyout(accPtr[:cpSize]) [if not already present]
         Generating implicit firstprivate(cpSize)
         Generating NVIDIA GPU code
        279, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    285, Generating copyin(data[:cpSize]) [if not already present]
         Generating copyout(accPtr[:cpSize]) [if not already present]
         Generating implicit firstprivate(cpSize)
         Generating NVIDIA GPU code
        288, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    297, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,h,indDim,dataDim,n,outChPitch,outChs,outNumRows,outBatches,outPitch,outRoiPitch,updateIndex,targetIndex)
         Generating NVIDIA GPU code
        297, #pragma acc loop gang /* blockIdx.x */
        328, #pragma acc loop vector(128) /* threadIdx.x */
    297, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+3]) [if not already present]
         Generating implicit copyin(update,params,outDataParams) [if not already present]
    328, Loop is parallelizable
         Generating implicit firstprivate(minValueAcc,maxValueAcc)
    341, Generating implicit copyin(accPtr) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,n,outChPitch,outBatches,outChs,outRoiPitch,updateIndex,targetIndex)
         Generating NVIDIA GPU code
        341, #pragma acc loop gang /* blockIdx.x */
        356, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */
        358,   /* threadIdx.x collapsed */
    341, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+2]) [if not already present]
         Generating implicit copyin(update,params,outDataParams) [if not already present]
    356, Loop is parallelizable
    358, Loop is parallelizable
         Generating implicit firstprivate(minValueAcc,maxValueAcc)
    369, Generating copyin(accPtr[:outSize]) [if not already present]
         Generating copyout(output[:outSize]) [if not already present]
    381, Generating implicit firstprivate(outSize)
         Generating NVIDIA GPU code
        385, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    381, Generating implicit copyin(tidlLayer,outDataParams) [if not already present]
    385, Generating implicit firstprivate(minValueOutput,shift,outAcc,maxValueOutput,cScale)
    403, Generating implicit firstprivate(outSize)
         Generating NVIDIA GPU code
        405, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
    429, Generating copyout(data[:inIndicesNumCols+((inDataPitch*(inIndicesNumRows-1))+(((numInChannels-1)*inDataChPitch)+(numOutChannels*((numTotRoi-1)*inDataChPitch))))]) [if not already present]
         Generating copyin(update[:inIndicesNumCols+((inUpdatePitch*(inIndicesNumRows-1))+((inUpdateChPitch*(numInChannels-1))+(numInChannels*((numTotRoi-1)*inUpdateChPitch))))],indices[:inIndicesNumCols+((inIndicesPitch*(inIndicesNumRows-1))+(((inIndicesNumRows-1)*inIndicesChPitch)+(numInChannels*(inIndicesChPitch*(numTotRoi-1)))))]) [if not already present]
         Generating implicit firstprivate(inIndicesNumCols,inIndicesNumRows,numTotRoi,numInChannels)
         Generating NVIDIA GPU code
        429, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        431,   /* blockIdx.x threadIdx.x collapsed */
        433,   /* blockIdx.x threadIdx.x collapsed */
        435,   /* blockIdx.x threadIdx.x collapsed */
    435, Generating implicit firstprivate(inIndicesChPitch,inDataPitch,inUpdatePitch,inUpdateChPitch,inIndicesPitch,updateVal,status,inDataChPitch,axis,index,numOutChannels)
    483, Generating copyout(output[:(outPitch*(outNumRows-1))+((outChPitch*(numOutChannels-1))+(numOutChannels*(outChPitch*(numTotRoi-1))))+1]) [if not already present]
         Generating copyin(data[:(outNumCols*(outNumRows-1))+((outNumRows*(outNumCols*(numOutChannels-1)))+(numOutChannels*(outNumRows*(outNumCols*(numTotRoi-1)))))+1]) [if not already present]
         Generating implicit firstprivate(numTotRoi,numOutChannels,outNumRows)
         Generating NVIDIA GPU code
        483, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */
        485,   /* blockIdx.x threadIdx.x collapsed */
        487,   /* blockIdx.x threadIdx.x collapsed */
    487, Generating implicit firstprivate(outPitch,outNumCols,outChPitch)
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvaccvlke-0Mp_EK_.gpu -computecap 86 -ptx /tmp/nvaccLlkeVdjvoll7.ptx -o /tmp/nvaccnlkeNHXK1Kei.bin -ftz -cuda12020
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_scatterElements.c -sm 86 /tmp/nvaccnlkeNHXK1Kei.bin -compute 86 /tmp/nvaccLlkeVdjvoll7.ptx -o /tmp/nvacc1lkeFUDL8kjy.fat
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++JikePT5gjRTV.ll -S -o /tmp/nvc++7ikeXANBtCw_.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++7ikeXANBtCw_.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_scatterElements.obj
Unlinking /tmp/nvc++ZikezUpqLNVx.il
Unlinking /tmp/nvc++likeHHT3Chi_.s
Unlinking /tmp/nvc++JikePT5gjRTV.ll
Unlinking /tmp/nvc++7ikeXANBtCw_.llvm
compiling src/tidl_shuffleChannel.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_shuffleChannel.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++gvkesqjKl2DY.il src/tidl_shuffleChannel.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_shuffleChannel.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_shuffleChannel.c -il /tmp/nvc++gvkesqjKl2DY.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++gvkesqjKl2DY.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_shuffleChannel.obj' -asm /tmp/nvc++MvkeYMTvSPoV.ll
void TIDL_refShuffleChannel<float>(float const*, float*, int, int, int, int, int, int, int, int, int, int, int, int, int):
    123, Generating copy(pOut[:((height-1)*outLinePitch)+((numGroups*((NiPerG-1)*outChPitch))+(((numGroups-1)*outChPitch)+(((numROIs-1)*outROIPitch)+outPtrOffset)))+1]) [if not already present]
         Generating copyin(pIn[:(inLinePitch*(height-1))+((inChPitch*(NiPerG-1))+((NiPerG*(inChPitch*(numGroups-1)))+((inROIPitch*(numROIs-1))+inPtrOffset)))+1]) [if not already present]
         Generating implicit firstprivate(NiPerG,height,numROIs,numGroups)
         Generating NVIDIA GPU code
        123, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        125,   /* blockIdx.x threadIdx.x collapsed */
        127,   /* blockIdx.x threadIdx.x collapsed */
        129,   /* blockIdx.x threadIdx.x collapsed */
    129, Generating implicit firstprivate(inChPitch,elemSize,inLinePitch,inPtrOffset,outChPitch,inROIPitch,outLinePitch,outPtrOffset,width,outROIPitch)
void TIDL_refShuffleChannel<unsigned char>(unsigned char const*, unsigned char*, int, int, int, int, int, int, int, int, int, int, int, int, int):
    123, Generating copy(pOut[:((height-1)*outLinePitch)+((numGroups*((NiPerG-1)*outChPitch))+(((numGroups-1)*outChPitch)+(((numROIs-1)*outROIPitch)+outPtrOffset)))+1]) [if not already present]
         Generating copyin(pIn[:(inLinePitch*(height-1))+((inChPitch*(NiPerG-1))+((NiPerG*(inChPitch*(numGroups-1)))+((inROIPitch*(numROIs-1))+inPtrOffset)))+1]) [if not already present]
         Generating implicit firstprivate(NiPerG,height,numROIs,numGroups)
         Generating NVIDIA GPU code
        123, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        125,   /* blockIdx.x threadIdx.x collapsed */
        127,   /* blockIdx.x threadIdx.x collapsed */
        129,   /* blockIdx.x threadIdx.x collapsed */
    129, Generating implicit firstprivate(inChPitch,elemSize,inLinePitch,inPtrOffset,outChPitch,inROIPitch,outLinePitch,outPtrOffset,width,outROIPitch)
void TIDL_refShuffleChannel<unsigned short>(unsigned short const*, unsigned short*, int, int, int, int, int, int, int, int, int, int, int, int, int):
    123, Generating copy(pOut[:((height-1)*outLinePitch)+((numGroups*((NiPerG-1)*outChPitch))+(((numGroups-1)*outChPitch)+(((numROIs-1)*outROIPitch)+outPtrOffset)))+1]) [if not already present]
         Generating copyin(pIn[:(inLinePitch*(height-1))+((inChPitch*(NiPerG-1))+((NiPerG*(inChPitch*(numGroups-1)))+((inROIPitch*(numROIs-1))+inPtrOffset)))+1]) [if not already present]
         Generating implicit firstprivate(NiPerG,height,numROIs,numGroups)
         Generating NVIDIA GPU code
        123, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */
        125,   /* blockIdx.x threadIdx.x collapsed */
        127,   /* blockIdx.x threadIdx.x collapsed */
        129,   /* blockIdx.x threadIdx.x collapsed */
    129, Generating implicit firstprivate(inChPitch,elemSize,inLinePitch,inPtrOffset,outChPitch,inROIPitch,outLinePitch,outPtrOffset,width,outROIPitch)
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvacc6ykeUZw8jBKW.gpu -computecap 86 -ptx /tmp/nvaccAykeoD3YPxgS.ptx -o /tmp/nvaccQyke_gZ_6u5R.bin -ftz -cuda12020
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_shuffleChannel.c -sm 86 /tmp/nvaccQyke_gZ_6u5R.bin -compute 86 /tmp/nvaccAykeoD3YPxgS.ptx -o /tmp/nvacc6ykeU_lsR9dI.fat
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++MvkeYMTvSPoV.ll -S -o /tmp/nvc++wvkecDAolXg5.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++wvkecDAolXg5.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_shuffleChannel.obj
Unlinking /tmp/nvc++gvkesqjKl2DY.il
Unlinking /tmp/nvc++2vkeIstGHQ-l.s
Unlinking /tmp/nvc++MvkeYMTvSPoV.ll
Unlinking /tmp/nvc++wvkecDAolXg5.llvm
compiling src/tidl_slice.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_slice.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++YIkewfBgboDR.il src/tidl_slice.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_slice.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_slice.c -il /tmp/nvc++YIkewfBgboDR.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++YIkewfBgboDR.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_slice.obj' -asm /tmp/nvc++sIke2WrAIh5F.ll
void TIDL_refSlice<float, float>(float const*, float*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    124, Generating copyin(pIn[:outWidth+((inLinePitch*(outHeight-1))+((inChPitch*(numChs-1))+((inROIPitch*(numROIs-1))+inPtrOffset)))]) [if not already present]
         Generating copy(pOut[:outWidth+(((outHeight-1)*outLinePitch)+(((numChs-1)*outChPitch)+(((numROIs-1)*outROIPitch)+outPtrOffset)))]) [if not already present]
         Generating implicit firstprivate(outHeight,outWidth,numROIs,numDim1,numChs,numDim2)
         Generating NVIDIA GPU code
        124, #pragma acc loop gang, vector(128) collapse(6) /* blockIdx.x threadIdx.x */
        126,   /* blockIdx.x threadIdx.x collapsed */
        128,   /* blockIdx.x threadIdx.x collapsed */
        130,   /* blockIdx.x threadIdx.x collapsed */
        132,   /* blockIdx.x threadIdx.x collapsed */
        134,   /* blockIdx.x threadIdx.x collapsed */
    134, Generating implicit firstprivate(inDim2Pitch,inDim1Pitch,inChPitch,inLinePitch,inPtrOffset,outDim2Pitch,inROIPitch,outDim1Pitch,outChPitch,outLinePitch,outROIPitch,outPtrOffset)
void TIDL_refSlice<unsigned char, unsigned char>(unsigned char const*, unsigned char*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    124, Generating copy(pOut[:outWidth+(((outHeight-1)*outLinePitch)+(((numChs-1)*outChPitch)+(((numROIs-1)*outROIPitch)+outPtrOffset)))]) [if not already present]
         Generating copyin(pIn[:outWidth+((inLinePitch*(outHeight-1))+((inChPitch*(numChs-1))+((inROIPitch*(numROIs-1))+inPtrOffset)))]) [if not already present]
         Generating implicit firstprivate(outHeight,outWidth,numROIs,numDim1,numChs,numDim2)
         Generating NVIDIA GPU code
        124, #pragma acc loop gang, vector(128) collapse(6) /* blockIdx.x threadIdx.x */
        126,   /* blockIdx.x threadIdx.x collapsed */
        128,   /* blockIdx.x threadIdx.x collapsed */
        130,   /* blockIdx.x threadIdx.x collapsed */
        132,   /* blockIdx.x threadIdx.x collapsed */
        134,   /* blockIdx.x threadIdx.x collapsed */
    134, Generating implicit firstprivate(inDim2Pitch,inDim1Pitch,inChPitch,inLinePitch,inPtrOffset,outDim2Pitch,inROIPitch,outDim1Pitch,outChPitch,outLinePitch,outROIPitch,outPtrOffset)
void TIDL_refSlice<unsigned short, unsigned short>(unsigned short const*, unsigned short*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
    124, Generating copyin(pIn[:outWidth+((inLinePitch*(outHeight-1))+((inChPitch*(numChs-1))+((inROIPitch*(numROIs-1))+inPtrOffset)))]) [if not already present]
         Generating copy(pOut[:outWidth+(((outHeight-1)*outLinePitch)+(((numChs-1)*outChPitch)+(((numROIs-1)*outROIPitch)+outPtrOffset)))]) [if not already present]
         Generating implicit firstprivate(outHeight,outWidth,numROIs,numDim1,numChs,numDim2)
         Generating NVIDIA GPU code
        124, #pragma acc loop gang, vector(128) collapse(6) /* blockIdx.x threadIdx.x */
        126,   /* blockIdx.x threadIdx.x collapsed */
        128,   /* blockIdx.x threadIdx.x collapsed */
        130,   /* blockIdx.x threadIdx.x collapsed */
        132,   /* blockIdx.x threadIdx.x collapsed */
        134,   /* blockIdx.x threadIdx.x collapsed */
    134, Generating implicit firstprivate(inDim2Pitch,inDim1Pitch,inChPitch,inLinePitch,inPtrOffset,outDim2Pitch,inROIPitch,outDim1Pitch,outChPitch,outLinePitch,outROIPitch,outPtrOffset)
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvaccOLke4r0CR1C0.gpu -computecap 86 -ptx /tmp/nvaccOLke4NkgR3Fq.ptx -o /tmp/nvaccOLke4MFRO0EW.bin -ftz -cuda12020
 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_slice.c -sm 86 /tmp/nvaccOLke4MFRO0EW.bin -compute 86 /tmp/nvaccOLke4NkgR3Fq.ptx -o /tmp/nvaccOLke4DCBjyvW.fat
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++sIke2WrAIh5F.ll -S -o /tmp/nvc++IIkeM3F9GyqY.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++IIkeM3F9GyqY.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_slice.obj
Unlinking /tmp/nvc++YIkewfBgboDR.il
Unlinking /tmp/nvc++cIkegWZ6yfxY.s
Unlinking /tmp/nvc++sIke2WrAIh5F.ll
Unlinking /tmp/nvc++IIkeM3F9GyqY.llvm
compiling src/tidl_softmax.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_softmax.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++0VkeCHKPxaXr.il src/tidl_softmax.c
"src/tidl_softmax.c", line 306: warning: variable "maxIndex" was set but never used [set_but_not_used]
    int32_t   maxIndex = 0;
              ^

Remark: individual warnings can be suppressed with "--diag_suppress <warning-name>"


/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_softmax.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_softmax.c -il /tmp/nvc++0VkeCHKPxaXr.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++0VkeCHKPxaXr.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_softmax.obj' -asm /tmp/nvc++0VkeCuoBug2A.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++0VkeCuoBug2A.ll -S -o /tmp/nvc++uVke83aFMJEE.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++uVke83aFMJEE.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_softmax.obj
Unlinking /tmp/nvc++0VkeCHKPxaXr.il
Unlinking /tmp/nvc++uVke8Sia_9gn.s
Unlinking /tmp/nvc++0VkeCuoBug2A.ll
Unlinking /tmp/nvc++uVke83aFMJEE.llvm
compiling src/tidl_squeeze.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_squeeze.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++43keOrNw7iqF.il src/tidl_squeeze.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_squeeze.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_squeeze.c -il /tmp/nvc++43keOrNw7iqF.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++43keOrNw7iqF.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_squeeze.obj' -asm /tmp/nvc++43keOtTF4jIB.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++43keOtTF4jIB.ll -S -o /tmp/nvc++43keOWx5QnI0.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++43keOWx5QnI0.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_squeeze.obj
Unlinking /tmp/nvc++43keOrNw7iqF.il
Unlinking /tmp/nvc++43keOEHNYObI.s
Unlinking /tmp/nvc++43keOtTF4jIB.ll
Unlinking /tmp/nvc++43keOWx5QnI0.llvm
compiling src/tidl_transpose.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/tidl_transpose.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++7-keXrgkqPA7.il src/tidl_transpose.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_transpose.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_transpose.c -il /tmp/nvc++7-keXrgkqPA7.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++7-keXrgkqPA7.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_transpose.obj' -asm /tmp/nvc++R-kebF8Cco9g.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++R-kebF8Cco9g.ll -S -o /tmp/nvc++d-kejxaMj5QL.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++d-kejxaMj5QL.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_transpose.obj
Unlinking /tmp/nvc++7-keXrgkqPA7.il
Unlinking /tmp/nvc++t-ke5FyeHmHk.s
Unlinking /tmp/nvc++R-kebF8Cco9g.ll
Unlinking /tmp/nvc++d-kejxaMj5QL.llvm
compiling src/workload_ref_exec.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
src/workload_ref_exec.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++_hle6PX8ULVx.il src/workload_ref_exec.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/workload_ref_exec.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/workload_ref_exec.c -il /tmp/nvc++_hle6PX8ULVx.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++_hle6PX8ULVx.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/workload_ref_exec.obj' -asm /tmp/nvc++EhleAc2ypyzY.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++EhleAc2ypyzY.ll -S -o /tmp/nvc++ohleQvMgT4jT.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++ohleQvMgT4jT.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/workload_ref_exec.obj
Unlinking /tmp/nvc++_hle6PX8ULVx.il
Unlinking /tmp/nvc++Uhlek5_zv0hJ.s
Unlinking /tmp/nvc++EhleAc2ypyzY.ll
Unlinking /tmp/nvc++ohleQvMgT4jT.llvm
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/printv.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_alg.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_alg_utils.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_argmax.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_batchNorm.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_batchReshape.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_colorConversion.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_commonUtils.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_concat.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_const.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_conv2d_base.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_crop.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_custom_int.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_dataConvert.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_deconv2d.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_depthToSpace.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_detectionOutput.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_detectionOutput_score.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_device_functions.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_eltWise.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_flatten.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_function_mapping.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_gatherLayer.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_innerProduct.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_layerNorm.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_odOutputReformat.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_pad.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_pooling.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_preEmption.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_reduce.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_reshape.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_resize.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_roiPooling.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_scatterElements.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_shuffleChannel.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_slice.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_softmax.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_squeeze.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_transpose.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/workload_ref_exec.obj
make[1]: Leaving directory '/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/ti_dl/algo'
make -C ./ti_dl/algo/src/avx -f makefile
make[1]: Entering directory '/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/ti_dl/algo/src/avx'
compiling tidl_avx.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
tidl_avx.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__AVX2__ -D__FMA__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I../../inc -I../../../inc -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/packages -I/packages/ti/mathlib -I../../../../common -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/inc -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I../../../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv -I../../../../common -I../../../../common/c6xsim --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++QFle_E0oZnQy.il tidl_avx.c
"tidl_avx.c", line 306: warning: integer conversion resulted in a change of sign [integer_sign_change]
      mask[j]=1<<31;
              ^

Remark: individual warnings can be suppressed with "--diag_suppress <warning-name>"

"tidl_avx.c", line 237: warning: variable "i2" was declared but never referenced [declared_but_not_referenced]
    int32_t   i0, i2, i3, i4, i5, i6, i7, j;
                  ^

"tidl_avx.c", line 242: warning: variable "inData" was declared but never referenced [declared_but_not_referenced]
    float inData;
          ^

"tidl_avx.c", line 243: warning: variable "acc" was declared but never referenced [declared_but_not_referenced]
    float acc;
          ^

"tidl_avx.c", line 246: warning: variable "lmin" was declared but never referenced [declared_but_not_referenced]
    float lmin = *min;
          ^

"tidl_avx.c", line 247: warning: variable "lmax" was declared but never referenced [declared_but_not_referenced]
    float lmax = *max;
          ^

"tidl_avx.c", line 168: warning: integer conversion resulted in a change of sign [integer_sign_change]
      mask[j]=1<<31;
              ^

"tidl_avx.c", line 99: warning: variable "i2" was declared but never referenced [declared_but_not_referenced]
    int32_t   i0, i2, i3, i4, i5, i6, i7, j;
                  ^

"tidl_avx.c", line 104: warning: variable "inData" was declared but never referenced [declared_but_not_referenced]
    float inData;
          ^

"tidl_avx.c", line 105: warning: variable "acc" was declared but never referenced [declared_but_not_referenced]
    float acc;
          ^

"tidl_avx.c", line 108: warning: variable "lmin" was declared but never referenced [declared_but_not_referenced]
    float lmin = *min;
          ^

"tidl_avx.c", line 109: warning: variable "lmax" was declared but never referenced [declared_but_not_referenced]
    float lmax = *max;
          ^

"tidl_avx.c", line 1555: warning: variable "rem" was declared but never referenced [declared_but_not_referenced]
          int rem = (inWidth)%8; //Leftovers
              ^

"tidl_avx.c", line 1534: warning: variable "outOffset" was declared but never referenced [declared_but_not_referenced]
    int outOffset,outAcc;
        ^

"tidl_avx.c", line 1534: warning: variable "outAcc" was declared but never referenced [declared_but_not_referenced]
    int outOffset,outAcc;
                  ^

"tidl_avx.c", line 1535: warning: variable "i3" was declared but never referenced [declared_but_not_referenced]
    int i1,i2,i3,ik;
              ^

"tidl_avx.c", line 1535: warning: variable "ik" was declared but never referenced [declared_but_not_referenced]
    int i1,i2,i3,ik;
                 ^

"tidl_avx.c", line 1419: warning: variable "i3" was declared but never referenced [declared_but_not_referenced]
    int i1,i2,i3;
              ^

"tidl_avx.c", line 376: warning: variable "i2" was declared but never referenced [declared_but_not_referenced]
    int32_t   i0, i2, i3, i4, i5, i6, i7, j;
                  ^
          detected during instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1727

"tidl_avx.c", line 719: warning: variable "i2" was declared but never referenced [declared_but_not_referenced]
    int32_t   i0, i2, i3, i4, i5, i6, i7, j;
                  ^
          detected during instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1727

"tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1652
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1727

"tidl_avx.c", line 1067: warning: variable "i2" was declared but never referenced [declared_but_not_referenced]
    int32_t   i0, i2, i3, i4, i5, i6, i7, j;
                  ^
          detected during instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1727

"tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1657
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1727

"tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1670
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1727

"tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1675
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1727

"tidl_avx.c", line 548: warning: variable "i2" was declared but never referenced [declared_but_not_referenced]
    int32_t   i0, i2, i3, i4, i5, i6, i7, j;
                  ^
          detected during instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1727

"tidl_avx.c", line 894: warning: variable "i2" was declared but never referenced [declared_but_not_referenced]
    int32_t   i0, i2, i3, i4, i5, i6, i7, j;
                  ^
          detected during instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1727

"tidl_avx.c", line 970: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1689
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1727

"tidl_avx.c", line 1242: warning: variable "i2" was declared but never referenced [declared_but_not_referenced]
    int32_t   i0, i2, i3, i4, i5, i6, i7, j;
                  ^
          detected during instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1727

"tidl_avx.c", line 1318: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1694
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1727

"tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=int8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1652
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1745

"tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=int8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1657
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1745

"tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=int8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1670
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1745

"tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=int8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1675
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1745

"tidl_avx.c", line 970: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1689
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1745

"tidl_avx.c", line 1318: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1694
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1745

"tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=uint16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1652
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1837

"tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=uint16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1657
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1837

"tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=uint16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1670
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1837

"tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=uint16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1675
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1837

"tidl_avx.c", line 970: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1689
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1837

"tidl_avx.c", line 1318: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1694
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1837

"tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=int16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1652
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1855

"tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=int16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1657
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1855

"tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=int16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1670
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1855

"tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=int16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1675
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1855

"tidl_avx.c", line 970: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1689
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1855

"tidl_avx.c", line 1318: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1694
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1855

"tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=int8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1652
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1875

"tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=int8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1657
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1875

"tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=int8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1670
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1875

"tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=int8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1675
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1875

"tidl_avx.c", line 970: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1689
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1875

"tidl_avx.c", line 1318: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1694
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1875

"tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=uint8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1652
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1894

"tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=uint8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1657
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1894

"tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=uint8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1670
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1894

"tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=uint8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1675
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1894

"tidl_avx.c", line 970: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1689
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1894

"tidl_avx.c", line 1318: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1694
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1894

"tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=uint16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1652
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1913

"tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=uint16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1657
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1913

"tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=uint16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1670
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1913

"tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=uint16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1675
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1913

"tidl_avx.c", line 970: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1689
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1913

"tidl_avx.c", line 1318: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1694
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1913

"tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=int16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1652
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1932

"tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=int16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1657
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1932

"tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=int16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1670
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1932

"tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit<Ksize,Tin,Tw,Tb,Tacc>(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=int16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1675
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1932

"tidl_avx.c", line 970: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1689
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1932

"tidl_avx.c", line 1318: warning: shift count is too large [shift_count_too_large]
        mask[j]=((Tacc)1)<<63;
                           ^
          detected during:
            instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1694
            instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1932


/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 tidl_avx.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn tidl_avx.c -il /tmp/nvc++QFle_E0oZnQy.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++QFle_E0oZnQy.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ../../inc -I ../../../inc -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /packages -I /packages/ti/mathlib -I ../../../../common -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/inc -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I ../../../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -I ../../../../common -I ../../../../common/c6xsim -mavx -msse4.2 -msse4.1 -mssse3 -msse3 -msse2 -msse -mxsave -mavx2 -mavx -msse4.2 -msse4.1 -mssse3 -msse3 -msse2 -msse -mxsave -mfma -mavx -msse4.2 -msse4.1 -mssse3 -msse3 -msse2 -msse -mxsave -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/src/avx/./tidl_avx.obj' -asm /tmp/nvc++kFleEmB6qmN1.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++kFleEmB6qmN1.ll -S -o /tmp/nvc++AFleoW068JOW.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++AFleoW068JOW.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+avx2 -mattr=+fma -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/src/avx/./tidl_avx.obj
Unlinking /tmp/nvc++QFle_E0oZnQy.il
Unlinking /tmp/nvc++6FleUwyIET-r.s
Unlinking /tmp/nvc++kFleEmB6qmN1.ll
Unlinking /tmp/nvc++AFleoW068JOW.llvm
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/src/avx/./tidl_avx.obj
make[1]: Leaving directory '/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/ti_dl/algo/src/avx'
.
======== MAKING TIDL PRIV ALGO =================
make -C ./ti_dl/algo/src/priv -f makefile
make[1]: Entering directory '/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/ti_dl/algo/src/priv'
compiling tidl_model_play.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
tidl_model_play.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I../../inc -I../../../inc -I../../../../common -I../../../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++M5leY_tMRCOB.il tidl_model_play.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 tidl_model_play.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn tidl_model_play.c -il /tmp/nvc++M5leY_tMRCOB.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++M5leY_tMRCOB.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ../../inc -I ../../../inc -I ../../../../common -I ../../../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/src/priv/./tidl_model_play.obj' -asm /tmp/nvc++g5lesDsL2WKa.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++g5lesDsL2WKa.ll -S -o /tmp/nvc++25leIhMkOwHC.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++25leIhMkOwHC.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/src/priv/./tidl_model_play.obj
Unlinking /tmp/nvc++M5leY_tMRCOB.il
Unlinking /tmp/nvc++w5lec1ZJl4i-.s
Unlinking /tmp/nvc++g5lesDsL2WKa.ll
Unlinking /tmp/nvc++25leIhMkOwHC.llvm
compiling tidl_stalgo.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
tidl_stalgo.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I../../inc -I../../../inc -I../../../../common -I../../../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++FbmeD7_JEsuG.il tidl_stalgo.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 tidl_stalgo.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn tidl_stalgo.c -il /tmp/nvc++FbmeD7_JEsuG.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++FbmeD7_JEsuG.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ../../inc -I ../../../inc -I ../../../../common -I ../../../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/src/priv/./tidl_stalgo.obj' -asm /tmp/nvc++pbmeTpPXpitt.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++pbmeTpPXpitt.ll -S -o /tmp/nvc++hbmevntIRNr-.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++hbmevntIRNr-.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/src/priv/./tidl_stalgo.obj
Unlinking /tmp/nvc++FbmeD7_JEsuG.il
Unlinking /tmp/nvc++xbmefxwpAIlC.s
Unlinking /tmp/nvc++pbmeTpPXpitt.ll
Unlinking /tmp/nvc++hbmevntIRNr-.llvm
compiling tidl_stalgo_workload.c
Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2
Export NVHPC_CURRENT_CUDA_VERSION=12.2.53
Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7
Export PGI=/opt/nvidia/hpc_sdk
tidl_stalgo_workload.c:

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I../../inc -I../../../inc -I../../../../common -I../../../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++Njme1YbcLACW.il tidl_stalgo_workload.c

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 tidl_stalgo_workload.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn tidl_stalgo_workload.c -il /tmp/nvc++Njme1YbcLACW.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++Njme1YbcLACW.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ../../inc -I ../../../inc -I ../../../../common -I ../../../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/src/priv/./tidl_stalgo_workload.obj' -asm /tmp/nvc++xjmefvr7oMka.ll
NVC++/x86-64 Linux 23.7-0: compilation successful

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default<O3>' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++xjmefvr7oMka.ll -S -o /tmp/nvc++pjmeT978iBlr.llvm

/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++pjmeT978iBlr.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/src/priv/./tidl_stalgo_workload.obj
Unlinking /tmp/nvc++Njme1YbcLACW.il
Unlinking /tmp/nvc++FjmeDjWHDfP-.s
Unlinking /tmp/nvc++xjmefvr7oMka.ll
Unlinking /tmp/nvc++pjmeT978iBlr.llvm
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/src/priv/./tidl_model_play.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/src/priv/./tidl_stalgo.obj
r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/src/priv/./tidl_stalgo_workload.obj
make[1]: Leaving directory '/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/ti_dl/algo/src/priv'
.
======== MAKING TIDL AND CUSTOM LIBRARIES =================