======== MAKING CUSTOM LIB ================= make -C ./ti_dl/custom -f makefile make[1]: Entering directory '/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/ti_dl/custom' compiling tidl_custom.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk tidl_custom.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I../inc -I../utils/perfsim --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++LoeeV_QwCEZU.il tidl_custom.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 tidl_custom.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn tidl_custom.c -il /tmp/nvc++LoeeV_QwCEZU.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++LoeeV_QwCEZU.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ../inc -I ../utils/perfsim -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/tidl_custom.obj' -asm /tmp/nvc++1oeeFzzHc1-6.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++1oeeFzzHc1-6.ll -S -o /tmp/nvc++DoeexLe7RX_h.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++DoeexLe7RX_h.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/tidl_custom.obj Unlinking /tmp/nvc++LoeeV_QwCEZU.il Unlinking /tmp/nvc++noeeN0IsCLBa.s Unlinking /tmp/nvc++1oeeFzzHc1-6.ll Unlinking /tmp/nvc++DoeexLe7RX_h.llvm compiling tidl_custom_maxpooling.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk tidl_custom_maxpooling.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I../inc -I../utils/perfsim --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++Owee4APrJ0oH.il tidl_custom_maxpooling.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 tidl_custom_maxpooling.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn tidl_custom_maxpooling.c -il /tmp/nvc++Owee4APrJ0oH.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++Owee4APrJ0oH.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ../inc -I ../utils/perfsim -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/tidl_custom_maxpooling.obj' -asm /tmp/nvc++Owee4aB9X5Jf.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++Owee4aB9X5Jf.ll -S -o /tmp/nvc++Owee4hZ8buR3.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++Owee4hZ8buR3.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/tidl_custom_maxpooling.obj Unlinking /tmp/nvc++Owee4APrJ0oH.il Unlinking /tmp/nvc++Owee4X0YkVS1.s Unlinking /tmp/nvc++Owee4aB9X5Jf.ll Unlinking /tmp/nvc++Owee4hZ8buR3.llvm compiling tidsp/tidl_custom_maxpool_ixX_oxX.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk tidsp/tidl_custom_maxpool_ixX_oxX.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I../inc -I../utils/perfsim --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++nEeeN_q9jtyL.il tidsp/tidl_custom_maxpool_ixX_oxX.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 tidsp/tidl_custom_maxpool_ixX_oxX.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn tidsp/tidl_custom_maxpool_ixX_oxX.c -il /tmp/nvc++nEeeN_q9jtyL.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++nEeeN_q9jtyL.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ../inc -I ../utils/perfsim -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/./tidsp/tidl_custom_maxpool_ixX_oxX.obj' -asm /tmp/nvc++DEeexpF-ub3W.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++DEeexpF-ub3W.ll -S -o /tmp/nvc++fEeepk59nPHz.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++fEeepk59nPHz.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/./tidsp/tidl_custom_maxpool_ixX_oxX.obj Unlinking /tmp/nvc++nEeeN_q9jtyL.il Unlinking /tmp/nvc++1EeeFZ4Hh1Gz.s Unlinking /tmp/nvc++DEeexpF-ub3W.ll Unlinking /tmp/nvc++fEeepk59nPHz.llvm compiling tidsp/tidl_custom_maxpool_ixX_oxX_c7x.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk tidsp/tidl_custom_maxpool_ixX_oxX_c7x.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I../inc -I../utils/perfsim --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++cMeegc-VVWBZ.il tidsp/tidl_custom_maxpool_ixX_oxX_c7x.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 tidsp/tidl_custom_maxpool_ixX_oxX_c7x.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn tidsp/tidl_custom_maxpool_ixX_oxX_c7x.c -il /tmp/nvc++cMeegc-VVWBZ.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++cMeegc-VVWBZ.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ../inc -I ../utils/perfsim -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/./tidsp/tidl_custom_maxpool_ixX_oxX_c7x.obj' -asm /tmp/nvc++IMeeMPXq8XYj.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++IMeeMPXq8XYj.ll -S -o /tmp/nvc++YMeew2_VJ2A3.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++YMeew2_VJ2A3.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/./tidsp/tidl_custom_maxpool_ixX_oxX_c7x.obj Unlinking /tmp/nvc++cMeegc-VVWBZ.il Unlinking /tmp/nvc++sMee2s3451dA.s Unlinking /tmp/nvc++IMeeMPXq8XYj.ll Unlinking /tmp/nvc++YMeew2_VJ2A3.llvm compiling tidsp/tidl_custom_maxpool_ixX_oxX_cn.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk tidsp/tidl_custom_maxpool_ixX_oxX_cn.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I../inc -I../utils/perfsim --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++pUeeTglS7OkS.il tidsp/tidl_custom_maxpool_ixX_oxX_cn.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 tidsp/tidl_custom_maxpool_ixX_oxX_cn.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn tidsp/tidl_custom_maxpool_ixX_oxX_cn.c -il /tmp/nvc++pUeeTglS7OkS.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++pUeeTglS7OkS.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ../inc -I ../utils/perfsim -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/./tidsp/tidl_custom_maxpool_ixX_oxX_cn.obj' -asm /tmp/nvc++-Uee9Ed55VHz.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++-Uee9Ed55VHz.ll -S -o /tmp/nvc++3UeeLw-Cpy1d.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++3UeeLw-Cpy1d.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/./tidsp/tidl_custom_maxpool_ixX_oxX_cn.obj Unlinking /tmp/nvc++pUeeTglS7OkS.il Unlinking /tmp/nvc++hUeev8drKhG7.s Unlinking /tmp/nvc++-Uee9Ed55VHz.ll Unlinking /tmp/nvc++3UeeLw-Cpy1d.llvm r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/tidl_custom.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/tidl_custom_maxpooling.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/./tidsp/tidl_custom_maxpool_ixX_oxX.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/./tidsp/tidl_custom_maxpool_ixX_oxX_c7x.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/custom/./tidsp/tidl_custom_maxpool_ixX_oxX_cn.obj make[1]: Leaving directory '/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/ti_dl/custom' ======== MAKING TIDL ALGO ================= make -C ./ti_dl/algo -f makefile make[1]: Entering directory '/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/ti_dl/algo' compiling src/printv.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/printv.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++TjfehdWpwH7G.il src/printv.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/printv.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/printv.c -il /tmp/nvc++TjfehdWpwH7G.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++TjfehdWpwH7G.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/printv.obj' -asm /tmp/nvc++9jfe37Z4LCWP.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++9jfe37Z4LCWP.ll -S -o /tmp/nvc++LjfeVAZXFyXa.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++LjfeVAZXFyXa.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/printv.obj Unlinking /tmp/nvc++TjfehdWpwH7G.il Unlinking /tmp/nvc++vjfe-Bq438AP.s Unlinking /tmp/nvc++9jfe37Z4LCWP.ll Unlinking /tmp/nvc++LjfeVAZXFyXa.llvm compiling src/tidl_alg.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_alg.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++VrfenVy2K3Yx.il src/tidl_alg.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_alg.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_alg.c -il /tmp/nvc++VrfenVy2K3Yx.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++VrfenVy2K3Yx.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_alg.obj' -asm /tmp/nvc++FrfeDVtux-J1.ll TIDL_activate(IALG_Obj*): 2489, Generating enter data create(scratch_buffer_base[:scratch_buffer_size]) 2495, Generating enter data copyin(coeff_buffer_base[:coeff_buffer_size]) TIDL_deactivate(IALG_Obj*): 2557, Generating exit data delete(scratch_buffer_base[:scratch_buffer_size]) 2568, Generating exit data delete(coeff_buffer_base[:coeff_buffer_size]) NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++FrfeDVtux-J1.ll -S -o /tmp/nvc++xrfefRrcDBT5.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++xrfefRrcDBT5.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_alg.obj Unlinking /tmp/nvc++VrfenVy2K3Yx.il Unlinking /tmp/nvc++Nrfe1O_CvlJL.s Unlinking /tmp/nvc++FrfeDVtux-J1.ll Unlinking /tmp/nvc++xrfefRrcDBT5.llvm compiling src/tidl_alg_utils.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_alg_utils.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++WzfeqF7IZVWD.il src/tidl_alg_utils.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_alg_utils.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_alg_utils.c -il /tmp/nvc++WzfeqF7IZVWD.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++WzfeqF7IZVWD.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_alg_utils.obj' -asm /tmp/nvc++WzfeqJs9XG1Q.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++WzfeqJs9XG1Q.ll -S -o /tmp/nvc++Wzfeq9MFIL2Z.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++Wzfeq9MFIL2Z.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_alg_utils.obj Unlinking /tmp/nvc++WzfeqF7IZVWD.il Unlinking /tmp/nvc++WzfeqzxY6ceQ.s Unlinking /tmp/nvc++WzfeqJs9XG1Q.ll Unlinking /tmp/nvc++Wzfeq9MFIL2Z.llvm compiling src/tidl_argmax.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_argmax.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++ZHfezB2aiwr6.il src/tidl_argmax.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_argmax.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_argmax.c -il /tmp/nvc++ZHfezB2aiwr6.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++ZHfezB2aiwr6.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_argmax.obj' -asm /tmp/nvc++JHfePQ4N7KBJ.ll int TIDL_argmaxRefProcess(sTIDL_AlgLayer_t*, sTIDL_ArgMaxParams_t*, sTIDL_DataParams_t*, sTIDL_DataParams_t*, signed char*, unsigned char*, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int): 127, Generating copyin(inData[:imWidth+((inPitch*(imHeight-1))+((inChPitch*(params->numChannels-1))+(inChPitch*(params->numChannels*(numTotRoi-1)))))]) [if not already present] Generating implicit firstprivate(imHeight,numTotRoi,imWidth) Generating NVIDIA GPU code 127, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 129, /* blockIdx.x threadIdx.x collapsed */ 131, /* blockIdx.x threadIdx.x collapsed */ 135, #pragma acc loop seq 127, Generating implicit copyin(params) [if not already present] Generating copyout(outData[:imWidth+(((imHeight-1)*outPitch)+((numTotRoi-1)*outChPitch))]) [if not already present] 131, Generating implicit firstprivate(outPitch,outChPitch,maxIdx) 135, Loop carried scalar dependence for maxVal at line 145 Generating implicit firstprivate(inChPitch,currVal,maxVal,inPitch) Loop carried scalar dependence for maxVal at line 145 149, Accelerator restriction: induction variable live-out from loop: maxIdx int TIDL_argmaxRefProcess(sTIDL_AlgLayer_t*, sTIDL_ArgMaxParams_t*, sTIDL_DataParams_t*, sTIDL_DataParams_t*, unsigned char*, unsigned char*, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int): 127, Generating copyin(inData[:imWidth+((inPitch*(imHeight-1))+((inChPitch*(params->numChannels-1))+(inChPitch*(params->numChannels*(numTotRoi-1)))))]) [if not already present] Generating implicit firstprivate(imHeight,numTotRoi,imWidth) Generating NVIDIA GPU code 127, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 129, /* blockIdx.x threadIdx.x collapsed */ 131, /* blockIdx.x threadIdx.x collapsed */ 135, #pragma acc loop seq 127, Generating implicit copyin(params) [if not already present] Generating copyout(outData[:imWidth+(((imHeight-1)*outPitch)+((numTotRoi-1)*outChPitch))]) [if not already present] 131, Generating implicit firstprivate(outPitch,outChPitch,maxIdx) 135, Loop carried scalar dependence for maxVal at line 145 Generating implicit firstprivate(inChPitch,currVal,maxVal,inPitch) Loop carried scalar dependence for maxVal at line 145 149, Accelerator restriction: induction variable live-out from loop: maxIdx int TIDL_argmaxRefProcess(sTIDL_AlgLayer_t*, sTIDL_ArgMaxParams_t*, sTIDL_DataParams_t*, sTIDL_DataParams_t*, short*, unsigned short*, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int): 127, Generating copyin(inData[:imWidth+((inPitch*(imHeight-1))+((inChPitch*(params->numChannels-1))+(inChPitch*(params->numChannels*(numTotRoi-1)))))]) [if not already present] Generating implicit firstprivate(imHeight,numTotRoi,imWidth) Generating NVIDIA GPU code 127, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 129, /* blockIdx.x threadIdx.x collapsed */ 131, /* blockIdx.x threadIdx.x collapsed */ 135, #pragma acc loop seq 127, Generating implicit copyin(params) [if not already present] Generating copyout(outData[:imWidth+(((imHeight-1)*outPitch)+((numTotRoi-1)*outChPitch))]) [if not already present] 131, Generating implicit firstprivate(outPitch,outChPitch,maxIdx) 135, Loop carried scalar dependence for maxVal at line 145 Generating implicit firstprivate(inChPitch,currVal,maxVal,inPitch) Loop carried scalar dependence for maxVal at line 145 149, Accelerator restriction: induction variable live-out from loop: maxIdx int TIDL_argmaxRefProcess(sTIDL_AlgLayer_t*, sTIDL_ArgMaxParams_t*, sTIDL_DataParams_t*, sTIDL_DataParams_t*, unsigned short*, unsigned short*, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int): 127, Generating copyin(inData[:imWidth+((inPitch*(imHeight-1))+((inChPitch*(params->numChannels-1))+(inChPitch*(params->numChannels*(numTotRoi-1)))))]) [if not already present] Generating implicit firstprivate(imHeight,numTotRoi,imWidth) Generating NVIDIA GPU code 127, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 129, /* blockIdx.x threadIdx.x collapsed */ 131, /* blockIdx.x threadIdx.x collapsed */ 135, #pragma acc loop seq 127, Generating implicit copyin(params) [if not already present] Generating copyout(outData[:imWidth+(((imHeight-1)*outPitch)+((numTotRoi-1)*outChPitch))]) [if not already present] 131, Generating implicit firstprivate(outPitch,outChPitch,maxIdx) 135, Loop carried scalar dependence for maxVal at line 145 Generating implicit firstprivate(inChPitch,currVal,maxVal,inPitch) Loop carried scalar dependence for maxVal at line 145 149, Accelerator restriction: induction variable live-out from loop: maxIdx int TIDL_argmaxRefProcess(sTIDL_AlgLayer_t*, sTIDL_ArgMaxParams_t*, sTIDL_DataParams_t*, sTIDL_DataParams_t*, float*, float*, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int): 127, Generating copyin(inData[:imWidth+((inPitch*(imHeight-1))+((inChPitch*(params->numChannels-1))+(inChPitch*(params->numChannels*(numTotRoi-1)))))]) [if not already present] Generating implicit firstprivate(imHeight,numTotRoi,imWidth) Generating NVIDIA GPU code 127, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 129, /* blockIdx.x threadIdx.x collapsed */ 131, /* blockIdx.x threadIdx.x collapsed */ 135, #pragma acc loop seq 127, Generating implicit copyin(params) [if not already present] Generating copyout(outData[:imWidth+(((imHeight-1)*outPitch)+((numTotRoi-1)*outChPitch))]) [if not already present] 131, Generating implicit firstprivate(outPitch,outChPitch,maxIdx) 135, Loop carried scalar dependence for maxVal at line 145 Generating implicit firstprivate(inChPitch,currVal,maxVal,inPitch) Loop carried scalar dependence for maxVal at line 145 149, Accelerator restriction: induction variable live-out from loop: maxIdx /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvaccDKfexKywgT5i.gpu -computecap 86 -ptx /tmp/nvaccTKfehbgcwdcy.ptx -o /tmp/nvaccvKfe-jw99DYx.bin -ftz -cuda12020 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_argmax.c -sm 86 /tmp/nvaccvKfe-jw99DYx.bin -compute 86 /tmp/nvaccTKfehbgcwdcy.ptx -o /tmp/nvacc9Kfe3zNMemMr.fat NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++JHfePQ4N7KBJ.ll -S -o /tmp/nvc++7HfeX-DKbBmR.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++7HfeX-DKbBmR.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_argmax.obj Unlinking /tmp/nvc++ZHfezB2aiwr6.il Unlinking /tmp/nvc++lHfeH2jnP4pn.s Unlinking /tmp/nvc++JHfePQ4N7KBJ.ll Unlinking /tmp/nvc++7HfeX-DKbBmR.llvm compiling src/tidl_batchNorm.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_batchNorm.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++0UfeCh8txGPW.il src/tidl_batchNorm.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_batchNorm.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_batchNorm.c -il /tmp/nvc++0UfeCh8txGPW.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++0UfeCh8txGPW.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_batchNorm.obj' -asm /tmp/nvc++0UfeCAfdvcnY.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++0UfeCAfdvcnY.ll -S -o /tmp/nvc++uUfe8TWfMFpY.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++uUfe8TWfMFpY.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_batchNorm.obj Unlinking /tmp/nvc++0UfeCh8txGPW.il Unlinking /tmp/nvc++uUfe8cD9_fkL.s Unlinking /tmp/nvc++0UfeCAfdvcnY.ll Unlinking /tmp/nvc++uUfe8TWfMFpY.llvm compiling src/tidl_batchReshape.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_batchReshape.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++42feO3YM4OxS.il src/tidl_batchReshape.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_batchReshape.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_batchReshape.c -il /tmp/nvc++42feO3YM4OxS.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++42feO3YM4OxS.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_batchReshape.obj' -asm /tmp/nvc++42feOzy_5jgx.ll void TIDL_refBatchReshape(float const*, float*, int, int, int, int, int, int, int, int, int, int, int, int): 119, Generating copyin(pIn[:width+((inLinePitch*(height-1))+((inChPitch*(numChs-1))+((inBatchPitch*(numBatches-1))+inPtrOffset)))]) [if not already present] Generating copy(pOut[:width+(((height-1)*outLinePitch)+(((numChs-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+outPtrOffset)))]) [if not already present] Generating implicit firstprivate(numChs,numBatches,height,width) Generating NVIDIA GPU code 119, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 121, /* blockIdx.x threadIdx.x collapsed */ 123, /* blockIdx.x threadIdx.x collapsed */ 125, /* blockIdx.x threadIdx.x collapsed */ 125, Generating implicit firstprivate(inChPitch,inBatchPitch,inLinePitch,outChPitch,outBatchPitch,outPtrOffset,outLinePitch,inPtrOffset) void TIDL_refBatchReshape(unsigned char const*, unsigned char*, int, int, int, int, int, int, int, int, int, int, int, int): 119, Generating copyin(pIn[:width+((inLinePitch*(height-1))+((inChPitch*(numChs-1))+((inBatchPitch*(numBatches-1))+inPtrOffset)))]) [if not already present] Generating copy(pOut[:width+(((height-1)*outLinePitch)+(((numChs-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+outPtrOffset)))]) [if not already present] Generating implicit firstprivate(numChs,width,numBatches,height) Generating NVIDIA GPU code 119, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 121, /* blockIdx.x threadIdx.x collapsed */ 123, /* blockIdx.x threadIdx.x collapsed */ 125, /* blockIdx.x threadIdx.x collapsed */ 125, Generating implicit firstprivate(inChPitch,inBatchPitch,inLinePitch,outChPitch,outBatchPitch,outPtrOffset,outLinePitch,inPtrOffset) void TIDL_refBatchReshape(unsigned short const*, unsigned short*, int, int, int, int, int, int, int, int, int, int, int, int): 119, Generating copyin(pIn[:width+((inLinePitch*(height-1))+((inChPitch*(numChs-1))+((inBatchPitch*(numBatches-1))+inPtrOffset)))]) [if not already present] Generating copy(pOut[:width+(((height-1)*outLinePitch)+(((numChs-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+outPtrOffset)))]) [if not already present] Generating implicit firstprivate(numChs,numBatches,height,width) Generating NVIDIA GPU code 119, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 121, /* blockIdx.x threadIdx.x collapsed */ 123, /* blockIdx.x threadIdx.x collapsed */ 125, /* blockIdx.x threadIdx.x collapsed */ 125, Generating implicit firstprivate(inChPitch,inBatchPitch,inLinePitch,outChPitch,outBatchPitch,outPtrOffset,outLinePitch,inPtrOffset) /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvacc05feC3FjxMFt.gpu -computecap 86 -ptx /tmp/nvacc05feCkv1xF-g.ptx -o /tmp/nvaccu5fe8D_j0z78.bin -ftz -cuda12020 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_batchReshape.c -sm 86 /tmp/nvaccu5fe8D_j0z78.bin -compute 86 /tmp/nvacc05feCkv1xF-g.ptx -o /tmp/nvacc05feCnYt1Pws.fat NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++42feOzy_5jgx.ll -S -o /tmp/nvc++42feOMBnRDCq.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++42feOMBnRDCq.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_batchReshape.obj Unlinking /tmp/nvc++42feO3YM4OxS.il Unlinking /tmp/nvc++42feO0j5Zys3.s Unlinking /tmp/nvc++42feOzy_5jgx.ll Unlinking /tmp/nvc++42feOMBnRDCq.llvm compiling src/tidl_colorConversion.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_colorConversion.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++IdgeMRKpCyxR.il src/tidl_colorConversion.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_colorConversion.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_colorConversion.c -il /tmp/nvc++IdgeMRKpCyxR.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++IdgeMRKpCyxR.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_colorConversion.obj' -asm /tmp/nvc++cdgeglBVGWO8.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++cdgeglBVGWO8.ll -S -o /tmp/nvc++sdge2J0tIrL7.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++sdge2J0tIrL7.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_colorConversion.obj Unlinking /tmp/nvc++IdgeMRKpCyxR.il Unlinking /tmp/nvc++YdgewikL9iN3.s Unlinking /tmp/nvc++cdgeglBVGWO8.ll Unlinking /tmp/nvc++sdge2J0tIrL7.llvm compiling src/tidl_commonUtils.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_commonUtils.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++-lge97wn0hEL.il src/tidl_commonUtils.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_commonUtils.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_commonUtils.c -il /tmp/nvc++-lge97wn0hEL.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++-lge97wn0hEL.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_commonUtils.obj' -asm /tmp/nvc++Vlgen4bPMJF8.ll TIDL_getSaturationFloat(sTIDL_Layer_t*, float*, float*): 2968, Generating implicit acc routine seq Generating acc routine seq Generating NVIDIA GPU code TIDL_floatSat(float, sTIDL_Layer_t*): 3013, Generating acc routine seq Generating NVIDIA GPU code TIDL_checkPixelInPadRegion(int, int, int, int, int, int): 3267, Generating acc routine seq Generating NVIDIA GPU code TIDL_convertFloatToScaleAndShift(float, int*, int*, int): 3390, Generating acc routine seq Generating NVIDIA GPU code /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvacc7ogeXQnwqa5U.gpu -computecap 86 -ptx /tmp/nvaccRogebitYayjx.ptx -o /tmp/nvaccdogejimgBy0u.bin -ftz -cuda12020 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_commonUtils.c -sm 86 /tmp/nvaccdogejimgBy0u.bin -compute 86 /tmp/nvaccRogebitYayjx.ptx -o /tmp/nvaccBogerjnfsA3J.fat NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++Vlgen4bPMJF8.ll -S -o /tmp/nvc++Nlge15Wen5OP.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++Nlge15Wen5OP.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_commonUtils.obj Unlinking /tmp/nvc++-lge97wn0hEL.il Unlinking /tmp/nvc++3lgeLBUZLzrw.s Unlinking /tmp/nvc++Vlgen4bPMJF8.ll Unlinking /tmp/nvc++Nlge15Wen5OP.llvm compiling src/tidl_concat.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_concat.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++dygejyCDyszz.il src/tidl_concat.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_concat.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_concat.c -il /tmp/nvc++dygejyCDyszz.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++dygejyCDyszz.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_concat.obj' -asm /tmp/nvc++ZygezJ6dl7dc.ll void TIDL_refConcat(signed char*, int*, int, tidlConcatBuffParams_t*): 156, Generating implicit copyin(concatBuffParams) [if not already present] Generating copyin(pIn[:concatBuffParams->inWidth+((concatBuffParams->inPitch*(concatBuffParams->inHeight-1))+(concatBuffParams->inChPitch*(concatBuffParams->numInChannels-1)))]) [if not already present] Generating present(pAcc[:concatBuffParams->inWidth+(((concatBuffParams->inHeight-1)*concatBuffParams->outPitch)+((concatBuffParams->numInChannels-1)*concatBuffParams->outChPitch))]) Generating NVIDIA GPU code 156, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 158, /* blockIdx.x threadIdx.x collapsed */ 160, /* blockIdx.x threadIdx.x collapsed */ 160, Generating implicit firstprivate(scale) void TIDL_refConcat(unsigned char*, int*, int, tidlConcatBuffParams_t*): 156, Generating implicit copyin(concatBuffParams) [if not already present] Generating copyin(pIn[:concatBuffParams->inWidth+((concatBuffParams->inPitch*(concatBuffParams->inHeight-1))+(concatBuffParams->inChPitch*(concatBuffParams->numInChannels-1)))]) [if not already present] Generating present(pAcc[:concatBuffParams->inWidth+(((concatBuffParams->inHeight-1)*concatBuffParams->outPitch)+((concatBuffParams->numInChannels-1)*concatBuffParams->outChPitch))]) Generating NVIDIA GPU code 156, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 158, /* blockIdx.x threadIdx.x collapsed */ 160, /* blockIdx.x threadIdx.x collapsed */ 160, Generating implicit firstprivate(scale) void TIDL_refConcat(short*, int*, int, tidlConcatBuffParams_t*): 156, Generating implicit copyin(concatBuffParams) [if not already present] Generating copyin(pIn[:concatBuffParams->inWidth+((concatBuffParams->inPitch*(concatBuffParams->inHeight-1))+(concatBuffParams->inChPitch*(concatBuffParams->numInChannels-1)))]) [if not already present] Generating present(pAcc[:concatBuffParams->inWidth+(((concatBuffParams->inHeight-1)*concatBuffParams->outPitch)+((concatBuffParams->numInChannels-1)*concatBuffParams->outChPitch))]) Generating NVIDIA GPU code 156, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 158, /* blockIdx.x threadIdx.x collapsed */ 160, /* blockIdx.x threadIdx.x collapsed */ 160, Generating implicit firstprivate(scale) void TIDL_refConcat(unsigned short*, int*, int, tidlConcatBuffParams_t*): 156, Generating implicit copyin(concatBuffParams) [if not already present] Generating copyin(pIn[:concatBuffParams->inWidth+((concatBuffParams->inPitch*(concatBuffParams->inHeight-1))+(concatBuffParams->inChPitch*(concatBuffParams->numInChannels-1)))]) [if not already present] Generating present(pAcc[:concatBuffParams->inWidth+(((concatBuffParams->inHeight-1)*concatBuffParams->outPitch)+((concatBuffParams->numInChannels-1)*concatBuffParams->outChPitch))]) Generating NVIDIA GPU code 156, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 158, /* blockIdx.x threadIdx.x collapsed */ 160, /* blockIdx.x threadIdx.x collapsed */ 160, Generating implicit firstprivate(scale) void TIDL_refConcat(float*, float*, int, tidlConcatBuffParams_t*): 156, Generating implicit copyin(concatBuffParams) [if not already present] Generating copyin(pIn[:concatBuffParams->inWidth+((concatBuffParams->inPitch*(concatBuffParams->inHeight-1))+(concatBuffParams->inChPitch*(concatBuffParams->numInChannels-1)))]) [if not already present] Generating present(pAcc[:concatBuffParams->inWidth+(((concatBuffParams->inHeight-1)*concatBuffParams->outPitch)+((concatBuffParams->numInChannels-1)*concatBuffParams->outChPitch))]) Generating NVIDIA GPU code 156, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 158, /* blockIdx.x threadIdx.x collapsed */ 160, /* blockIdx.x threadIdx.x collapsed */ 160, Generating implicit firstprivate(scale) void TIDL_refConcatQuantize(TIDL_Obj*, int, int*, signed char*, int, tidlConcatBuffParams_t*, int, int): 210, Generating present(pAcc[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))]) Generating NVIDIA GPU code 210, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 212, /* blockIdx.x threadIdx.x collapsed */ 214, /* blockIdx.x threadIdx.x collapsed */ 210, Generating implicit copy(max,min) [if not already present] Generating implicit copyin(concatBuffParams) [if not already present] 214, Generating implicit firstprivate(outAcc) 243, Generating present(pAcc[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))]) Generating copy(pout[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))]) [if not already present] Generating implicit copy(net) [if not already present] Generating NVIDIA GPU code 243, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 245, /* blockIdx.x threadIdx.x collapsed */ 247, /* blockIdx.x threadIdx.x collapsed */ 243, Generating implicit copyin(concatBuffParams) [if not already present] 247, Generating implicit firstprivate(layerIdx,elementtype,outAcc,satLow,satHigh,roundBits) void TIDL_refConcatQuantize(TIDL_Obj*, int, int*, unsigned char*, int, tidlConcatBuffParams_t*, int, int): 210, Generating present(pAcc[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))]) Generating NVIDIA GPU code 210, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 212, /* blockIdx.x threadIdx.x collapsed */ 214, /* blockIdx.x threadIdx.x collapsed */ 210, Generating implicit copy(max,min) [if not already present] Generating implicit copyin(concatBuffParams) [if not already present] 214, Generating implicit firstprivate(outAcc) 243, Generating present(pAcc[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))]) Generating copy(pout[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))]) [if not already present] Generating implicit copy(net) [if not already present] Generating NVIDIA GPU code 243, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 245, /* blockIdx.x threadIdx.x collapsed */ 247, /* blockIdx.x threadIdx.x collapsed */ 243, Generating implicit copyin(concatBuffParams) [if not already present] 247, Generating implicit firstprivate(layerIdx,elementtype,outAcc,satLow,satHigh,roundBits) void TIDL_refConcatQuantize(TIDL_Obj*, int, int*, short*, int, tidlConcatBuffParams_t*, int, int): 210, Generating present(pAcc[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))]) Generating NVIDIA GPU code 210, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 212, /* blockIdx.x threadIdx.x collapsed */ 214, /* blockIdx.x threadIdx.x collapsed */ 210, Generating implicit copy(max,min) [if not already present] Generating implicit copyin(concatBuffParams) [if not already present] 214, Generating implicit firstprivate(outAcc) 243, Generating present(pAcc[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))]) Generating copy(pout[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))]) [if not already present] Generating implicit copy(net) [if not already present] Generating NVIDIA GPU code 243, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 245, /* blockIdx.x threadIdx.x collapsed */ 247, /* blockIdx.x threadIdx.x collapsed */ 243, Generating implicit copyin(concatBuffParams) [if not already present] 247, Generating implicit firstprivate(layerIdx,elementtype,outAcc,satLow,satHigh,roundBits) void TIDL_refConcatQuantize(TIDL_Obj*, int, int*, unsigned short*, int, tidlConcatBuffParams_t*, int, int): 210, Generating present(pAcc[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))]) Generating NVIDIA GPU code 210, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 212, /* blockIdx.x threadIdx.x collapsed */ 214, /* blockIdx.x threadIdx.x collapsed */ 210, Generating implicit copy(max,min) [if not already present] Generating implicit copyin(concatBuffParams) [if not already present] 214, Generating implicit firstprivate(outAcc) 243, Generating present(pAcc[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))]) Generating copy(pout[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))]) [if not already present] Generating implicit copy(net) [if not already present] Generating NVIDIA GPU code 243, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 245, /* blockIdx.x threadIdx.x collapsed */ 247, /* blockIdx.x threadIdx.x collapsed */ 243, Generating implicit copyin(concatBuffParams) [if not already present] 247, Generating implicit firstprivate(layerIdx,elementtype,outAcc,satLow,satHigh,roundBits) void TIDL_refConcatQuantize(TIDL_Obj*, int, float*, float*, int, tidlConcatBuffParams_t*, int, int): 210, Generating present(pAcc[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))]) Generating NVIDIA GPU code 210, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 212, /* blockIdx.x threadIdx.x collapsed */ 214, /* blockIdx.x threadIdx.x collapsed */ 210, Generating implicit copy(max,min) [if not already present] Generating implicit copyin(concatBuffParams) [if not already present] 214, Generating implicit firstprivate(outAcc) 243, Generating present(pAcc[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))]) Generating copy(pout[:concatBuffParams->outWidth+((concatBuffParams->outPitch*(concatBuffParams->outHeight-1))+(concatBuffParams->outChPitch*(concatBuffParams->numOutChannels-1)))]) [if not already present] Generating implicit copy(net) [if not already present] Generating NVIDIA GPU code 243, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 245, /* blockIdx.x threadIdx.x collapsed */ 247, /* blockIdx.x threadIdx.x collapsed */ 243, Generating implicit copyin(concatBuffParams) [if not already present] 247, Generating implicit firstprivate(layerIdx,elementtype,outAcc,satLow,satHigh,roundBits) /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvaccKBgeS2A_hKqM.gpu -computecap 86 -ptx /tmp/nvaccKBgeSn-JhP9g.ptx -o /tmp/nvacceBgem74LKXQK.bin -ftz -cuda12020 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_concat.c -sm 86 /tmp/nvacceBgem74LKXQK.bin -compute 86 /tmp/nvaccKBgeSn-JhP9g.ptx -o /tmp/nvaccKBgeSuahL-aG.fat NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++ZygezJ6dl7dc.ll -S -o /tmp/nvc++lygeH_UOXbJH.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++lygeH_UOXbJH.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_concat.obj Unlinking /tmp/nvc++dygejyCDyszz.il Unlinking /tmp/nvc++BygerzFw5Asu.s Unlinking /tmp/nvc++ZygezJ6dl7dc.ll Unlinking /tmp/nvc++lygeH_UOXbJH.llvm compiling src/tidl_const.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_const.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++bLged4lQmIW7.il src/tidl_const.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_const.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_const.c -il /tmp/nvc++bLged4lQmIW7.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++bLged4lQmIW7.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_const.obj' -asm /tmp/nvc++rLgeZ7bQAYw_.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++rLgeZ7bQAYw_.ll -S -o /tmp/nvc++zLgelAXMVKMt.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++zLgelAXMVKMt.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_const.obj Unlinking /tmp/nvc++bLged4lQmIW7.il Unlinking /tmp/nvc++jLgeB4kaDOZd.s Unlinking /tmp/nvc++rLgeZ7bQAYw_.ll Unlinking /tmp/nvc++zLgelAXMVKMt.llvm compiling src/tidl_conv2d_base.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_conv2d_base.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++gTgesODz164r.il src/tidl_conv2d_base.c "src/tidl_conv2d_base.c", line 94: warning: incompatible redefinition of macro "_POSIX_C_SOURCE" (declared at line 290 of "/usr/include/features.h") [bad_macro_redef] #define _POSIX_C_SOURCE 200112L ^ Remark: individual warnings can be suppressed with "--diag_suppress " /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_conv2d_base.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_conv2d_base.c -il /tmp/nvc++gTgesODz164r.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++gTgesODz164r.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_conv2d_base.obj' -asm /tmp/nvc++MTgeY_F0xL-i.ll long TIDL_openaccShiftRight(long, int): 78, include "tidl_alg_int.h" 1432, Generating acc routine seq Generating NVIDIA GPU code long TIDL_openaccShiftRightImpl(long, int): 78, include "tidl_alg_int.h" 1426, Generating acc routine seq Generating NVIDIA GPU code void TIDL_refConv2d(float*, float*, float*, float*, float*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2dKernelFast<1, float, float, float, float>(float*, float*, float*, float*, float*, float*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernelFast<3, float, float, float, float>(float*, float*, float*, float*, float*, float*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernel(float const*, float const*, float const*, float*, float*, float*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 335, /* blockIdx.x threadIdx.x collapsed */ 337, /* blockIdx.x threadIdx.x collapsed */ 339, /* blockIdx.x threadIdx.x collapsed */ 341, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 355, #pragma acc loop seq 360, #pragma acc loop seq 363, #pragma acc loop seq 341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) float TIDL_findMinMaxForChQuant(float*, int, int, int, int, int, int, int, int, int, int, int, float*, float, float*, float*): 178, Generating copyin(perChannelWeightScalePtr[:numGroups]) [if not already present] Generating implicit copyin(min) [if not already present] Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outBatchPitch*(numBatches-1))+(outChPitch*(numOutChannels*(numGroups-1)))))+1]) Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numGroups,numBatches,height) Generating NVIDIA GPU code 178, #pragma acc loop seq collapse(2) 180, collapsed */ 183, #pragma acc loop seq 186, #pragma acc loop seq 189, #pragma acc loop seq 178, Generating implicit copyin(max) [if not already present] 180, Complex loop carried dependence of max-> prevents parallelization Loop carried scalar dependence for absMax at line 242 Complex loop carried dependence of min-> prevents parallelization Generating implicit firstprivate(absMax,maxChIdx) Complex loop carried dependence of max-> prevents parallelization 189, Generating implicit firstprivate(inDataVal,outBatchPitch,outImPitch,tensorScale,outdataOffset,outChPitch,inDataFloat,accScale) 205, Accelerator restriction: induction variable live-out from loop: maxChIdx 210, Accelerator restriction: induction variable live-out from loop: maxChIdx void TIDL_refConv2d(signed char*, signed char*, int*, signed char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2dKernelFast<1, signed char, signed char, int, int>(signed char*, signed char*, int*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernelFast<3, signed char, signed char, int, int>(signed char*, signed char*, int*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernel(signed char const*, signed char const*, int const*, int*, int*, int*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 335, /* blockIdx.x threadIdx.x collapsed */ 337, /* blockIdx.x threadIdx.x collapsed */ 339, /* blockIdx.x threadIdx.x collapsed */ 341, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 355, #pragma acc loop seq 360, #pragma acc loop seq 363, #pragma acc loop seq 341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) float TIDL_findMinMaxForChQuant(int*, int, int, int, int, int, int, int, int, int, int, int, float*, float, float*, float*): 178, Generating copyin(perChannelWeightScalePtr[:numGroups]) [if not already present] Generating implicit copyin(min) [if not already present] Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outBatchPitch*(numBatches-1))+(outChPitch*(numOutChannels*(numGroups-1)))))+1]) Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numGroups,numBatches,height) Generating NVIDIA GPU code 178, #pragma acc loop seq collapse(2) 180, collapsed */ 183, #pragma acc loop seq 186, #pragma acc loop seq 189, #pragma acc loop seq 178, Generating implicit copyin(max) [if not already present] 180, Complex loop carried dependence of max-> prevents parallelization Loop carried scalar dependence for absMax at line 242 Complex loop carried dependence of min-> prevents parallelization Generating implicit firstprivate(absMax,maxChIdx) Complex loop carried dependence of max-> prevents parallelization 189, Generating implicit firstprivate(inDataVal,outBatchPitch,outImPitch,tensorScale,outdataOffset,outChPitch,inDataFloat,accScale) 205, Accelerator restriction: induction variable live-out from loop: maxChIdx 210, Accelerator restriction: induction variable live-out from loop: maxChIdx void TIDL_refConv2d(signed char*, signed char*, int*, unsigned char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(signed char*, signed char*, int*, short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(signed char*, signed char*, int*, unsigned short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(unsigned char*, signed char*, int*, signed char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2dKernelFast<1, unsigned char, signed char, int, int>(unsigned char*, signed char*, int*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernelFast<3, unsigned char, signed char, int, int>(unsigned char*, signed char*, int*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernel(unsigned char const*, signed char const*, int const*, int*, int*, int*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 335, /* blockIdx.x threadIdx.x collapsed */ 337, /* blockIdx.x threadIdx.x collapsed */ 339, /* blockIdx.x threadIdx.x collapsed */ 341, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 355, #pragma acc loop seq 360, #pragma acc loop seq 363, #pragma acc loop seq 341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2d(unsigned char*, signed char*, int*, unsigned char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(unsigned char*, signed char*, int*, short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(unsigned char*, signed char*, int*, unsigned short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(short*, signed char*, int*, signed char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2dKernelFast<1, short, signed char, int, int>(short*, signed char*, int*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernelFast<3, short, signed char, int, int>(short*, signed char*, int*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernel(short const*, signed char const*, int const*, int*, int*, int*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 335, /* blockIdx.x threadIdx.x collapsed */ 337, /* blockIdx.x threadIdx.x collapsed */ 339, /* blockIdx.x threadIdx.x collapsed */ 341, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 355, #pragma acc loop seq 360, #pragma acc loop seq 363, #pragma acc loop seq 341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2d(short*, signed char*, int*, unsigned char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(short*, signed char*, int*, short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(short*, signed char*, int*, unsigned short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(unsigned short*, signed char*, int*, signed char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2dKernelFast<1, unsigned short, signed char, int, int>(unsigned short*, signed char*, int*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernelFast<3, unsigned short, signed char, int, int>(unsigned short*, signed char*, int*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernel(unsigned short const*, signed char const*, int const*, int*, int*, int*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 335, /* blockIdx.x threadIdx.x collapsed */ 337, /* blockIdx.x threadIdx.x collapsed */ 339, /* blockIdx.x threadIdx.x collapsed */ 341, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 355, #pragma acc loop seq 360, #pragma acc loop seq 363, #pragma acc loop seq 341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2d(unsigned short*, signed char*, int*, unsigned char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(unsigned short*, signed char*, int*, short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(unsigned short*, signed char*, int*, unsigned short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(signed char*, signed char*, short*, signed char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2dKernelFast<1, signed char, signed char, short, int>(signed char*, signed char*, short*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernelFast<3, signed char, signed char, short, int>(signed char*, signed char*, short*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernel(signed char const*, signed char const*, short const*, int*, int*, int*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 335, /* blockIdx.x threadIdx.x collapsed */ 337, /* blockIdx.x threadIdx.x collapsed */ 339, /* blockIdx.x threadIdx.x collapsed */ 341, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 355, #pragma acc loop seq 360, #pragma acc loop seq 363, #pragma acc loop seq 341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2d(signed char*, signed char*, short*, unsigned char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(signed char*, signed char*, short*, short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(signed char*, signed char*, short*, unsigned short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(unsigned char*, signed char*, short*, signed char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2dKernelFast<1, unsigned char, signed char, short, int>(unsigned char*, signed char*, short*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernelFast<3, unsigned char, signed char, short, int>(unsigned char*, signed char*, short*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernel(unsigned char const*, signed char const*, short const*, int*, int*, int*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 335, /* blockIdx.x threadIdx.x collapsed */ 337, /* blockIdx.x threadIdx.x collapsed */ 339, /* blockIdx.x threadIdx.x collapsed */ 341, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 355, #pragma acc loop seq 360, #pragma acc loop seq 363, #pragma acc loop seq 341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2d(unsigned char*, signed char*, short*, unsigned char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(unsigned char*, signed char*, short*, short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(unsigned char*, signed char*, short*, unsigned short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(short*, signed char*, short*, signed char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2dKernelFast<1, short, signed char, short, int>(short*, signed char*, short*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernelFast<3, short, signed char, short, int>(short*, signed char*, short*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernel(short const*, signed char const*, short const*, int*, int*, int*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 335, /* blockIdx.x threadIdx.x collapsed */ 337, /* blockIdx.x threadIdx.x collapsed */ 339, /* blockIdx.x threadIdx.x collapsed */ 341, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 355, #pragma acc loop seq 360, #pragma acc loop seq 363, #pragma acc loop seq 341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2d(short*, signed char*, short*, unsigned char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(short*, signed char*, short*, short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(short*, signed char*, short*, unsigned short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(unsigned short*, signed char*, short*, signed char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2dKernelFast<1, unsigned short, signed char, short, int>(unsigned short*, signed char*, short*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernelFast<3, unsigned short, signed char, short, int>(unsigned short*, signed char*, short*, int*, int*, int*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernel(unsigned short const*, signed char const*, short const*, int*, int*, int*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 335, /* blockIdx.x threadIdx.x collapsed */ 337, /* blockIdx.x threadIdx.x collapsed */ 339, /* blockIdx.x threadIdx.x collapsed */ 341, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 355, #pragma acc loop seq 360, #pragma acc loop seq 363, #pragma acc loop seq 341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2d(unsigned short*, signed char*, short*, unsigned char*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(unsigned short*, signed char*, short*, short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(unsigned short*, signed char*, short*, unsigned short*, int*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(signed char*, short*, long*, signed char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2dKernelFast<1, signed char, short, long, long>(signed char*, short*, long*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernelFast<3, signed char, short, long, long>(signed char*, short*, long*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernel(signed char const*, short const*, long const*, long*, long*, long*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 335, /* blockIdx.x threadIdx.x collapsed */ 337, /* blockIdx.x threadIdx.x collapsed */ 339, /* blockIdx.x threadIdx.x collapsed */ 341, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 355, #pragma acc loop seq 360, #pragma acc loop seq 363, #pragma acc loop seq 341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) float TIDL_findMinMaxForChQuant(long*, int, int, int, int, int, int, int, int, int, int, int, float*, float, float*, float*): 178, Generating copyin(perChannelWeightScalePtr[:numGroups]) [if not already present] Generating implicit copyin(min) [if not already present] Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outBatchPitch*(numBatches-1))+(outChPitch*(numOutChannels*(numGroups-1)))))+1]) Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numGroups,numBatches,height) Generating NVIDIA GPU code 178, #pragma acc loop seq collapse(2) 180, collapsed */ 183, #pragma acc loop seq 186, #pragma acc loop seq 189, #pragma acc loop seq 178, Generating implicit copyin(max) [if not already present] 180, Complex loop carried dependence of max-> prevents parallelization Loop carried scalar dependence for absMax at line 242 Complex loop carried dependence of min-> prevents parallelization Generating implicit firstprivate(absMax,maxChIdx) Complex loop carried dependence of max-> prevents parallelization 189, Generating implicit firstprivate(inDataVal,outBatchPitch,outImPitch,tensorScale,outdataOffset,outChPitch,inDataFloat,accScale) 205, Accelerator restriction: induction variable live-out from loop: maxChIdx 210, Accelerator restriction: induction variable live-out from loop: maxChIdx void TIDL_refConv2d(signed char*, short*, long*, unsigned char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(signed char*, short*, long*, short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(signed char*, short*, long*, unsigned short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(unsigned char*, short*, long*, signed char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2dKernelFast<1, unsigned char, short, long, long>(unsigned char*, short*, long*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernelFast<3, unsigned char, short, long, long>(unsigned char*, short*, long*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernel(unsigned char const*, short const*, long const*, long*, long*, long*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 335, /* blockIdx.x threadIdx.x collapsed */ 337, /* blockIdx.x threadIdx.x collapsed */ 339, /* blockIdx.x threadIdx.x collapsed */ 341, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 355, #pragma acc loop seq 360, #pragma acc loop seq 363, #pragma acc loop seq 341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2d(unsigned char*, short*, long*, unsigned char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(unsigned char*, short*, long*, short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(unsigned char*, short*, long*, unsigned short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(short*, short*, long*, signed char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2dKernelFast<1, short, short, long, long>(short*, short*, long*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernelFast<3, short, short, long, long>(short*, short*, long*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernel(short const*, short const*, long const*, long*, long*, long*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 335, /* blockIdx.x threadIdx.x collapsed */ 337, /* blockIdx.x threadIdx.x collapsed */ 339, /* blockIdx.x threadIdx.x collapsed */ 341, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 355, #pragma acc loop seq 360, #pragma acc loop seq 363, #pragma acc loop seq 341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2d(short*, short*, long*, unsigned char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(short*, short*, long*, short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(short*, short*, long*, unsigned short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(unsigned short*, short*, long*, signed char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2dKernelFast<1, unsigned short, short, long, long>(unsigned short*, short*, long*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernelFast<3, unsigned short, short, long, long>(unsigned short*, short*, long*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernel(unsigned short const*, short const*, long const*, long*, long*, long*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 335, /* blockIdx.x threadIdx.x collapsed */ 337, /* blockIdx.x threadIdx.x collapsed */ 339, /* blockIdx.x threadIdx.x collapsed */ 341, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 355, #pragma acc loop seq 360, #pragma acc loop seq 363, #pragma acc loop seq 341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2d(unsigned short*, short*, long*, unsigned char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(unsigned short*, short*, long*, short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(unsigned short*, short*, long*, unsigned short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(signed char*, short*, int*, signed char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2dKernelFast<1, signed char, short, int, long>(signed char*, short*, int*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernelFast<3, signed char, short, int, long>(signed char*, short*, int*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernel(signed char const*, short const*, int const*, long*, long*, long*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 335, /* blockIdx.x threadIdx.x collapsed */ 337, /* blockIdx.x threadIdx.x collapsed */ 339, /* blockIdx.x threadIdx.x collapsed */ 341, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 355, #pragma acc loop seq 360, #pragma acc loop seq 363, #pragma acc loop seq 341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2d(signed char*, short*, int*, unsigned char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(signed char*, short*, int*, short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(signed char*, short*, int*, unsigned short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(unsigned char*, short*, int*, signed char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2dKernelFast<1, unsigned char, short, int, long>(unsigned char*, short*, int*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernelFast<3, unsigned char, short, int, long>(unsigned char*, short*, int*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernel(unsigned char const*, short const*, int const*, long*, long*, long*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 335, /* blockIdx.x threadIdx.x collapsed */ 337, /* blockIdx.x threadIdx.x collapsed */ 339, /* blockIdx.x threadIdx.x collapsed */ 341, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 355, #pragma acc loop seq 360, #pragma acc loop seq 363, #pragma acc loop seq 341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2d(unsigned char*, short*, int*, unsigned char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(unsigned char*, short*, int*, short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(unsigned char*, short*, int*, unsigned short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(short*, short*, int*, signed char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2dKernelFast<1, short, short, int, long>(short*, short*, int*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernelFast<3, short, short, int, long>(short*, short*, int*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernel(short const*, short const*, int const*, long*, long*, long*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 335, /* blockIdx.x threadIdx.x collapsed */ 337, /* blockIdx.x threadIdx.x collapsed */ 339, /* blockIdx.x threadIdx.x collapsed */ 341, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 355, #pragma acc loop seq 360, #pragma acc loop seq 363, #pragma acc loop seq 341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2d(short*, short*, int*, unsigned char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(short*, short*, int*, short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(short*, short*, int*, unsigned short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(unsigned short*, short*, int*, signed char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2dKernelFast<1, unsigned short, short, int, long>(unsigned short*, short*, int*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight))))+1],pInChannel[:((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernelFast<3, unsigned short, short, int, long>(unsigned short*, short*, int*, long*, long*, long*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 473, Generating present(pCoeffs[:(coeffsWidth*2)+(((numInChannels-1)*(coeffsWidth*coeffsHeight))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*((numInChannels*(numGroups-1))*coeffsHeight)))))+3],pInChannel[:(dilationWidth*2)+((dilationHeight*(inImPitch*2))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((((height%strideHeight)+(height-strideHeight))*outImPitch)+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(numGroups,strideHeight,topPad,width,pInChannel,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 496, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 498, /* blockIdx.x threadIdx.x collapsed */ 500, /* blockIdx.x threadIdx.x collapsed */ 502, /* blockIdx.x threadIdx.x collapsed */ 504, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 519, #pragma acc loop seq 524, #pragma acc loop seq 527, #pragma acc loop seq 504, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 519, Generating implicit firstprivate(coeffsHeight,coeffsWidth) 527, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2dKernel(unsigned short const*, short const*, int const*, long*, long*, long*, int, int, int, int, int, int, unsigned int, unsigned int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 310, Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*(inImPitch*(coeffsHeight-1)))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)],accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+(((numOutChannels-1)*outChPitch)+(((numBatches-1)*outBatchPitch)+(((numGroups-1)*numOutChannels)*outChPitch))))+1]) Generating implicit firstprivate(coeffsHeight,numGroups,strideHeight,topPad,width,pInChannel,coeffsWidth,numInChannels,numBatches,leftPad,inWidth,isOTFpad,inHeight,strideWidth,inImPitch,height,numOutChannels) Generating NVIDIA GPU code 333, #pragma acc loop gang, vector(128) collapse(5) /* blockIdx.x threadIdx.x */ 335, /* blockIdx.x threadIdx.x collapsed */ 337, /* blockIdx.x threadIdx.x collapsed */ 339, /* blockIdx.x threadIdx.x collapsed */ 341, /* blockIdx.x threadIdx.x collapsed */ Generating reduction(min:_min) Generating reduction(max:_max) 355, #pragma acc loop seq 360, #pragma acc loop seq 363, #pragma acc loop seq 341, Generating implicit firstprivate(enableBias,inBatchPitch,inChPitch,outBatchPitch,outImPitch,outChPitch) 363, Generating implicit firstprivate(dilationHeight,startRowNumberInTensor,padVal,dilationWidth) void TIDL_refConv2d(unsigned short*, short*, int*, unsigned char*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(unsigned short*, short*, int*, short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) void TIDL_refConv2d(unsigned short*, short*, int*, unsigned short*, long*, TIDL_CreateParams const*, int, sTIDL_ConvParams_t*, tidlConv2dBuffParams_t*, sTIDL_AlgLayer_t*): 733, Generating copyin(pInChannel[:(dilationWidth*(coeffsWidth-1))+((dilationHeight*((coeffsHeight-1)*inImPitch))+(((width%strideWidth)+(width-strideWidth))+((inImPitch*((height%strideHeight)+(height-strideHeight)))+((inChPitch*(numInChannels-1))+((inBatchPitch*(numBatches-1))+(inChPitch*(numInChannels*(numGroups-1))))))))+1],pBias[:numOutChannels+((numGroups-1)*numOutChannels)]) [if not already present] Generating present(pCoeffs[:coeffsWidth+((coeffsWidth*(coeffsHeight-1))+((coeffsWidth*(coeffsHeight*(numInChannels-1)))+((coeffsWidth*(coeffsHeight*(numInChannels*(numOutChannels-1))))+(numOutChannels*(coeffsWidth*(coeffsHeight*(numInChannels*(numGroups-1))))))))],accPtr[:numBatches*outBatchPitch]) 906, Generating present(accPtr[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) Generating copy(pOutChanne[:(((width%strideWidth)+(width-strideWidth))/strideWidth)+((outImPitch*(((height%strideHeight)+(height-strideHeight))/strideHeight))+((outChPitch*(numOutChannels-1))+(outBatchPitch*(numBatches-1))))+1]) [if not already present] 911, Generating implicit firstprivate(strideHeight,strideWidth,width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 911, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 913, /* blockIdx.x threadIdx.x collapsed */ 915, /* blockIdx.x threadIdx.x collapsed */ 917, /* blockIdx.x threadIdx.x collapsed */ 911, Generating implicit copyin(mmav2_Scales[:numOutChannels],mmav2_Shifts[:numOutChannels],tidlLayer,roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 917, Generating implicit firstprivate(outAcc,outRoundBits,outImPitch,outChPitch,outBatchPitch,mmaPSATMin,tempAcc,outdataOffset,mmaPSATMax,mixedPrecision,enablePerChannelShift) 961, Generating implicit firstprivate(strideHeight,width,strideWidth,numOutChannels,numBatches,height) Generating NVIDIA GPU code 961, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 963, /* blockIdx.x threadIdx.x collapsed */ 965, /* blockIdx.x threadIdx.x collapsed */ 967, /* blockIdx.x threadIdx.x collapsed */ 961, Generating implicit copyin(roundBitsPtr[:numOutChannels],pTIDLNet) [if not already present] 967, Generating implicit firstprivate(outAcc,outRoundBits,outElementType,outBatchPitch,mixedPrecision,outChPitch,outImPitch,satLow,satHigh,outdataOffset,enablePerChannelShift) std::fabs(float): 78, include "tidl_alg_int.h" 88, include "mmalib_cnn.h" 26, include "MMALIB_CNN_convolve_col_smallNo_ixX_ixX_oxX.h" 25, include "MMALIB_types.h" 35, include "c7x.h" 67, include "vector.h" 51, include "c7x_c_funcs.h" 41, include "cmath" 15, include "cmath" 242, Generating implicit acc routine seq Generating acc routine seq Generating NVIDIA GPU code /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvacceWgemkX4NEtn.gpu -computecap 86 -ptx /tmp/nvacceWgem5n6NRPb.ptx -o /tmp/nvaccKWgeSc99ei3K.bin -ftz -cuda12020 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_conv2d_base.c -sm 86 /tmp/nvaccKWgeSc99ei3K.bin -compute 86 /tmp/nvacceWgem5n6NRPb.ptx -o /tmp/nvacceWgemyLVfjWr.fat NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++MTgeY_F0xL-i.ll -S -o /tmp/nvc++wTgec1FJu987.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++wTgec1FJu987.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_conv2d_base.obj Unlinking /tmp/nvc++gTgesODz164r.il Unlinking /tmp/nvc++2TgeIQkaSM-P.s Unlinking /tmp/nvc++MTgeY_F0xL-i.ll Unlinking /tmp/nvc++wTgec1FJu987.llvm compiling src/tidl_crop.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_crop.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++F6geDTGXueWP.il src/tidl_crop.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_crop.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_crop.c -il /tmp/nvc++F6geDTGXueWP.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++F6geDTGXueWP.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_crop.obj' -asm /tmp/nvc++p6geTHz5hMEf.ll void TIDL_refCrop(float const*, float*, int, int, int, int, int, int, int, int, int, int, int, int): 118, Generating copyin(pIn[:outWidth+((inLinePitch*(outHeight-1))+((inChPitch*(numChs-1))+((inROIPitch*(numROIs-1))+inPtrOffset)))]) [if not already present] Generating copy(pOut[:outWidth+(((outHeight-1)*outLinePitch)+(((numChs-1)*outChPitch)+(((numROIs-1)*outROIPitch)+outPtrOffset)))]) [if not already present] Generating implicit firstprivate(outHeight,numROIs,numChs,outWidth) Generating NVIDIA GPU code 118, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 120, /* blockIdx.x threadIdx.x collapsed */ 122, /* blockIdx.x threadIdx.x collapsed */ 124, /* blockIdx.x threadIdx.x collapsed */ 124, Generating implicit firstprivate(inChPitch,inLinePitch,inPtrOffset,outChPitch,inROIPitch,outLinePitch,outROIPitch,outPtrOffset) void TIDL_refCrop(unsigned char const*, unsigned char*, int, int, int, int, int, int, int, int, int, int, int, int): 118, Generating copy(pOut[:outWidth+(((outHeight-1)*outLinePitch)+(((numChs-1)*outChPitch)+(((numROIs-1)*outROIPitch)+outPtrOffset)))]) [if not already present] Generating copyin(pIn[:outWidth+((inLinePitch*(outHeight-1))+((inChPitch*(numChs-1))+((inROIPitch*(numROIs-1))+inPtrOffset)))]) [if not already present] Generating implicit firstprivate(outWidth,outHeight,numROIs,numChs) Generating NVIDIA GPU code 118, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 120, /* blockIdx.x threadIdx.x collapsed */ 122, /* blockIdx.x threadIdx.x collapsed */ 124, /* blockIdx.x threadIdx.x collapsed */ 124, Generating implicit firstprivate(inChPitch,inLinePitch,inPtrOffset,outChPitch,inROIPitch,outLinePitch,outROIPitch,outPtrOffset) void TIDL_refCrop(unsigned short const*, unsigned short*, int, int, int, int, int, int, int, int, int, int, int, int): 118, Generating copyin(pIn[:outWidth+((inLinePitch*(outHeight-1))+((inChPitch*(numChs-1))+((inROIPitch*(numROIs-1))+inPtrOffset)))]) [if not already present] Generating copy(pOut[:outWidth+(((outHeight-1)*outLinePitch)+(((numChs-1)*outChPitch)+(((numROIs-1)*outROIPitch)+outPtrOffset)))]) [if not already present] Generating implicit firstprivate(outHeight,numROIs,numChs,outWidth) Generating NVIDIA GPU code 118, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 120, /* blockIdx.x threadIdx.x collapsed */ 122, /* blockIdx.x threadIdx.x collapsed */ 124, /* blockIdx.x threadIdx.x collapsed */ 124, Generating implicit firstprivate(inChPitch,inLinePitch,inPtrOffset,outChPitch,inROIPitch,outLinePitch,outROIPitch,outPtrOffset) /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvacc29geIvlkLac3.gpu -computecap 86 -ptx /tmp/nvaccw9gecqwofWIR.ptx -o /tmp/nvaccg9gesqie2W4q.bin -ftz -cuda12020 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_crop.c -sm 86 /tmp/nvaccg9gesqie2W4q.bin -compute 86 /tmp/nvaccw9gecqwofWIR.ptx -o /tmp/nvacc29geIskah2_O.fat NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++p6geTHz5hMEf.ll -S -o /tmp/nvc++h6gevVfOTHKV.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++h6gevVfOTHKV.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_crop.obj Unlinking /tmp/nvc++F6geDTGXueWP.il Unlinking /tmp/nvc++x6gefzgjfiHU.s Unlinking /tmp/nvc++p6geTHz5hMEf.ll Unlinking /tmp/nvc++h6gevVfOTHKV.llvm compiling src/tidl_custom_int.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_custom_int.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++GhheGPv7J_Wj.il src/tidl_custom_int.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_custom_int.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_custom_int.c -il /tmp/nvc++GhheGPv7J_Wj.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++GhheGPv7J_Wj.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_custom_int.obj' -asm /tmp/nvc++GhheGHzMH98i.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++GhheGHzMH98i.ll -S -o /tmp/nvc++GhheGTLPYQcx.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++GhheGTLPYQcx.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_custom_int.obj Unlinking /tmp/nvc++GhheGPv7J_Wj.il Unlinking /tmp/nvc++GhheG1cMQ54j.s Unlinking /tmp/nvc++GhheGHzMH98i.ll Unlinking /tmp/nvc++GhheGTLPYQcx.llvm compiling src/tidl_dataConvert.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_dataConvert.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++HpheJvgdQqMM.il src/tidl_dataConvert.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_dataConvert.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_dataConvert.c -il /tmp/nvc++HpheJvgdQqMM.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++HpheJvgdQqMM.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_dataConvert.obj' -asm /tmp/nvc++XphetrbS4KwC.ll void TIDL_TensorMinMaxinFloat(float const*, sTIDL_DataParams_t const*, float, float, float*, float*): 442, Generating implicit copyin(min) [if not already present] Generating copyin(ptr[:dataPrms->dimValues+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+padOffset)))))]) [if not already present] Generating NVIDIA GPU code 442, #pragma acc loop gang, vector(128) collapse(6) /* blockIdx.x threadIdx.x */ 444, /* blockIdx.x threadIdx.x collapsed */ 446, /* blockIdx.x threadIdx.x collapsed */ 448, /* blockIdx.x threadIdx.x collapsed */ 450, /* blockIdx.x threadIdx.x collapsed */ 452, /* blockIdx.x threadIdx.x collapsed */ 442, Generating implicit copyin(dataPrms,max) [if not already present] 452, Generating implicit firstprivate(in_zf,val,padOffset,in_scale) void TIDL_TensorMinMaxinFloat(signed char const*, sTIDL_DataParams_t const*, float, float, float*, float*): 442, Generating implicit copyin(min) [if not already present] Generating copyin(ptr[:dataPrms->dimValues+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+padOffset)))))]) [if not already present] Generating NVIDIA GPU code 442, #pragma acc loop gang, vector(128) collapse(6) /* blockIdx.x threadIdx.x */ 444, /* blockIdx.x threadIdx.x collapsed */ 446, /* blockIdx.x threadIdx.x collapsed */ 448, /* blockIdx.x threadIdx.x collapsed */ 450, /* blockIdx.x threadIdx.x collapsed */ 452, /* blockIdx.x threadIdx.x collapsed */ 442, Generating implicit copyin(dataPrms,max) [if not already present] 452, Generating implicit firstprivate(in_zf,val,padOffset,in_scale) void TIDL_TensorMinMaxinFloat(unsigned char const*, sTIDL_DataParams_t const*, float, float, float*, float*): 442, Generating implicit copyin(min) [if not already present] Generating copyin(ptr[:dataPrms->dimValues+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+padOffset)))))]) [if not already present] Generating NVIDIA GPU code 442, #pragma acc loop gang, vector(128) collapse(6) /* blockIdx.x threadIdx.x */ 444, /* blockIdx.x threadIdx.x collapsed */ 446, /* blockIdx.x threadIdx.x collapsed */ 448, /* blockIdx.x threadIdx.x collapsed */ 450, /* blockIdx.x threadIdx.x collapsed */ 452, /* blockIdx.x threadIdx.x collapsed */ 442, Generating implicit copyin(dataPrms,max) [if not already present] 452, Generating implicit firstprivate(in_zf,val,padOffset,in_scale) void TIDL_TensorMinMaxinFloat(short const*, sTIDL_DataParams_t const*, float, float, float*, float*): 442, Generating implicit copyin(min) [if not already present] Generating copyin(ptr[:dataPrms->dimValues+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+padOffset)))))]) [if not already present] Generating NVIDIA GPU code 442, #pragma acc loop gang, vector(128) collapse(6) /* blockIdx.x threadIdx.x */ 444, /* blockIdx.x threadIdx.x collapsed */ 446, /* blockIdx.x threadIdx.x collapsed */ 448, /* blockIdx.x threadIdx.x collapsed */ 450, /* blockIdx.x threadIdx.x collapsed */ 452, /* blockIdx.x threadIdx.x collapsed */ 442, Generating implicit copyin(dataPrms,max) [if not already present] 452, Generating implicit firstprivate(in_zf,val,padOffset,in_scale) void TIDL_TensorMinMaxinFloat(unsigned short const*, sTIDL_DataParams_t const*, float, float, float*, float*): 442, Generating implicit copyin(min) [if not already present] Generating copyin(ptr[:dataPrms->dimValues+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+padOffset)))))]) [if not already present] Generating NVIDIA GPU code 442, #pragma acc loop gang, vector(128) collapse(6) /* blockIdx.x threadIdx.x */ 444, /* blockIdx.x threadIdx.x collapsed */ 446, /* blockIdx.x threadIdx.x collapsed */ 448, /* blockIdx.x threadIdx.x collapsed */ 450, /* blockIdx.x threadIdx.x collapsed */ 452, /* blockIdx.x threadIdx.x collapsed */ 442, Generating implicit copyin(dataPrms,max) [if not already present] 452, Generating implicit firstprivate(in_zf,val,padOffset,in_scale) void TIDL_TensorMinMaxinFloat(int const*, sTIDL_DataParams_t const*, float, float, float*, float*): 442, Generating implicit copyin(min) [if not already present] Generating copyin(ptr[:dataPrms->dimValues+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+padOffset)))))]) [if not already present] Generating NVIDIA GPU code 442, #pragma acc loop gang, vector(128) collapse(6) /* blockIdx.x threadIdx.x */ 444, /* blockIdx.x threadIdx.x collapsed */ 446, /* blockIdx.x threadIdx.x collapsed */ 448, /* blockIdx.x threadIdx.x collapsed */ 450, /* blockIdx.x threadIdx.x collapsed */ 452, /* blockIdx.x threadIdx.x collapsed */ 442, Generating implicit copyin(dataPrms,max) [if not already present] 452, Generating implicit firstprivate(in_zf,val,padOffset,in_scale) void TIDL_TensorMinMaxinFloat(unsigned int const*, sTIDL_DataParams_t const*, float, float, float*, float*): 442, Generating implicit copyin(min) [if not already present] Generating copyin(ptr[:dataPrms->dimValues+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+((dataPrms->pitch*(dataPrms->dimValues-1))+padOffset)))))]) [if not already present] Generating NVIDIA GPU code 442, #pragma acc loop gang, vector(128) collapse(6) /* blockIdx.x threadIdx.x */ 444, /* blockIdx.x threadIdx.x collapsed */ 446, /* blockIdx.x threadIdx.x collapsed */ 448, /* blockIdx.x threadIdx.x collapsed */ 450, /* blockIdx.x threadIdx.x collapsed */ 452, /* blockIdx.x threadIdx.x collapsed */ 442, Generating implicit copyin(dataPrms,max) [if not already present] 452, Generating implicit firstprivate(in_zf,val,padOffset,in_scale) /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvaccjsheBr6js0E0.gpu -computecap 86 -ptx /tmp/nvacczsheldW7IjaM.ptx -o /tmp/nvaccHsheJCqkPvHN.bin -ftz -cuda12020 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_dataConvert.c -sm 86 /tmp/nvaccHsheJCqkPvHN.bin -compute 86 /tmp/nvacczsheldW7IjaM.ptx -o /tmp/nvaccPshe7a0qqdmb.fat NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++XphetrbS4KwC.ll -S -o /tmp/nvc++5pheRBRitJwq.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++5pheRBRitJwq.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_dataConvert.obj Unlinking /tmp/nvc++HpheJvgdQqMM.il Unlinking /tmp/nvc++Pphe7bzW7sER.s Unlinking /tmp/nvc++XphetrbS4KwC.ll Unlinking /tmp/nvc++5pheRBRitJwq.llvm compiling src/tidl_deconv2d.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_deconv2d.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++RChebMdJaQgj.il src/tidl_deconv2d.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_deconv2d.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_deconv2d.c -il /tmp/nvc++RChebMdJaQgj.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++RChebMdJaQgj.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_deconv2d.obj' -asm /tmp/nvc++BCherhJYZkLl.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++BCherhJYZkLl.ll -S -o /tmp/nvc++ZChezbCaz87x.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++ZChezbCaz87x.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_deconv2d.obj Unlinking /tmp/nvc++RChebMdJaQgj.il Unlinking /tmp/nvc++dChejwa9rDnz.s Unlinking /tmp/nvc++BCherhJYZkLl.ll Unlinking /tmp/nvc++ZChezbCaz87x.llvm compiling src/tidl_depthToSpace.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_depthToSpace.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++XKhetojn52pn.il src/tidl_depthToSpace.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_depthToSpace.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_depthToSpace.c -il /tmp/nvc++XKhetojn52pn.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++XKhetojn52pn.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_depthToSpace.obj' -asm /tmp/nvc++bKhedgRPib1C.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++bKhedgRPib1C.ll -S -o /tmp/nvc++jKheB2J7cFCy.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++jKheB2J7cFCy.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_depthToSpace.obj Unlinking /tmp/nvc++XKhetojn52pn.il Unlinking /tmp/nvc++5KheRaZskrnc.s Unlinking /tmp/nvc++bKhedgRPib1C.ll Unlinking /tmp/nvc++jKheB2J7cFCy.llvm compiling src/tidl_detectionOutput.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_detectionOutput.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++RShebT00TLNV.il src/tidl_detectionOutput.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_detectionOutput.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_detectionOutput.c -il /tmp/nvc++RShebT00TLNV.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++RShebT00TLNV.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_detectionOutput.obj' -asm /tmp/nvc++BSher_SrbdFy.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++BSher_SrbdFy.ll -S -o /tmp/nvc++ZShezW04ltNX.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++ZShezW04ltNX.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_detectionOutput.obj Unlinking /tmp/nvc++RShebT00TLNV.il Unlinking /tmp/nvc++dShejv68Kg3j.s Unlinking /tmp/nvc++BSher_SrbdFy.ll Unlinking /tmp/nvc++ZShezW04ltNX.llvm compiling src/tidl_detectionOutput_score.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_detectionOutput_score.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++P0he7a5SFwdx.il src/tidl_detectionOutput_score.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_detectionOutput_score.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_detectionOutput_score.c -il /tmp/nvc++P0he7a5SFwdx.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++P0he7a5SFwdx.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_detectionOutput_score.obj' -asm /tmp/nvc++50heRkkTZlvW.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++50heRkkTZlvW.ll -S -o /tmp/nvc++b0hedG7wju5h.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++b0hedG7wju5h.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_detectionOutput_score.obj Unlinking /tmp/nvc++P0he7a5SFwdx.il Unlinking /tmp/nvc++X0hetiHSgZii.s Unlinking /tmp/nvc++50heRkkTZlvW.ll Unlinking /tmp/nvc++b0hedG7wju5h.llvm compiling src/tidl_device_functions.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_device_functions.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++M8heYWtG4_cV.il src/tidl_device_functions.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_device_functions.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_device_functions.c -il /tmp/nvc++M8heYWtG4_cV.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++M8heYWtG4_cV.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_device_functions.obj' -asm /tmp/nvc++g8hesVa-eU7Z.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++g8hesVa-eU7Z.ll -S -o /tmp/nvc++28heIPgYMWQS.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++28heIPgYMWQS.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_device_functions.obj Unlinking /tmp/nvc++M8heYWtG4_cV.il Unlinking /tmp/nvc++w8hec3XT5gS7.s Unlinking /tmp/nvc++g8hesVa-eU7Z.ll Unlinking /tmp/nvc++28heIPgYMWQS.llvm compiling src/tidl_eltWise.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_eltWise.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++7eieXIWqqMaA.il src/tidl_eltWise.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_eltWise.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_eltWise.c -il /tmp/nvc++7eieXIWqqMaA.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++7eieXIWqqMaA.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_eltWise.obj' -asm /tmp/nvc++ReiebaC4dffT.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++ReiebaC4dffT.ll -S -o /tmp/nvc++deiejaC9jUe_.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++deiejaC9jUe_.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_eltWise.obj Unlinking /tmp/nvc++7eieXIWqqMaA.il Unlinking /tmp/nvc++teie54uGHVVR.s Unlinking /tmp/nvc++ReiebaC4dffT.ll Unlinking /tmp/nvc++deiejaC9jUe_.llvm compiling src/tidl_flatten.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_flatten.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++8mie0U19FG9L.il src/tidl_flatten.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_flatten.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_flatten.c -il /tmp/nvc++8mie0U19FG9L.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++8mie0U19FG9L.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_flatten.obj' -asm /tmp/nvc++8mie0panCmCN.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++8mie0panCmCN.ll -S -o /tmp/nvc++CmieuocJUMAn.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++CmieuocJUMAn.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_flatten.obj Unlinking /tmp/nvc++8mie0U19FG9L.il Unlinking /tmp/nvc++CmieuLJ62m2h.s Unlinking /tmp/nvc++8mie0panCmCN.ll Unlinking /tmp/nvc++CmieuocJUMAn.llvm compiling src/tidl_function_mapping.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_function_mapping.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++GuieGS2SohYm.il src/tidl_function_mapping.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_function_mapping.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_function_mapping.c -il /tmp/nvc++GuieGS2SohYm.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++GuieGS2SohYm.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_function_mapping.obj' -asm /tmp/nvc++GuieGuorsK85.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++GuieGuorsK85.ll -S -o /tmp/nvc++GuieG4alGlBR.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++GuieG4alGlBR.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_function_mapping.obj Unlinking /tmp/nvc++GuieGS2SohYm.il Unlinking /tmp/nvc++GuieGqSdFOsr.s Unlinking /tmp/nvc++GuieGuorsK85.ll Unlinking /tmp/nvc++GuieG4alGlBR.llvm compiling src/tidl_gatherLayer.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_gatherLayer.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++aCieaB0_dRUO.il src/tidl_gatherLayer.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_gatherLayer.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_gatherLayer.c -il /tmp/nvc++aCieaB0_dRUO.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++aCieaB0_dRUO.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_gatherLayer.obj' -asm /tmp/nvc++aCiealwGbqXv.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++aCiealwGbqXv.ll -S -o /tmp/nvc++aCieaJhdsZL9.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++aCieaJhdsZL9.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_gatherLayer.obj Unlinking /tmp/nvc++aCieaB0_dRUO.il Unlinking /tmp/nvc++aCieaZxvkO00.s Unlinking /tmp/nvc++aCiealwGbqXv.ll Unlinking /tmp/nvc++aCieaJhdsZL9.llvm compiling src/tidl_innerProduct.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_innerProduct.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++bKiedFzxj5P3.il src/tidl_innerProduct.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_innerProduct.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_innerProduct.c -il /tmp/nvc++bKiedFzxj5P3.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++bKiedFzxj5P3.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_innerProduct.obj' -asm /tmp/nvc++rKieZsq4yd1u.ll long TIDL_openaccShiftRight(long, int): 71, include "tidl_innerProduct.h" 73, include "tidl_alg_int.h" 1432, Generating acc routine seq Generating NVIDIA GPU code long TIDL_openaccShiftRightImpl(long, int): 71, include "tidl_innerProduct.h" 73, include "tidl_alg_int.h" 1426, Generating acc routine seq Generating NVIDIA GPU code /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvaccrNieZoyCAS4b.gpu -computecap 86 -ptx /tmp/nvaccHNieJUQBQlW7.ptx -o /tmp/nvaccPNie7IGTXOr3.bin -ftz -cuda12020 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_innerProduct.c -sm 86 /tmp/nvaccPNie7IGTXOr3.bin -compute 86 /tmp/nvaccHNieJUQBQlW7.ptx -o /tmp/nvaccXNiet6eYyV_6.fat NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++rKieZsq4yd1u.ll -S -o /tmp/nvc++zKielPYgY-sM.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++zKielPYgY-sM.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_innerProduct.obj Unlinking /tmp/nvc++bKiedFzxj5P3.il Unlinking /tmp/nvc++jKieBxf3A9vD.s Unlinking /tmp/nvc++rKieZsq4yd1u.ll Unlinking /tmp/nvc++zKielPYgY-sM.llvm compiling src/tidl_layerNorm.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_layerNorm.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++uXie83VR3NXb.il src/tidl_layerNorm.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_layerNorm.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_layerNorm.c -il /tmp/nvc++uXie83VR3NXb.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++uXie83VR3NXb.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_layerNorm.obj' -asm /tmp/nvc++uXie8YLR1FpY.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++uXie8YLR1FpY.ll -S -o /tmp/nvc++0XieCkuIg-9o.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++0XieCkuIg-9o.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_layerNorm.obj Unlinking /tmp/nvc++uXie83VR3NXb.il Unlinking /tmp/nvc++0XieC3UdEHYb.s Unlinking /tmp/nvc++uXie8YLR1FpY.ll Unlinking /tmp/nvc++0XieCkuIg-9o.llvm compiling src/tidl_odOutputReformat.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_odOutputReformat.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++j5ieBysi-ttD.il src/tidl_odOutputReformat.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_odOutputReformat.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_odOutputReformat.c -il /tmp/nvc++j5ieBysi-ttD.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++j5ieBysi-ttD.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_odOutputReformat.obj' -asm /tmp/nvc++z5iel6q4t-wj.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++z5iel6q4t-wj.ll -S -o /tmp/nvc++H5ieJl4kPyVB.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++H5ieJl4kPyVB.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_odOutputReformat.obj Unlinking /tmp/nvc++j5ieBysi-ttD.il Unlinking /tmp/nvc++r5ieZBfZMc0u.s Unlinking /tmp/nvc++z5iel6q4t-wj.ll Unlinking /tmp/nvc++H5ieJl4kPyVB.llvm compiling src/tidl_pad.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_pad.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++zbjelSHyIg6n.il src/tidl_pad.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_pad.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_pad.c -il /tmp/nvc++zbjelSHyIg6n.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++zbjelSHyIg6n.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_pad.obj' -asm /tmp/nvc++Pbje75ORX0fG.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++Pbje75ORX0fG.ll -S -o /tmp/nvc++XbjetjqbR5oz.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++XbjetjqbR5oz.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_pad.obj Unlinking /tmp/nvc++zbjelSHyIg6n.il Unlinking /tmp/nvc++HbjeJnpTJXYH.s Unlinking /tmp/nvc++Pbje75ORX0fG.ll Unlinking /tmp/nvc++XbjetjqbR5oz.llvm compiling src/tidl_pooling.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_pooling.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++Bjjer0-tWQnd.il src/tidl_pooling.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_pooling.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_pooling.c -il /tmp/nvc++Bjjer0-tWQnd.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++Bjjer0-tWQnd.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_pooling.obj' -asm /tmp/nvc++ljjeHjXEJrH3.ll void _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalMaxPooling(unsigned char*, int, int, int, int, int, int, unsigned char*, sTIDL_Layer_t const*, sTIDL_DataParams_t*, sTIDL_Network_t*, sTIDL_AlgLayer_t const*): 180, Generating copyout(outData[:((numOutChannels-1)*outChPitch)+((numBatches-1)*outBatchPitch)+1]) [if not already present] Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present] 185, Generating implicit firstprivate(width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 185, #pragma acc loop gang collapse(2) /* blockIdx.x */ 187, /* blockIdx.x collapsed */ 191, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ Generating reduction(max:maxValue) 193, /* threadIdx.x collapsed */ 187, Generating implicit firstprivate(initValue,outChPitch,outBatchPitch,maxValue) 191, Loop is parallelizable 193, Loop is parallelizable Generating implicit firstprivate(inChPitch,inBatchPitch,input,inPitch) void _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalMaxPooling(signed char*, int, int, int, int, int, int, signed char*, sTIDL_Layer_t const*, sTIDL_DataParams_t*, sTIDL_Network_t*, sTIDL_AlgLayer_t const*): 180, Generating copyout(outData[:((numOutChannels-1)*outChPitch)+((numBatches-1)*outBatchPitch)+1]) [if not already present] Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present] 185, Generating implicit firstprivate(width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 185, #pragma acc loop gang collapse(2) /* blockIdx.x */ 187, /* blockIdx.x collapsed */ 191, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ Generating reduction(max:maxValue) 193, /* threadIdx.x collapsed */ 187, Generating implicit firstprivate(initValue,outChPitch,outBatchPitch,maxValue) 191, Loop is parallelizable 193, Loop is parallelizable Generating implicit firstprivate(inChPitch,inBatchPitch,input,inPitch) void _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalMaxPooling(unsigned short*, int, int, int, int, int, int, unsigned short*, sTIDL_Layer_t const*, sTIDL_DataParams_t*, sTIDL_Network_t*, sTIDL_AlgLayer_t const*): 180, Generating copyout(outData[:((numOutChannels-1)*outChPitch)+((numBatches-1)*outBatchPitch)+1]) [if not already present] Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present] 185, Generating implicit firstprivate(width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 185, #pragma acc loop gang collapse(2) /* blockIdx.x */ 187, /* blockIdx.x collapsed */ 191, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ Generating reduction(max:maxValue) 193, /* threadIdx.x collapsed */ 187, Generating implicit firstprivate(initValue,outChPitch,outBatchPitch,maxValue) 191, Loop is parallelizable 193, Loop is parallelizable Generating implicit firstprivate(inChPitch,inBatchPitch,input,inPitch) void _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalMaxPooling(short*, int, int, int, int, int, int, short*, sTIDL_Layer_t const*, sTIDL_DataParams_t*, sTIDL_Network_t*, sTIDL_AlgLayer_t const*): 180, Generating copyout(outData[:((numOutChannels-1)*outChPitch)+((numBatches-1)*outBatchPitch)+1]) [if not already present] Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present] 185, Generating implicit firstprivate(width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 185, #pragma acc loop gang collapse(2) /* blockIdx.x */ 187, /* blockIdx.x collapsed */ 191, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ Generating reduction(max:maxValue) 193, /* threadIdx.x collapsed */ 187, Generating implicit firstprivate(initValue,outChPitch,outBatchPitch,maxValue) 191, Loop is parallelizable 193, Loop is parallelizable Generating implicit firstprivate(inChPitch,inBatchPitch,input,inPitch) void _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalMaxPooling(float*, int, int, int, int, int, int, float*, sTIDL_Layer_t const*, sTIDL_DataParams_t*, sTIDL_Network_t*, sTIDL_AlgLayer_t const*): 180, Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present] Generating copyout(outData[:((numOutChannels-1)*outChPitch)+((numBatches-1)*outBatchPitch)+1]) [if not already present] 185, Generating implicit firstprivate(width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 185, #pragma acc loop gang collapse(2) /* blockIdx.x */ 187, /* blockIdx.x collapsed */ 191, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ Generating reduction(max:maxValue) 193, /* threadIdx.x collapsed */ 187, Generating implicit firstprivate(initValue,outChPitch,outBatchPitch,maxValue) 191, Loop is parallelizable 193, Loop is parallelizable Generating implicit firstprivate(inChPitch,inBatchPitch,input,inPitch) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPoolingv2(sTIDL_Network_t*, signed char*, int, int, int, int, int, int, signed char*, int*, unsigned char, int, unsigned char, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 261, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present] Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present] 268, Generating implicit firstprivate(numOutChannels,width,numBatches,height) Generating NVIDIA GPU code 268, #pragma acc loop gang collapse(2) /* blockIdx.x */ 270, /* blockIdx.x collapsed */ 274, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ Generating reduction(+:sumBlock) 276, /* threadIdx.x collapsed */ 270, Generating implicit firstprivate(sumBlock,scaleValue,result,outChPitch,max,biasTerm,outBatchPitch,min) 274, Loop is parallelizable 276, Loop is parallelizable Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch) 326, Generating implicit firstprivate(numOutChannels,numBatches) Generating NVIDIA GPU code 326, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */ 328, /* blockIdx.x threadIdx.x collapsed */ 328, Generating implicit firstprivate(outBatchPitch,mmaShift,outChPitch,tempAcc,satLow,mixedPrecision,satHigh,result) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling(sTIDL_Network_t*, unsigned char*, int, int, int, int, int, int, unsigned char*, unsigned int*, unsigned int, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present] Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present] 442, Generating implicit firstprivate(width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 442, #pragma acc loop gang collapse(2) /* blockIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 444, /* blockIdx.x collapsed */ 448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ Generating reduction(+:sumBlock) 450, /* threadIdx.x collapsed */ 442, Generating implicit copy(max,min) [if not already present] 444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch) 448, Loop is parallelizable 450, Loop is parallelizable Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch) 513, Generating implicit firstprivate(numOutChannels,numBatches) Generating NVIDIA GPU code 513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */ 515, /* blockIdx.x threadIdx.x collapsed */ 515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling(sTIDL_Network_t*, unsigned char*, int, int, int, int, int, int, signed char*, int*, int, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present] Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present] 442, Generating implicit firstprivate(width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 442, #pragma acc loop gang collapse(2) /* blockIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 444, /* blockIdx.x collapsed */ 448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ Generating reduction(+:sumBlock) 450, /* threadIdx.x collapsed */ 442, Generating implicit copy(max,min) [if not already present] 444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch) 448, Loop is parallelizable 450, Loop is parallelizable Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch) 513, Generating implicit firstprivate(numOutChannels,numBatches) Generating NVIDIA GPU code 513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */ 515, /* blockIdx.x threadIdx.x collapsed */ 515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling(sTIDL_Network_t*, unsigned char*, int, int, int, int, int, int, unsigned short*, unsigned long*, unsigned long, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present] Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present] 442, Generating implicit firstprivate(width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 442, #pragma acc loop gang collapse(2) /* blockIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 444, /* blockIdx.x collapsed */ 448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ Generating reduction(+:sumBlock) 450, /* threadIdx.x collapsed */ 442, Generating implicit copy(max,min) [if not already present] 444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch) 448, Loop is parallelizable 450, Loop is parallelizable Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch) 513, Generating implicit firstprivate(numOutChannels,numBatches) Generating NVIDIA GPU code 513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */ 515, /* blockIdx.x threadIdx.x collapsed */ 515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling(sTIDL_Network_t*, unsigned char*, int, int, int, int, int, int, short*, long*, long, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present] Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present] 442, Generating implicit firstprivate(width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 442, #pragma acc loop gang collapse(2) /* blockIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 444, /* blockIdx.x collapsed */ 448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ Generating reduction(+:sumBlock) 450, /* threadIdx.x collapsed */ 442, Generating implicit copy(max,min) [if not already present] 444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch) 448, Loop is parallelizable 450, Loop is parallelizable Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch) 513, Generating implicit firstprivate(numOutChannels,numBatches) Generating NVIDIA GPU code 513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */ 515, /* blockIdx.x threadIdx.x collapsed */ 515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling(sTIDL_Network_t*, signed char*, int, int, int, int, int, int, unsigned char*, unsigned int*, unsigned int, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present] Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present] 442, Generating implicit firstprivate(width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 442, #pragma acc loop gang collapse(2) /* blockIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 444, /* blockIdx.x collapsed */ 448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ Generating reduction(+:sumBlock) 450, /* threadIdx.x collapsed */ 442, Generating implicit copy(max,min) [if not already present] 444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch) 448, Loop is parallelizable 450, Loop is parallelizable Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch) 513, Generating implicit firstprivate(numOutChannels,numBatches) Generating NVIDIA GPU code 513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */ 515, /* blockIdx.x threadIdx.x collapsed */ 515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling(sTIDL_Network_t*, signed char*, int, int, int, int, int, int, signed char*, int*, int, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present] Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present] 442, Generating implicit firstprivate(width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 442, #pragma acc loop gang collapse(2) /* blockIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 444, /* blockIdx.x collapsed */ 448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ Generating reduction(+:sumBlock) 450, /* threadIdx.x collapsed */ 442, Generating implicit copy(max,min) [if not already present] 444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch) 448, Loop is parallelizable 450, Loop is parallelizable Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch) 513, Generating implicit firstprivate(numOutChannels,numBatches) Generating NVIDIA GPU code 513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */ 515, /* blockIdx.x threadIdx.x collapsed */ 515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling(sTIDL_Network_t*, signed char*, int, int, int, int, int, int, unsigned short*, unsigned long*, unsigned long, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present] Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present] 442, Generating implicit firstprivate(width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 442, #pragma acc loop gang collapse(2) /* blockIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 444, /* blockIdx.x collapsed */ 448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ Generating reduction(+:sumBlock) 450, /* threadIdx.x collapsed */ 442, Generating implicit copy(max,min) [if not already present] 444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch) 448, Loop is parallelizable 450, Loop is parallelizable Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch) 513, Generating implicit firstprivate(numOutChannels,numBatches) Generating NVIDIA GPU code 513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */ 515, /* blockIdx.x threadIdx.x collapsed */ 515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling(sTIDL_Network_t*, signed char*, int, int, int, int, int, int, short*, long*, long, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present] Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present] 442, Generating implicit firstprivate(width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 442, #pragma acc loop gang collapse(2) /* blockIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 444, /* blockIdx.x collapsed */ 448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ Generating reduction(+:sumBlock) 450, /* threadIdx.x collapsed */ 442, Generating implicit copy(max,min) [if not already present] 444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch) 448, Loop is parallelizable 450, Loop is parallelizable Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch) 513, Generating implicit firstprivate(numOutChannels,numBatches) Generating NVIDIA GPU code 513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */ 515, /* blockIdx.x threadIdx.x collapsed */ 515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling(sTIDL_Network_t*, unsigned short*, int, int, int, int, int, int, unsigned char*, unsigned int*, unsigned int, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present] Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present] 442, Generating implicit firstprivate(width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 442, #pragma acc loop gang collapse(2) /* blockIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 444, /* blockIdx.x collapsed */ 448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ Generating reduction(+:sumBlock) 450, /* threadIdx.x collapsed */ 442, Generating implicit copy(max,min) [if not already present] 444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch) 448, Loop is parallelizable 450, Loop is parallelizable Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch) 513, Generating implicit firstprivate(numOutChannels,numBatches) Generating NVIDIA GPU code 513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */ 515, /* blockIdx.x threadIdx.x collapsed */ 515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling(sTIDL_Network_t*, unsigned short*, int, int, int, int, int, int, signed char*, int*, int, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present] Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present] 442, Generating implicit firstprivate(width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 442, #pragma acc loop gang collapse(2) /* blockIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 444, /* blockIdx.x collapsed */ 448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ Generating reduction(+:sumBlock) 450, /* threadIdx.x collapsed */ 442, Generating implicit copy(max,min) [if not already present] 444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch) 448, Loop is parallelizable 450, Loop is parallelizable Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch) 513, Generating implicit firstprivate(numOutChannels,numBatches) Generating NVIDIA GPU code 513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */ 515, /* blockIdx.x threadIdx.x collapsed */ 515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling(sTIDL_Network_t*, unsigned short*, int, int, int, int, int, int, unsigned short*, unsigned long*, unsigned long, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present] Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present] 442, Generating implicit firstprivate(width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 442, #pragma acc loop gang collapse(2) /* blockIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 444, /* blockIdx.x collapsed */ 448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ Generating reduction(+:sumBlock) 450, /* threadIdx.x collapsed */ 442, Generating implicit copy(max,min) [if not already present] 444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch) 448, Loop is parallelizable 450, Loop is parallelizable Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch) 513, Generating implicit firstprivate(numOutChannels,numBatches) Generating NVIDIA GPU code 513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */ 515, /* blockIdx.x threadIdx.x collapsed */ 515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling(sTIDL_Network_t*, unsigned short*, int, int, int, int, int, int, short*, long*, long, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present] Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present] 442, Generating implicit firstprivate(width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 442, #pragma acc loop gang collapse(2) /* blockIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 444, /* blockIdx.x collapsed */ 448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ Generating reduction(+:sumBlock) 450, /* threadIdx.x collapsed */ 442, Generating implicit copy(max,min) [if not already present] 444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch) 448, Loop is parallelizable 450, Loop is parallelizable Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch) 513, Generating implicit firstprivate(numOutChannels,numBatches) Generating NVIDIA GPU code 513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */ 515, /* blockIdx.x threadIdx.x collapsed */ 515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling(sTIDL_Network_t*, short*, int, int, int, int, int, int, unsigned char*, unsigned int*, unsigned int, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present] Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present] 442, Generating implicit firstprivate(width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 442, #pragma acc loop gang collapse(2) /* blockIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 444, /* blockIdx.x collapsed */ 448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ Generating reduction(+:sumBlock) 450, /* threadIdx.x collapsed */ 442, Generating implicit copy(max,min) [if not already present] 444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch) 448, Loop is parallelizable 450, Loop is parallelizable Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch) 513, Generating implicit firstprivate(numOutChannels,numBatches) Generating NVIDIA GPU code 513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */ 515, /* blockIdx.x threadIdx.x collapsed */ 515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling(sTIDL_Network_t*, short*, int, int, int, int, int, int, signed char*, int*, int, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present] Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present] 442, Generating implicit firstprivate(width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 442, #pragma acc loop gang collapse(2) /* blockIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 444, /* blockIdx.x collapsed */ 448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ Generating reduction(+:sumBlock) 450, /* threadIdx.x collapsed */ 442, Generating implicit copy(max,min) [if not already present] 444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch) 448, Loop is parallelizable 450, Loop is parallelizable Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch) 513, Generating implicit firstprivate(numOutChannels,numBatches) Generating NVIDIA GPU code 513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */ 515, /* blockIdx.x threadIdx.x collapsed */ 515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling(sTIDL_Network_t*, short*, int, int, int, int, int, int, unsigned short*, unsigned long*, unsigned long, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present] Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present] 442, Generating implicit firstprivate(width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 442, #pragma acc loop gang collapse(2) /* blockIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 444, /* blockIdx.x collapsed */ 448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ Generating reduction(+:sumBlock) 450, /* threadIdx.x collapsed */ 442, Generating implicit copy(max,min) [if not already present] 444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch) 448, Loop is parallelizable 450, Loop is parallelizable Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch) 513, Generating implicit firstprivate(numOutChannels,numBatches) Generating NVIDIA GPU code 513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */ 515, /* blockIdx.x threadIdx.x collapsed */ 515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling(sTIDL_Network_t*, short*, int, int, int, int, int, int, short*, long*, long, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present] Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present] 442, Generating implicit firstprivate(width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 442, #pragma acc loop gang collapse(2) /* blockIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 444, /* blockIdx.x collapsed */ 448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ Generating reduction(+:sumBlock) 450, /* threadIdx.x collapsed */ 442, Generating implicit copy(max,min) [if not already present] 444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch) 448, Loop is parallelizable 450, Loop is parallelizable Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch) 513, Generating implicit firstprivate(numOutChannels,numBatches) Generating NVIDIA GPU code 513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */ 515, /* blockIdx.x threadIdx.x collapsed */ 515, Generating implicit firstprivate(outBatchPitch,mixedPrecision,outChPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refGlobalAvgPooling(sTIDL_Network_t*, float*, int, int, int, int, int, int, float*, float*, float, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 435, Generating present(accPtr[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) Generating copyin(inData[:width+((inPitch*(height-1))+((inChPitch*(numOutChannels-1))+((inBatchPitch*(numBatches-1))+offsetOTF)))]) [if not already present] Generating copyout(outData[:((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))+1]) [if not already present] 442, Generating implicit firstprivate(width,numOutChannels,numBatches,height) Generating NVIDIA GPU code 442, #pragma acc loop gang collapse(2) /* blockIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 444, /* blockIdx.x collapsed */ 448, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ Generating reduction(+:sumBlock) 450, /* threadIdx.x collapsed */ 442, Generating implicit copy(max,min) [if not already present] 444, Generating implicit firstprivate(scaleValue,sumBlock,result,outChPitch,outBatchPitch) 448, Loop is parallelizable 450, Loop is parallelizable Generating implicit firstprivate(inChPitch,inBatchPitch,inRowCol,inPitch) 513, Generating implicit firstprivate(numBatches,numOutChannels) Generating NVIDIA GPU code 513, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */ 515, /* blockIdx.x threadIdx.x collapsed */ 513, Generating implicit copy(net) [if not already present] 515, Generating implicit firstprivate(outBatchPitch,layerIdx,result,outChPitch) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling(sTIDL_Network_t*, unsigned char*, int, int, int, int, int, int, int, int, int, int, int, int, unsigned char*, int*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present] Generating copyin(inData[:]) [if not already present] 1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols) Generating NVIDIA GPU code 1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1237, /* blockIdx.x threadIdx.x collapsed */ 1239, /* blockIdx.x threadIdx.x collapsed */ 1241, /* blockIdx.x threadIdx.x collapsed */ 1269, #pragma acc loop seq 1272, #pragma acc loop seq 1235, Generating implicit copy(min,max) [if not already present] 1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width) 1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch) 1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols) Generating NVIDIA GPU code 1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1359, /* blockIdx.x threadIdx.x collapsed */ 1361, /* blockIdx.x threadIdx.x collapsed */ 1363, /* blockIdx.x threadIdx.x collapsed */ 1357, Generating implicit copy(min,max) [if not already present] 1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling(sTIDL_Network_t*, unsigned char*, int, int, int, int, int, int, int, int, int, int, int, int, signed char*, int*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present] Generating copyin(inData[:]) [if not already present] 1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols) Generating NVIDIA GPU code 1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1237, /* blockIdx.x threadIdx.x collapsed */ 1239, /* blockIdx.x threadIdx.x collapsed */ 1241, /* blockIdx.x threadIdx.x collapsed */ 1269, #pragma acc loop seq 1272, #pragma acc loop seq 1235, Generating implicit copy(min,max) [if not already present] 1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width) 1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch) 1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols) Generating NVIDIA GPU code 1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1359, /* blockIdx.x threadIdx.x collapsed */ 1361, /* blockIdx.x threadIdx.x collapsed */ 1363, /* blockIdx.x threadIdx.x collapsed */ 1357, Generating implicit copy(min,max) [if not already present] 1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling(sTIDL_Network_t*, unsigned char*, int, int, int, int, int, int, int, int, int, int, int, int, unsigned short*, long*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present] Generating copyin(inData[:]) [if not already present] 1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols) Generating NVIDIA GPU code 1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1237, /* blockIdx.x threadIdx.x collapsed */ 1239, /* blockIdx.x threadIdx.x collapsed */ 1241, /* blockIdx.x threadIdx.x collapsed */ 1269, #pragma acc loop seq 1272, #pragma acc loop seq 1235, Generating implicit copy(min,max) [if not already present] 1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width) 1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch) 1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols) Generating NVIDIA GPU code 1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1359, /* blockIdx.x threadIdx.x collapsed */ 1361, /* blockIdx.x threadIdx.x collapsed */ 1363, /* blockIdx.x threadIdx.x collapsed */ 1357, Generating implicit copy(min,max) [if not already present] 1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling(sTIDL_Network_t*, unsigned char*, int, int, int, int, int, int, int, int, int, int, int, int, short*, long*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present] Generating copyin(inData[:]) [if not already present] 1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols) Generating NVIDIA GPU code 1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1237, /* blockIdx.x threadIdx.x collapsed */ 1239, /* blockIdx.x threadIdx.x collapsed */ 1241, /* blockIdx.x threadIdx.x collapsed */ 1269, #pragma acc loop seq 1272, #pragma acc loop seq 1235, Generating implicit copy(min,max) [if not already present] 1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width) 1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch) 1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols) Generating NVIDIA GPU code 1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1359, /* blockIdx.x threadIdx.x collapsed */ 1361, /* blockIdx.x threadIdx.x collapsed */ 1363, /* blockIdx.x threadIdx.x collapsed */ 1357, Generating implicit copy(min,max) [if not already present] 1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling(sTIDL_Network_t*, signed char*, int, int, int, int, int, int, int, int, int, int, int, int, unsigned char*, int*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present] Generating copyin(inData[:]) [if not already present] 1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols) Generating NVIDIA GPU code 1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1237, /* blockIdx.x threadIdx.x collapsed */ 1239, /* blockIdx.x threadIdx.x collapsed */ 1241, /* blockIdx.x threadIdx.x collapsed */ 1269, #pragma acc loop seq 1272, #pragma acc loop seq 1235, Generating implicit copy(min,max) [if not already present] 1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width) 1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch) 1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols) Generating NVIDIA GPU code 1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1359, /* blockIdx.x threadIdx.x collapsed */ 1361, /* blockIdx.x threadIdx.x collapsed */ 1363, /* blockIdx.x threadIdx.x collapsed */ 1357, Generating implicit copy(min,max) [if not already present] 1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling(sTIDL_Network_t*, signed char*, int, int, int, int, int, int, int, int, int, int, int, int, signed char*, int*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present] Generating copyin(inData[:]) [if not already present] 1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols) Generating NVIDIA GPU code 1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1237, /* blockIdx.x threadIdx.x collapsed */ 1239, /* blockIdx.x threadIdx.x collapsed */ 1241, /* blockIdx.x threadIdx.x collapsed */ 1269, #pragma acc loop seq 1272, #pragma acc loop seq 1235, Generating implicit copy(min,max) [if not already present] 1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width) 1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch) 1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols) Generating NVIDIA GPU code 1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1359, /* blockIdx.x threadIdx.x collapsed */ 1361, /* blockIdx.x threadIdx.x collapsed */ 1363, /* blockIdx.x threadIdx.x collapsed */ 1357, Generating implicit copy(min,max) [if not already present] 1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling(sTIDL_Network_t*, signed char*, int, int, int, int, int, int, int, int, int, int, int, int, unsigned short*, long*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present] Generating copyin(inData[:]) [if not already present] 1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols) Generating NVIDIA GPU code 1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1237, /* blockIdx.x threadIdx.x collapsed */ 1239, /* blockIdx.x threadIdx.x collapsed */ 1241, /* blockIdx.x threadIdx.x collapsed */ 1269, #pragma acc loop seq 1272, #pragma acc loop seq 1235, Generating implicit copy(min,max) [if not already present] 1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width) 1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch) 1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols) Generating NVIDIA GPU code 1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1359, /* blockIdx.x threadIdx.x collapsed */ 1361, /* blockIdx.x threadIdx.x collapsed */ 1363, /* blockIdx.x threadIdx.x collapsed */ 1357, Generating implicit copy(min,max) [if not already present] 1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling(sTIDL_Network_t*, signed char*, int, int, int, int, int, int, int, int, int, int, int, int, short*, long*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present] Generating copyin(inData[:]) [if not already present] 1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols) Generating NVIDIA GPU code 1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1237, /* blockIdx.x threadIdx.x collapsed */ 1239, /* blockIdx.x threadIdx.x collapsed */ 1241, /* blockIdx.x threadIdx.x collapsed */ 1269, #pragma acc loop seq 1272, #pragma acc loop seq 1235, Generating implicit copy(min,max) [if not already present] 1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width) 1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch) 1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols) Generating NVIDIA GPU code 1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1359, /* blockIdx.x threadIdx.x collapsed */ 1361, /* blockIdx.x threadIdx.x collapsed */ 1363, /* blockIdx.x threadIdx.x collapsed */ 1357, Generating implicit copy(min,max) [if not already present] 1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling(sTIDL_Network_t*, unsigned short*, int, int, int, int, int, int, int, int, int, int, int, int, unsigned char*, int*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present] Generating copyin(inData[:]) [if not already present] 1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols) Generating NVIDIA GPU code 1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1237, /* blockIdx.x threadIdx.x collapsed */ 1239, /* blockIdx.x threadIdx.x collapsed */ 1241, /* blockIdx.x threadIdx.x collapsed */ 1269, #pragma acc loop seq 1272, #pragma acc loop seq 1235, Generating implicit copy(min,max) [if not already present] 1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width) 1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch) 1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols) Generating NVIDIA GPU code 1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1359, /* blockIdx.x threadIdx.x collapsed */ 1361, /* blockIdx.x threadIdx.x collapsed */ 1363, /* blockIdx.x threadIdx.x collapsed */ 1357, Generating implicit copy(min,max) [if not already present] 1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling(sTIDL_Network_t*, unsigned short*, int, int, int, int, int, int, int, int, int, int, int, int, signed char*, int*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present] Generating copyin(inData[:]) [if not already present] 1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols) Generating NVIDIA GPU code 1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1237, /* blockIdx.x threadIdx.x collapsed */ 1239, /* blockIdx.x threadIdx.x collapsed */ 1241, /* blockIdx.x threadIdx.x collapsed */ 1269, #pragma acc loop seq 1272, #pragma acc loop seq 1235, Generating implicit copy(min,max) [if not already present] 1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width) 1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch) 1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols) Generating NVIDIA GPU code 1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1359, /* blockIdx.x threadIdx.x collapsed */ 1361, /* blockIdx.x threadIdx.x collapsed */ 1363, /* blockIdx.x threadIdx.x collapsed */ 1357, Generating implicit copy(min,max) [if not already present] 1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling(sTIDL_Network_t*, unsigned short*, int, int, int, int, int, int, int, int, int, int, int, int, unsigned short*, long*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present] Generating copyin(inData[:]) [if not already present] 1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols) Generating NVIDIA GPU code 1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1237, /* blockIdx.x threadIdx.x collapsed */ 1239, /* blockIdx.x threadIdx.x collapsed */ 1241, /* blockIdx.x threadIdx.x collapsed */ 1269, #pragma acc loop seq 1272, #pragma acc loop seq 1235, Generating implicit copy(min,max) [if not already present] 1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width) 1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch) 1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols) Generating NVIDIA GPU code 1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1359, /* blockIdx.x threadIdx.x collapsed */ 1361, /* blockIdx.x threadIdx.x collapsed */ 1363, /* blockIdx.x threadIdx.x collapsed */ 1357, Generating implicit copy(min,max) [if not already present] 1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling(sTIDL_Network_t*, unsigned short*, int, int, int, int, int, int, int, int, int, int, int, int, short*, long*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present] Generating copyin(inData[:]) [if not already present] 1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols) Generating NVIDIA GPU code 1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1237, /* blockIdx.x threadIdx.x collapsed */ 1239, /* blockIdx.x threadIdx.x collapsed */ 1241, /* blockIdx.x threadIdx.x collapsed */ 1269, #pragma acc loop seq 1272, #pragma acc loop seq 1235, Generating implicit copy(min,max) [if not already present] 1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width) 1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch) 1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols) Generating NVIDIA GPU code 1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1359, /* blockIdx.x threadIdx.x collapsed */ 1361, /* blockIdx.x threadIdx.x collapsed */ 1363, /* blockIdx.x threadIdx.x collapsed */ 1357, Generating implicit copy(min,max) [if not already present] 1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling(sTIDL_Network_t*, short*, int, int, int, int, int, int, int, int, int, int, int, int, unsigned char*, int*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present] Generating copyin(inData[:]) [if not already present] 1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols) Generating NVIDIA GPU code 1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1237, /* blockIdx.x threadIdx.x collapsed */ 1239, /* blockIdx.x threadIdx.x collapsed */ 1241, /* blockIdx.x threadIdx.x collapsed */ 1269, #pragma acc loop seq 1272, #pragma acc loop seq 1235, Generating implicit copy(min,max) [if not already present] 1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width) 1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch) 1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols) Generating NVIDIA GPU code 1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1359, /* blockIdx.x threadIdx.x collapsed */ 1361, /* blockIdx.x threadIdx.x collapsed */ 1363, /* blockIdx.x threadIdx.x collapsed */ 1357, Generating implicit copy(min,max) [if not already present] 1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling(sTIDL_Network_t*, short*, int, int, int, int, int, int, int, int, int, int, int, int, signed char*, int*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present] Generating copyin(inData[:]) [if not already present] 1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols) Generating NVIDIA GPU code 1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1237, /* blockIdx.x threadIdx.x collapsed */ 1239, /* blockIdx.x threadIdx.x collapsed */ 1241, /* blockIdx.x threadIdx.x collapsed */ 1269, #pragma acc loop seq 1272, #pragma acc loop seq 1235, Generating implicit copy(min,max) [if not already present] 1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width) 1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch) 1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols) Generating NVIDIA GPU code 1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1359, /* blockIdx.x threadIdx.x collapsed */ 1361, /* blockIdx.x threadIdx.x collapsed */ 1363, /* blockIdx.x threadIdx.x collapsed */ 1357, Generating implicit copy(min,max) [if not already present] 1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling(sTIDL_Network_t*, short*, int, int, int, int, int, int, int, int, int, int, int, int, unsigned short*, long*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present] Generating copyin(inData[:]) [if not already present] 1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols) Generating NVIDIA GPU code 1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1237, /* blockIdx.x threadIdx.x collapsed */ 1239, /* blockIdx.x threadIdx.x collapsed */ 1241, /* blockIdx.x threadIdx.x collapsed */ 1269, #pragma acc loop seq 1272, #pragma acc loop seq 1235, Generating implicit copy(min,max) [if not already present] 1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width) 1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch) 1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols) Generating NVIDIA GPU code 1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1359, /* blockIdx.x threadIdx.x collapsed */ 1361, /* blockIdx.x threadIdx.x collapsed */ 1363, /* blockIdx.x threadIdx.x collapsed */ 1357, Generating implicit copy(min,max) [if not already present] 1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling(sTIDL_Network_t*, short*, int, int, int, int, int, int, int, int, int, int, int, int, short*, long*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present] Generating copyin(inData[:]) [if not already present] 1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols) Generating NVIDIA GPU code 1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1237, /* blockIdx.x threadIdx.x collapsed */ 1239, /* blockIdx.x threadIdx.x collapsed */ 1241, /* blockIdx.x threadIdx.x collapsed */ 1269, #pragma acc loop seq 1272, #pragma acc loop seq 1235, Generating implicit copy(min,max) [if not already present] 1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,roundVal,padW,outChPitch,outBatchPitch,kernelW,width) 1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch) 1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols) Generating NVIDIA GPU code 1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1359, /* blockIdx.x threadIdx.x collapsed */ 1361, /* blockIdx.x threadIdx.x collapsed */ 1363, /* blockIdx.x threadIdx.x collapsed */ 1357, Generating implicit copy(min,max) [if not already present] 1363, Generating implicit firstprivate(outChPitch,outBatchPitch,mixedPrecision,outPitch,result,satLow,satHigh,roundBits) int _INTERNAL_18_src_tidl_pooling_c_80fd09fd::TIDL_refSpatialAvgPooling(sTIDL_Network_t*, float*, int, int, int, int, int, int, int, int, int, int, int, int, float*, float*, sTIDL_AlgLayer_t const*, sTIDL_Layer_t const*, sTIDL_DataParams_t*): 1228, Generating present(accPtr[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) Generating copyin(inData[:]) [if not already present] Generating copyout(outData[:numCols+((outPitch*(numRows-1))+(((numOutChannels-1)*outChPitch)+(outBatchPitch*(numBatches-1))))]) [if not already present] 1235, Generating implicit firstprivate(numBatches,numOutChannels,numRows,numCols) Generating NVIDIA GPU code 1235, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1237, /* blockIdx.x threadIdx.x collapsed */ 1239, /* blockIdx.x threadIdx.x collapsed */ 1241, /* blockIdx.x threadIdx.x collapsed */ 1269, #pragma acc loop seq 1272, #pragma acc loop seq 1235, Generating implicit copy(max,min) [if not already present] 1241, Generating implicit firstprivate(scaleValue,strideH,height,outPitch,kernelH,padH,sumBlock,strideW,result,padW,outChPitch,outBatchPitch,kernelW,width) 1272, Generating implicit firstprivate(spatialOffsetX,isOTFpad,spatialOffsetY,validPosXMax,startRowNumberInTensor,validPosYMin,validPosYMax,validPosXMin,inRowCol,isBorderPixel,inPitch,inChPitch,inBatchPitch) 1357, Generating implicit firstprivate(numBatches,numRows,numOutChannels,numCols) Generating NVIDIA GPU code 1357, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ Generating reduction(min:min) Generating reduction(max:max) 1359, /* blockIdx.x threadIdx.x collapsed */ 1361, /* blockIdx.x threadIdx.x collapsed */ 1363, /* blockIdx.x threadIdx.x collapsed */ 1357, Generating implicit copy(net,min,max) [if not already present] 1363, Generating implicit firstprivate(outChPitch,outBatchPitch,layerIdx,result,outPitch) /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvaccjmjeBVZUsold.gpu -computecap 86 -ptx /tmp/nvacczmjelbFOIdnm.ptx -o /tmp/nvaccHmjeJk3CPFv-.bin -ftz -cuda12020 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_pooling.c -sm 86 /tmp/nvaccHmjeJk3CPFv-.bin -compute 86 /tmp/nvacczmjelbFOIdnm.ptx -o /tmp/nvaccPmje7Eq4qDHx.fat NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++ljjeHjXEJrH3.ll -S -o /tmp/nvc++JjjePvAMPwDo.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++JjjePvAMPwDo.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_pooling.obj Unlinking /tmp/nvc++Bjjer0-tWQnd.il Unlinking /tmp/nvc++ZjjezwEnboPn.s Unlinking /tmp/nvc++ljjeHjXEJrH3.ll Unlinking /tmp/nvc++JjjePvAMPwDo.llvm compiling src/tidl_preEmption.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_preEmption.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++IwjeM0Q9XRsg.il src/tidl_preEmption.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_preEmption.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_preEmption.c -il /tmp/nvc++IwjeM0Q9XRsg.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++IwjeM0Q9XRsg.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_preEmption.obj' -asm /tmp/nvc++cwjegKmftH3v.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++cwjegKmftH3v.ll -S -o /tmp/nvc++swje2eX3WybW.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++swje2eX3WybW.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_preEmption.obj Unlinking /tmp/nvc++IwjeM0Q9XRsg.il Unlinking /tmp/nvc++YwjewzbKibwf.s Unlinking /tmp/nvc++cwjegKmftH3v.ll Unlinking /tmp/nvc++swje2eX3WybW.llvm compiling src/tidl_reduce.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_reduce.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++LEjeVCJeoSiB.il src/tidl_reduce.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_reduce.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_reduce.c -il /tmp/nvc++LEjeVCJeoSiB.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++LEjeVCJeoSiB.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_reduce.obj' -asm /tmp/nvc++1EjeFx3LD3eR.ll TIDL_reduceProcessNew(TIDL_NetworkCommonParams*, sTIDL_AlgLayer_t*, sTIDL_Layer_t*, void**, void**, int): 361, Generating copyout(outLastLinePtr[:algLayer->scratchSize-1]) [if not already present] Generating implicit copyin(algLayer) [if not already present] Generating NVIDIA GPU code 365, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 372, Generating copyout(outLastLinePtr[:algLayer->scratchSize-1]) [if not already present] Generating implicit copyin(algLayer) [if not already present] Generating NVIDIA GPU code 376, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ int _INTERNAL_17_src_tidl_reduce_c_46bfe33c::TIDL_refReduceCore(signed char*, signed char*, TIDL_Obj*, int, sTIDL_ReduceParams_t*, sTIDL_AlgLayer_t*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*): 173, Generating copyin(inData[:numCols+((inPitch*(numRows-1))+((inChPitch*(numInChannels-1))+((numTotRoi-1)*(inChPitch*numInChannels))))]) [if not already present] Generating implicit firstprivate(numInChannels,numCols,numTotRoi,numRows) Generating NVIDIA GPU code 173, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 175, /* blockIdx.x threadIdx.x collapsed */ 177, /* blockIdx.x threadIdx.x collapsed */ 183, #pragma acc loop seq 194, #pragma acc loop seq 173, Generating implicit copyin(params) [if not already present] Generating copyout(outData[:numCols+(((numInChannels-1)*outPitch)+((numTotRoi-1)*(numRows*outPitch)))]) [if not already present] 177, Generating implicit firstprivate(outPitch,targetVal,inChPitch) 183, Scalar last value needed after loop for targetVal at line 202,196 Generating implicit firstprivate(inPitch) Scalar last value needed after loop for targetVal at line 202 194, Scalar last value needed after loop for targetVal at line 202 int _INTERNAL_17_src_tidl_reduce_c_46bfe33c::TIDL_refReduceCore(unsigned char*, unsigned char*, TIDL_Obj*, int, sTIDL_ReduceParams_t*, sTIDL_AlgLayer_t*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*): 173, Generating copyin(inData[:numCols+((inPitch*(numRows-1))+((inChPitch*(numInChannels-1))+((numTotRoi-1)*(inChPitch*numInChannels))))]) [if not already present] Generating implicit firstprivate(numInChannels,numCols,numTotRoi,numRows) Generating NVIDIA GPU code 173, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 175, /* blockIdx.x threadIdx.x collapsed */ 177, /* blockIdx.x threadIdx.x collapsed */ 183, #pragma acc loop seq 194, #pragma acc loop seq 173, Generating implicit copyin(params) [if not already present] Generating copyout(outData[:numCols+(((numInChannels-1)*outPitch)+((numTotRoi-1)*(numRows*outPitch)))]) [if not already present] 177, Generating implicit firstprivate(outPitch,targetVal,inChPitch) 183, Scalar last value needed after loop for targetVal at line 202,196 Generating implicit firstprivate(inPitch) Scalar last value needed after loop for targetVal at line 202 194, Scalar last value needed after loop for targetVal at line 202 int _INTERNAL_17_src_tidl_reduce_c_46bfe33c::TIDL_refReduceCore(short*, short*, TIDL_Obj*, int, sTIDL_ReduceParams_t*, sTIDL_AlgLayer_t*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*): 173, Generating copyin(inData[:numCols+((inPitch*(numRows-1))+((inChPitch*(numInChannels-1))+((numTotRoi-1)*(inChPitch*numInChannels))))]) [if not already present] Generating implicit firstprivate(numInChannels,numCols,numTotRoi,numRows) Generating NVIDIA GPU code 173, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 175, /* blockIdx.x threadIdx.x collapsed */ 177, /* blockIdx.x threadIdx.x collapsed */ 183, #pragma acc loop seq 194, #pragma acc loop seq 173, Generating implicit copyin(params) [if not already present] Generating copyout(outData[:numCols+(((numInChannels-1)*outPitch)+((numTotRoi-1)*(numRows*outPitch)))]) [if not already present] 177, Generating implicit firstprivate(outPitch,targetVal,inChPitch) 183, Scalar last value needed after loop for targetVal at line 202,196 Generating implicit firstprivate(inPitch) Scalar last value needed after loop for targetVal at line 202 194, Scalar last value needed after loop for targetVal at line 202 int _INTERNAL_17_src_tidl_reduce_c_46bfe33c::TIDL_refReduceCore(unsigned short*, unsigned short*, TIDL_Obj*, int, sTIDL_ReduceParams_t*, sTIDL_AlgLayer_t*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*): 173, Generating copyin(inData[:numCols+((inPitch*(numRows-1))+((inChPitch*(numInChannels-1))+((numTotRoi-1)*(inChPitch*numInChannels))))]) [if not already present] Generating implicit firstprivate(numInChannels,numCols,numTotRoi,numRows) Generating NVIDIA GPU code 173, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 175, /* blockIdx.x threadIdx.x collapsed */ 177, /* blockIdx.x threadIdx.x collapsed */ 183, #pragma acc loop seq 194, #pragma acc loop seq 173, Generating implicit copyin(params) [if not already present] Generating copyout(outData[:numCols+(((numInChannels-1)*outPitch)+((numTotRoi-1)*(numRows*outPitch)))]) [if not already present] 177, Generating implicit firstprivate(outPitch,targetVal,inChPitch) 183, Scalar last value needed after loop for targetVal at line 202,196 Generating implicit firstprivate(inPitch) Scalar last value needed after loop for targetVal at line 202 194, Scalar last value needed after loop for targetVal at line 202 int _INTERNAL_17_src_tidl_reduce_c_46bfe33c::TIDL_refReduceCore(float*, float*, TIDL_Obj*, int, sTIDL_ReduceParams_t*, sTIDL_AlgLayer_t*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*): 173, Generating copyin(inData[:numCols+((inPitch*(numRows-1))+((inChPitch*(numInChannels-1))+((numTotRoi-1)*(inChPitch*numInChannels))))]) [if not already present] Generating implicit firstprivate(numInChannels,numCols,numTotRoi,numRows) Generating NVIDIA GPU code 173, #pragma acc loop gang collapse(3) /* blockIdx.x */ 175, /* blockIdx.x collapsed */ 177, /* blockIdx.x collapsed */ 183, #pragma acc loop vector(128) /* threadIdx.x */ Generating implicit reduction(min:targetVal) 194, #pragma acc loop vector(128) /* threadIdx.x */ Generating implicit reduction(max:targetVal) 173, Generating implicit copyin(params) [if not already present] Generating copyout(outData[:numCols+(((numInChannels-1)*outPitch)+((numTotRoi-1)*(numRows*outPitch)))]) [if not already present] 177, Generating implicit firstprivate(outPitch,targetVal,inChPitch) 183, Loop is parallelizable Generating implicit firstprivate(inPitch) 194, Loop is parallelizable /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvaccQHje_ctQ5izU.gpu -computecap 86 -ptx /tmp/nvacckHjeEWgSzr0l.ptx -o /tmp/nvaccAHjeo085QCwX.bin -ftz -cuda12020 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_reduce.c -sm 86 /tmp/nvaccAHjeo085QCwX.bin -compute 86 /tmp/nvacckHjeEWgSzr0l.ptx -o /tmp/nvaccQHje_vJEBak7.fat NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++1EjeFx3LD3eR.ll -S -o /tmp/nvc++DEjextsJxG0D.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++DEjextsJxG0D.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_reduce.obj Unlinking /tmp/nvc++LEjeVCJeoSiB.il Unlinking /tmp/nvc++nEjeNcFV-2F6.s Unlinking /tmp/nvc++1EjeFx3LD3eR.ll Unlinking /tmp/nvc++DEjextsJxG0D.llvm compiling src/tidl_reshape.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_reshape.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++NRje1GVACMhE.il src/tidl_reshape.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_reshape.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_reshape.c -il /tmp/nvc++NRje1GVACMhE.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++NRje1GVACMhE.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_reshape.obj' -asm /tmp/nvc++xRjefJinpYWB.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++xRjefJinpYWB.ll -S -o /tmp/nvc++pRjeT7SDvxox.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++pRjeT7SDvxox.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_reshape.obj Unlinking /tmp/nvc++NRje1GVACMhE.il Unlinking /tmp/nvc++FRjeDNlaDTR0.s Unlinking /tmp/nvc++xRjefJinpYWB.ll Unlinking /tmp/nvc++pRjeT7SDvxox.llvm compiling src/tidl_resize.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_resize.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++PZje7yYPYAFo.il src/tidl_resize.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_resize.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_resize.c -il /tmp/nvc++PZje7yYPYAFo.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++PZje7yYPYAFo.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_resize.obj' -asm /tmp/nvc++5ZjeRIXXb-Cc.ll TIDL_resizeProcessSP(sTIDL_Layer_t*, void**, void*, sTIDL_DataParams_t**, sTIDL_DataParams_t*, unsigned char*): 422, Generating copyout(out[:(((numInChannels-1)*outChPitch)+((numBatches-1)*outBatchPitch))+(outWidth+((outHeight-1)*outPitch))]) [if not already present] Generating copyin(in[:((inChPitch*(numInChannels-1))+(inBatchPitch*(numBatches-1)))+(inWidth+(inPitch*(inHeight-1)))]) [if not already present] 433, Generating implicit firstprivate(numBatches,outWidth,outHeight,numInChannels) Generating NVIDIA GPU code 433, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 435, /* blockIdx.x threadIdx.x collapsed */ 437, /* blockIdx.x threadIdx.x collapsed */ 439, /* blockIdx.x threadIdx.x collapsed */ 439, Generating implicit firstprivate(hLoc,hIdx,inChPitch,wLoc,wRatio,wIdx,outPitch,inPitch,inOffset,inHeight,outOffset,outChPitch,inBatchPitch,hRatio,outBatchPitch,inWidth) 472, Generating implicit firstprivate(numBatches,outWidth,outHeight,numInChannels) Generating NVIDIA GPU code 472, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 474, /* blockIdx.x threadIdx.x collapsed */ 476, /* blockIdx.x threadIdx.x collapsed */ 478, /* blockIdx.x threadIdx.x collapsed */ 478, Generating implicit firstprivate(hNext,hIdx,inChPitch,hLoc,w10,w01,wNext,wRatio,wIdx,w11,w00,outPitch,inHeight,inOffset,outChPitch,outOffset,inPitch,wLoc,hRatio,inBatchPitch,outBatchPitch,inWidth) void TIDL_refResize(signed char*, signed char*, sTIDL_AlgLayer_t*, sTIDL_Layer_t*, sTIDL_ResizeLayerParams_t*, sTIDL_DataParams_t*, TIDL_CreateParams const*): 205, Generating copyin(pIn[:((inChPitch*(numInChannels-1))+(inBatchPitch*(numBatches-1)))+(inWidth+((inHeight-1)*inPitch))]) [if not already present] Generating copy(pOut[:(((numInChannels-1)*outChPitch)+((numBatches-1)*outBatchPitch))+(outWidth+((outHeight-1)*outPitch))]) [if not already present] 216, Generating implicit firstprivate(outWidth,outHeight,numInChannels,numBatches) Generating NVIDIA GPU code 216, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 218, /* blockIdx.x threadIdx.x collapsed */ 220, /* blockIdx.x threadIdx.x collapsed */ 222, /* blockIdx.x threadIdx.x collapsed */ 222, Generating implicit firstprivate(hLoc,hIdx,inChPitch,wLoc,wRatio,wIdx,outPitch,inPitch,inHeight,outChPitch,inBatchPitch,hRatio,outBatchPitch,inWidth) 265, Generating implicit firstprivate(outWidth,outHeight,numInChannels,numBatches) Generating NVIDIA GPU code 265, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 267, /* blockIdx.x threadIdx.x collapsed */ 269, /* blockIdx.x threadIdx.x collapsed */ 271, /* blockIdx.x threadIdx.x collapsed */ 265, Generating implicit copyin(params) [if not already present] 271, Generating implicit firstprivate(hNext,hIdx,enableHClip,hLoc,w10,w01,wNext,wIdx,w11,w00,outPitch,inChPitch,outChPitch,inPitch,wLoc,inBatchPitch,heightResizeRatio,hRatio,widthResizeRatio,wRatio,outBatchPitch,inWidth) void TIDL_refResize(unsigned char*, unsigned char*, sTIDL_AlgLayer_t*, sTIDL_Layer_t*, sTIDL_ResizeLayerParams_t*, sTIDL_DataParams_t*, TIDL_CreateParams const*): 205, Generating copyin(pIn[:((inChPitch*(numInChannels-1))+(inBatchPitch*(numBatches-1)))+(inWidth+((inHeight-1)*inPitch))]) [if not already present] Generating copy(pOut[:(((numInChannels-1)*outChPitch)+((numBatches-1)*outBatchPitch))+(outWidth+((outHeight-1)*outPitch))]) [if not already present] 216, Generating implicit firstprivate(outWidth,outHeight,numInChannels,numBatches) Generating NVIDIA GPU code 216, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 218, /* blockIdx.x threadIdx.x collapsed */ 220, /* blockIdx.x threadIdx.x collapsed */ 222, /* blockIdx.x threadIdx.x collapsed */ 222, Generating implicit firstprivate(hLoc,hIdx,inChPitch,wLoc,wRatio,wIdx,outPitch,inPitch,inHeight,outChPitch,inBatchPitch,hRatio,outBatchPitch,inWidth) 265, Generating implicit firstprivate(outWidth,outHeight,numInChannels,numBatches) Generating NVIDIA GPU code 265, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 267, /* blockIdx.x threadIdx.x collapsed */ 269, /* blockIdx.x threadIdx.x collapsed */ 271, /* blockIdx.x threadIdx.x collapsed */ 265, Generating implicit copyin(params) [if not already present] 271, Generating implicit firstprivate(hNext,hIdx,enableHClip,hLoc,w10,w01,wNext,wIdx,w11,w00,outPitch,inChPitch,outChPitch,inPitch,wLoc,inBatchPitch,heightResizeRatio,hRatio,widthResizeRatio,wRatio,outBatchPitch,inWidth) void TIDL_refResize(short*, short*, sTIDL_AlgLayer_t*, sTIDL_Layer_t*, sTIDL_ResizeLayerParams_t*, sTIDL_DataParams_t*, TIDL_CreateParams const*): 205, Generating copyin(pIn[:((inChPitch*(numInChannels-1))+(inBatchPitch*(numBatches-1)))+(inWidth+((inHeight-1)*inPitch))]) [if not already present] Generating copy(pOut[:(((numInChannels-1)*outChPitch)+((numBatches-1)*outBatchPitch))+(outWidth+((outHeight-1)*outPitch))]) [if not already present] 216, Generating implicit firstprivate(outWidth,outHeight,numInChannels,numBatches) Generating NVIDIA GPU code 216, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 218, /* blockIdx.x threadIdx.x collapsed */ 220, /* blockIdx.x threadIdx.x collapsed */ 222, /* blockIdx.x threadIdx.x collapsed */ 222, Generating implicit firstprivate(hLoc,hIdx,inChPitch,wLoc,wRatio,wIdx,outPitch,inPitch,inHeight,outChPitch,inBatchPitch,hRatio,outBatchPitch,inWidth) 265, Generating implicit firstprivate(outWidth,outHeight,numInChannels,numBatches) Generating NVIDIA GPU code 265, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 267, /* blockIdx.x threadIdx.x collapsed */ 269, /* blockIdx.x threadIdx.x collapsed */ 271, /* blockIdx.x threadIdx.x collapsed */ 265, Generating implicit copyin(params) [if not already present] 271, Generating implicit firstprivate(hNext,hIdx,enableHClip,hLoc,w10,w01,wNext,wIdx,w11,w00,outPitch,inChPitch,outChPitch,inPitch,wLoc,heightResizeRatio,inBatchPitch,hRatio,widthResizeRatio,wRatio,outBatchPitch,inWidth) void TIDL_refResize(unsigned short*, unsigned short*, sTIDL_AlgLayer_t*, sTIDL_Layer_t*, sTIDL_ResizeLayerParams_t*, sTIDL_DataParams_t*, TIDL_CreateParams const*): 205, Generating copyin(pIn[:((inChPitch*(numInChannels-1))+(inBatchPitch*(numBatches-1)))+(inWidth+((inHeight-1)*inPitch))]) [if not already present] Generating copy(pOut[:(((numInChannels-1)*outChPitch)+((numBatches-1)*outBatchPitch))+(outWidth+((outHeight-1)*outPitch))]) [if not already present] 216, Generating implicit firstprivate(outWidth,outHeight,numInChannels,numBatches) Generating NVIDIA GPU code 216, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 218, /* blockIdx.x threadIdx.x collapsed */ 220, /* blockIdx.x threadIdx.x collapsed */ 222, /* blockIdx.x threadIdx.x collapsed */ 222, Generating implicit firstprivate(hLoc,hIdx,inChPitch,wLoc,wRatio,wIdx,outPitch,inPitch,inHeight,outChPitch,inBatchPitch,hRatio,outBatchPitch,inWidth) 265, Generating implicit firstprivate(outWidth,outHeight,numInChannels,numBatches) Generating NVIDIA GPU code 265, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 267, /* blockIdx.x threadIdx.x collapsed */ 269, /* blockIdx.x threadIdx.x collapsed */ 271, /* blockIdx.x threadIdx.x collapsed */ 265, Generating implicit copyin(params) [if not already present] 271, Generating implicit firstprivate(hNext,hIdx,enableHClip,hLoc,w10,w01,wNext,wIdx,w11,w00,outPitch,inChPitch,outChPitch,inPitch,wLoc,heightResizeRatio,inBatchPitch,hRatio,widthResizeRatio,wRatio,outBatchPitch,inWidth) std::floor(float): 71, include "math.h" 15, include "math.h" 36, include "cmath" 15, include "cmath" 261, Generating implicit acc routine seq Generating acc routine seq Generating NVIDIA GPU code _INTERNAL_17_src_tidl_resize_c_33050338::TIDL_refResizeProcess(TIDL_CreateParams const*, sTIDL_AlgLayer_t*, sTIDL_Layer_t*, void**, void**, int): 582, Generating copy(inPtrOrig[:resizeInWidthBytes+((inPitchBytes*(inputHeight+1))+((inElmtSize*leftPadResize)+((resizeInChPitchBytes*(resizeNumChannels-1))+(inBatchPitch*(numBatches-1)))))]) [if not already present] Generating implicit firstprivate(numBatches,resizeNumChannels,resizeInWidthBytes) Generating NVIDIA GPU code 587, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 589, /* blockIdx.x threadIdx.x collapsed */ 591, /* blockIdx.x threadIdx.x collapsed */ 591, Generating implicit firstprivate(copyBottomLine,inBatchPitch,inElmtSize,inputHeight,resizeInChPitchBytes,leftPadResize,copyTopLine,inPitch,inPitchBytes) /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvaccT2jehZBMwzdL.gpu -computecap 86 -ptx /tmp/nvacc92je3RlMMdtW.ptx -o /tmp/nvaccL2jeVkjknGlk.bin -ftz -cuda12020 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_resize.c -sm 86 /tmp/nvaccL2jeVkjknGlk.bin -compute 86 /tmp/nvacc92je3RlMMdtW.ptx -o /tmp/nvaccn2jeNGYHuGhV.fat NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++5ZjeRIXXb-Cc.ll -S -o /tmp/nvc++bZjed4DaBysa.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++bZjed4DaBysa.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_resize.obj Unlinking /tmp/nvc++PZje7yYPYAFo.il Unlinking /tmp/nvc++XZjetGLqZ340.s Unlinking /tmp/nvc++5ZjeRIXXb-Cc.ll Unlinking /tmp/nvc++bZjed4DaBysa.llvm compiling src/tidl_roiPooling.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_roiPooling.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++RakebfOcarEz.il src/tidl_roiPooling.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_roiPooling.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_roiPooling.c -il /tmp/nvc++RakebfOcarEz.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++RakebfOcarEz.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_roiPooling.obj' -asm /tmp/nvc++BakerwUVYxnY.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++BakerwUVYxnY.ll -S -o /tmp/nvc++ZakezibhzNkA.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++ZakezibhzNkA.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_roiPooling.obj Unlinking /tmp/nvc++RakebfOcarEz.il Unlinking /tmp/nvc++dakejTi5rs88.s Unlinking /tmp/nvc++BakerwUVYxnY.ll Unlinking /tmp/nvc++ZakezibhzNkA.llvm compiling src/tidl_scatterElements.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_scatterElements.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++ZikezUpqLNVx.il src/tidl_scatterElements.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_scatterElements.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_scatterElements.c -il /tmp/nvc++ZikezUpqLNVx.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++ZikezUpqLNVx.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_scatterElements.obj' -asm /tmp/nvc++JikePT5gjRTV.ll void TIDL_refScatterElementsOutputUpdation(short*, signed char*, int, int, int, short, short): 119, Generating implicit acc routine seq Generating acc routine seq Generating NVIDIA GPU code void TIDL_refScatterElementsOutputUpdation(signed char*, signed char*, int, int, int, signed char, signed char): 119, Generating implicit acc routine seq Generating acc routine seq Generating NVIDIA GPU code void TIDL_refScatterElementsOutputUpdation(unsigned short*, unsigned char*, int, int, int, unsigned short, unsigned short): 119, Generating implicit acc routine seq Generating acc routine seq Generating NVIDIA GPU code void TIDL_refScatterElementsOutputUpdation(unsigned char*, unsigned char*, int, int, int, unsigned char, unsigned char): 119, Generating implicit acc routine seq Generating acc routine seq Generating NVIDIA GPU code void TIDL_refScatterElementsOutputUpdation(int*, short*, int, int, int, int, int): 119, Generating implicit acc routine seq Generating acc routine seq Generating NVIDIA GPU code void TIDL_refScatterElementsOutputUpdation(short*, short*, int, int, int, short, short): 119, Generating implicit acc routine seq Generating acc routine seq Generating NVIDIA GPU code void TIDL_refScatterElementsOutputUpdation(unsigned int*, unsigned short*, int, int, int, unsigned int, unsigned int): 119, Generating implicit acc routine seq Generating acc routine seq Generating NVIDIA GPU code void TIDL_refScatterElementsOutputUpdation(unsigned short*, unsigned short*, int, int, int, unsigned short, unsigned short): 119, Generating implicit acc routine seq Generating acc routine seq Generating NVIDIA GPU code void TIDL_refScatterElementsOutputUpdation(float*, float*, int, int, int, float, float): 119, Generating implicit acc routine seq Generating acc routine seq Generating NVIDIA GPU code int _INTERNAL_26_src_tidl_scatterElements_c_9c1616d6::TIDL_refScatterElements(signed char*, int*, signed char*, signed char*, short*, TIDL_Obj*, int, sTIDL_ScatterElementsParams_t*, sTIDL_AlgLayer_t*, sTIDL_Layer_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*): 250, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,h,minValueAcc,c,w,targetIndex,n,outChPitch,outNumCols,outChs,outNumRows,outBatches,outPitch,outRoiPitch,maxValueAcc,inIndicesNumRows) Generating NVIDIA GPU code 250, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 250, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+4]) [if not already present] Generating implicit copyin(params,update) [if not already present] 275, Generating copyout(accPtr[:cpSize]) [if not already present] Generating copyin(data[:cpSize]) [if not already present] Generating implicit firstprivate(cpSize) Generating NVIDIA GPU code 279, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 285, Generating copyin(data[:cpSize]) [if not already present] Generating copyout(accPtr[:cpSize]) [if not already present] Generating implicit firstprivate(cpSize) Generating NVIDIA GPU code 288, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 297, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,h,indDim,dataDim,n,outChPitch,outChs,outNumRows,outBatches,outPitch,outRoiPitch,updateIndex,targetIndex) Generating NVIDIA GPU code 297, #pragma acc loop gang /* blockIdx.x */ 328, #pragma acc loop vector(128) /* threadIdx.x */ 297, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+3]) [if not already present] Generating implicit copyin(update,params,outDataParams) [if not already present] 328, Loop is parallelizable Generating implicit firstprivate(minValueAcc,maxValueAcc) 341, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,n,outChPitch,outBatches,outChs,outRoiPitch,updateIndex,targetIndex) Generating NVIDIA GPU code 341, #pragma acc loop gang /* blockIdx.x */ 356, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ 358, /* threadIdx.x collapsed */ 341, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+2]) [if not already present] Generating implicit copyin(update,params,outDataParams) [if not already present] 356, Loop is parallelizable 358, Loop is parallelizable Generating implicit firstprivate(minValueAcc,maxValueAcc) 369, Generating copyin(accPtr[:outSize]) [if not already present] Generating copyout(output[:outSize]) [if not already present] 381, Generating implicit firstprivate(outSize) Generating NVIDIA GPU code 385, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 381, Generating implicit copyin(tidlLayer,outDataParams) [if not already present] 385, Generating implicit firstprivate(minValueOutput,shift,outAcc,maxValueOutput,cScale) 403, Generating implicit firstprivate(outSize) Generating NVIDIA GPU code 405, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 429, Generating copyout(data[:inIndicesNumCols+((inDataPitch*(inIndicesNumRows-1))+(((numInChannels-1)*inDataChPitch)+(numOutChannels*((numTotRoi-1)*inDataChPitch))))]) [if not already present] Generating copyin(update[:inIndicesNumCols+((inUpdatePitch*(inIndicesNumRows-1))+((inUpdateChPitch*(numInChannels-1))+(numInChannels*((numTotRoi-1)*inUpdateChPitch))))],indices[:inIndicesNumCols+((inIndicesPitch*(inIndicesNumRows-1))+(((inIndicesNumRows-1)*inIndicesChPitch)+(numInChannels*(inIndicesChPitch*(numTotRoi-1)))))]) [if not already present] Generating implicit firstprivate(inIndicesNumCols,inIndicesNumRows,numTotRoi,numInChannels) Generating NVIDIA GPU code 429, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 431, /* blockIdx.x threadIdx.x collapsed */ 433, /* blockIdx.x threadIdx.x collapsed */ 435, /* blockIdx.x threadIdx.x collapsed */ 435, Generating implicit firstprivate(inIndicesChPitch,inDataPitch,inUpdatePitch,inUpdateChPitch,inIndicesPitch,updateVal,status,inDataChPitch,axis,index,numOutChannels) 483, Generating copyout(output[:(outPitch*(outNumRows-1))+((outChPitch*(numOutChannels-1))+(numOutChannels*(outChPitch*(numTotRoi-1))))+1]) [if not already present] Generating copyin(data[:(outNumCols*(outNumRows-1))+((outNumRows*(outNumCols*(numOutChannels-1)))+(numOutChannels*(outNumRows*(outNumCols*(numTotRoi-1)))))+1]) [if not already present] Generating implicit firstprivate(numTotRoi,numOutChannels,outNumRows) Generating NVIDIA GPU code 483, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 485, /* blockIdx.x threadIdx.x collapsed */ 487, /* blockIdx.x threadIdx.x collapsed */ 487, Generating implicit firstprivate(outPitch,outNumCols,outChPitch) int _INTERNAL_26_src_tidl_scatterElements_c_9c1616d6::TIDL_refScatterElements(signed char*, int*, signed char*, signed char*, signed char*, TIDL_Obj*, int, sTIDL_ScatterElementsParams_t*, sTIDL_AlgLayer_t*, sTIDL_Layer_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*): 250, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,h,minValueAcc,c,targetIndex,w,n,outChPitch,outNumCols,outChs,outNumRows,outBatches,outPitch,outRoiPitch,maxValueAcc,inIndicesNumRows) Generating NVIDIA GPU code 250, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 250, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+4]) [if not already present] Generating implicit copyin(params,update) [if not already present] 275, Generating copyin(data[:cpSize]) [if not already present] Generating copyout(accPtr[:cpSize]) [if not already present] Generating implicit firstprivate(cpSize) Generating NVIDIA GPU code 279, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 285, Generating copyin(data[:cpSize]) [if not already present] Generating copyout(accPtr[:cpSize]) [if not already present] Generating implicit firstprivate(cpSize) Generating NVIDIA GPU code 288, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 297, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,h,indDim,dataDim,n,outChPitch,outChs,outNumRows,outBatches,outPitch,outRoiPitch,updateIndex,targetIndex) Generating NVIDIA GPU code 297, #pragma acc loop gang /* blockIdx.x */ 328, #pragma acc loop vector(128) /* threadIdx.x */ 297, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+3]) [if not already present] Generating implicit copyin(update,params,outDataParams) [if not already present] 328, Loop is parallelizable Generating implicit firstprivate(maxValueAcc,minValueAcc) 341, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,n,outChPitch,outBatches,outChs,outRoiPitch,updateIndex,targetIndex) Generating NVIDIA GPU code 341, #pragma acc loop gang /* blockIdx.x */ 356, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ 358, /* threadIdx.x collapsed */ 341, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+2]) [if not already present] Generating implicit copyin(update,params,outDataParams) [if not already present] 356, Loop is parallelizable 358, Loop is parallelizable Generating implicit firstprivate(minValueAcc,maxValueAcc) 369, Generating copyin(accPtr[:outSize]) [if not already present] Generating copyout(output[:outSize]) [if not already present] 381, Generating implicit firstprivate(outSize) Generating NVIDIA GPU code 385, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 381, Generating implicit copyin(tidlLayer,outDataParams) [if not already present] 385, Generating implicit firstprivate(minValueOutput,shift,outAcc,maxValueOutput,cScale) 403, Generating implicit firstprivate(outSize) Generating NVIDIA GPU code 405, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 429, Generating copyout(data[:inIndicesNumCols+((inDataPitch*(inIndicesNumRows-1))+(((numInChannels-1)*inDataChPitch)+(numOutChannels*((numTotRoi-1)*inDataChPitch))))]) [if not already present] Generating copyin(update[:inIndicesNumCols+((inUpdatePitch*(inIndicesNumRows-1))+((inUpdateChPitch*(numInChannels-1))+(numInChannels*((numTotRoi-1)*inUpdateChPitch))))],indices[:inIndicesNumCols+((inIndicesPitch*(inIndicesNumRows-1))+(((inIndicesNumRows-1)*inIndicesChPitch)+(numInChannels*(inIndicesChPitch*(numTotRoi-1)))))]) [if not already present] Generating implicit firstprivate(inIndicesNumCols,inIndicesNumRows,numTotRoi,numInChannels) Generating NVIDIA GPU code 429, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 431, /* blockIdx.x threadIdx.x collapsed */ 433, /* blockIdx.x threadIdx.x collapsed */ 435, /* blockIdx.x threadIdx.x collapsed */ 435, Generating implicit firstprivate(inIndicesChPitch,inDataPitch,inUpdatePitch,inUpdateChPitch,inIndicesPitch,updateVal,status,inDataChPitch,axis,index,numOutChannels) 483, Generating copyout(output[:(outPitch*(outNumRows-1))+((outChPitch*(numOutChannels-1))+(numOutChannels*(outChPitch*(numTotRoi-1))))+1]) [if not already present] Generating copyin(data[:(outNumCols*(outNumRows-1))+((outNumRows*(outNumCols*(numOutChannels-1)))+(numOutChannels*(outNumRows*(outNumCols*(numTotRoi-1)))))+1]) [if not already present] Generating implicit firstprivate(numTotRoi,numOutChannels,outNumRows) Generating NVIDIA GPU code 483, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 485, /* blockIdx.x threadIdx.x collapsed */ 487, /* blockIdx.x threadIdx.x collapsed */ 487, Generating implicit firstprivate(outPitch,outNumCols,outChPitch) int _INTERNAL_26_src_tidl_scatterElements_c_9c1616d6::TIDL_refScatterElements(unsigned char*, int*, unsigned char*, unsigned char*, unsigned short*, TIDL_Obj*, int, sTIDL_ScatterElementsParams_t*, sTIDL_AlgLayer_t*, sTIDL_Layer_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*): 250, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,h,minValueAcc,c,targetIndex,w,n,outChPitch,outNumCols,outChs,outNumRows,outBatches,outPitch,outRoiPitch,maxValueAcc,inIndicesNumRows) Generating NVIDIA GPU code 250, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 250, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+4]) [if not already present] Generating implicit copyin(params,update) [if not already present] 275, Generating copyout(accPtr[:cpSize]) [if not already present] Generating copyin(data[:cpSize]) [if not already present] Generating implicit firstprivate(cpSize) Generating NVIDIA GPU code 279, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 285, Generating copyin(data[:cpSize]) [if not already present] Generating copyout(accPtr[:cpSize]) [if not already present] Generating implicit firstprivate(cpSize) Generating NVIDIA GPU code 288, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 297, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,h,indDim,dataDim,n,outChPitch,outChs,outNumRows,outBatches,outPitch,outRoiPitch,updateIndex,targetIndex) Generating NVIDIA GPU code 297, #pragma acc loop gang /* blockIdx.x */ 328, #pragma acc loop vector(128) /* threadIdx.x */ 297, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+3]) [if not already present] Generating implicit copyin(update,params,outDataParams) [if not already present] 328, Loop is parallelizable Generating implicit firstprivate(minValueAcc,maxValueAcc) 341, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,n,outChPitch,outBatches,outChs,outRoiPitch,updateIndex,targetIndex) Generating NVIDIA GPU code 341, #pragma acc loop gang /* blockIdx.x */ 356, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ 358, /* threadIdx.x collapsed */ 341, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+2]) [if not already present] Generating implicit copyin(update,params,outDataParams) [if not already present] 356, Loop is parallelizable 358, Loop is parallelizable Generating implicit firstprivate(minValueAcc,maxValueAcc) 369, Generating copyin(accPtr[:outSize]) [if not already present] Generating copyout(output[:outSize]) [if not already present] 381, Generating implicit firstprivate(outSize) Generating NVIDIA GPU code 385, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 381, Generating implicit copyin(tidlLayer,outDataParams) [if not already present] 385, Generating implicit firstprivate(minValueOutput,shift,outAcc,maxValueOutput,cScale) 403, Generating implicit firstprivate(outSize) Generating NVIDIA GPU code 405, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 429, Generating copyout(data[:inIndicesNumCols+((inDataPitch*(inIndicesNumRows-1))+(((numInChannels-1)*inDataChPitch)+(numOutChannels*((numTotRoi-1)*inDataChPitch))))]) [if not already present] Generating copyin(update[:inIndicesNumCols+((inUpdatePitch*(inIndicesNumRows-1))+((inUpdateChPitch*(numInChannels-1))+(numInChannels*((numTotRoi-1)*inUpdateChPitch))))],indices[:inIndicesNumCols+((inIndicesPitch*(inIndicesNumRows-1))+(((inIndicesNumRows-1)*inIndicesChPitch)+(numInChannels*(inIndicesChPitch*(numTotRoi-1)))))]) [if not already present] Generating implicit firstprivate(inIndicesNumCols,inIndicesNumRows,numTotRoi,numInChannels) Generating NVIDIA GPU code 429, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 431, /* blockIdx.x threadIdx.x collapsed */ 433, /* blockIdx.x threadIdx.x collapsed */ 435, /* blockIdx.x threadIdx.x collapsed */ 435, Generating implicit firstprivate(inIndicesChPitch,inDataPitch,inUpdatePitch,inUpdateChPitch,inIndicesPitch,status,updateVal,inDataChPitch,axis,index,numOutChannels) 483, Generating copyout(output[:(outPitch*(outNumRows-1))+((outChPitch*(numOutChannels-1))+(numOutChannels*(outChPitch*(numTotRoi-1))))+1]) [if not already present] Generating copyin(data[:(outNumCols*(outNumRows-1))+((outNumRows*(outNumCols*(numOutChannels-1)))+(numOutChannels*(outNumRows*(outNumCols*(numTotRoi-1)))))+1]) [if not already present] Generating implicit firstprivate(numTotRoi,numOutChannels,outNumRows) Generating NVIDIA GPU code 483, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 485, /* blockIdx.x threadIdx.x collapsed */ 487, /* blockIdx.x threadIdx.x collapsed */ 487, Generating implicit firstprivate(outPitch,outNumCols,outChPitch) int _INTERNAL_26_src_tidl_scatterElements_c_9c1616d6::TIDL_refScatterElements(unsigned char*, int*, unsigned char*, unsigned char*, unsigned char*, TIDL_Obj*, int, sTIDL_ScatterElementsParams_t*, sTIDL_AlgLayer_t*, sTIDL_Layer_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*): 250, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,h,minValueAcc,c,targetIndex,w,n,outChPitch,outNumCols,outChs,outNumRows,outBatches,outPitch,outRoiPitch,maxValueAcc,inIndicesNumRows) Generating NVIDIA GPU code 250, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 250, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+4]) [if not already present] Generating implicit copyin(params,update) [if not already present] 275, Generating copyin(data[:cpSize]) [if not already present] Generating copyout(accPtr[:cpSize]) [if not already present] Generating implicit firstprivate(cpSize) Generating NVIDIA GPU code 279, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 285, Generating copyout(accPtr[:cpSize]) [if not already present] Generating copyin(data[:cpSize]) [if not already present] Generating implicit firstprivate(cpSize) Generating NVIDIA GPU code 288, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 297, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,h,indDim,dataDim,n,outChPitch,outChs,outNumRows,outBatches,outPitch,outRoiPitch,updateIndex,targetIndex) Generating NVIDIA GPU code 297, #pragma acc loop gang /* blockIdx.x */ 328, #pragma acc loop vector(128) /* threadIdx.x */ 297, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+3]) [if not already present] Generating implicit copyin(update,params,outDataParams) [if not already present] 328, Loop is parallelizable Generating implicit firstprivate(maxValueAcc,minValueAcc) 341, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,n,outChPitch,outBatches,outChs,outRoiPitch,updateIndex,targetIndex) Generating NVIDIA GPU code 341, #pragma acc loop gang /* blockIdx.x */ 356, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ 358, /* threadIdx.x collapsed */ 341, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+2]) [if not already present] Generating implicit copyin(update,params,outDataParams) [if not already present] 356, Loop is parallelizable 358, Loop is parallelizable Generating implicit firstprivate(minValueAcc,maxValueAcc) 369, Generating copyin(accPtr[:outSize]) [if not already present] Generating copyout(output[:outSize]) [if not already present] 381, Generating implicit firstprivate(outSize) Generating NVIDIA GPU code 385, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 381, Generating implicit copyin(tidlLayer,outDataParams) [if not already present] 385, Generating implicit firstprivate(minValueOutput,shift,outAcc,maxValueOutput,cScale) 403, Generating implicit firstprivate(outSize) Generating NVIDIA GPU code 405, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 429, Generating copyout(data[:inIndicesNumCols+((inDataPitch*(inIndicesNumRows-1))+(((numInChannels-1)*inDataChPitch)+(numOutChannels*((numTotRoi-1)*inDataChPitch))))]) [if not already present] Generating copyin(update[:inIndicesNumCols+((inUpdatePitch*(inIndicesNumRows-1))+((inUpdateChPitch*(numInChannels-1))+(numInChannels*((numTotRoi-1)*inUpdateChPitch))))],indices[:inIndicesNumCols+((inIndicesPitch*(inIndicesNumRows-1))+(((inIndicesNumRows-1)*inIndicesChPitch)+(numInChannels*(inIndicesChPitch*(numTotRoi-1)))))]) [if not already present] Generating implicit firstprivate(inIndicesNumCols,inIndicesNumRows,numTotRoi,numInChannels) Generating NVIDIA GPU code 429, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 431, /* blockIdx.x threadIdx.x collapsed */ 433, /* blockIdx.x threadIdx.x collapsed */ 435, /* blockIdx.x threadIdx.x collapsed */ 435, Generating implicit firstprivate(inIndicesChPitch,inDataPitch,inUpdatePitch,inUpdateChPitch,inIndicesPitch,updateVal,status,inDataChPitch,axis,index,numOutChannels) 483, Generating copyout(output[:(outPitch*(outNumRows-1))+((outChPitch*(numOutChannels-1))+(numOutChannels*(outChPitch*(numTotRoi-1))))+1]) [if not already present] Generating copyin(data[:(outNumCols*(outNumRows-1))+((outNumRows*(outNumCols*(numOutChannels-1)))+(numOutChannels*(outNumRows*(outNumCols*(numTotRoi-1)))))+1]) [if not already present] Generating implicit firstprivate(numTotRoi,numOutChannels,outNumRows) Generating NVIDIA GPU code 483, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 485, /* blockIdx.x threadIdx.x collapsed */ 487, /* blockIdx.x threadIdx.x collapsed */ 487, Generating implicit firstprivate(outPitch,outNumCols,outChPitch) int _INTERNAL_26_src_tidl_scatterElements_c_9c1616d6::TIDL_refScatterElements(short*, int*, short*, short*, int*, TIDL_Obj*, int, sTIDL_ScatterElementsParams_t*, sTIDL_AlgLayer_t*, sTIDL_Layer_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*): 250, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,h,minValueAcc,c,w,targetIndex,n,outChPitch,outNumCols,outChs,outNumRows,outBatches,outPitch,outRoiPitch,maxValueAcc,inIndicesNumRows) Generating NVIDIA GPU code 250, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 250, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+4]) [if not already present] Generating implicit copyin(params,update) [if not already present] 275, Generating copyout(accPtr[:cpSize]) [if not already present] Generating copyin(data[:cpSize]) [if not already present] Generating implicit firstprivate(cpSize) Generating NVIDIA GPU code 279, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 285, Generating copyin(data[:cpSize]) [if not already present] Generating copyout(accPtr[:cpSize]) [if not already present] Generating implicit firstprivate(cpSize) Generating NVIDIA GPU code 288, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 297, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,h,indDim,dataDim,n,outChPitch,outChs,outNumRows,outBatches,outPitch,outRoiPitch,updateIndex,targetIndex) Generating NVIDIA GPU code 297, #pragma acc loop gang /* blockIdx.x */ 328, #pragma acc loop vector(128) /* threadIdx.x */ 297, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+3]) [if not already present] Generating implicit copyin(update,params,outDataParams) [if not already present] 328, Loop is parallelizable Generating implicit firstprivate(minValueAcc,maxValueAcc) 341, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,n,outChPitch,outBatches,outChs,outRoiPitch,updateIndex,targetIndex) Generating NVIDIA GPU code 341, #pragma acc loop gang /* blockIdx.x */ 356, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ 358, /* threadIdx.x collapsed */ 341, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+2]) [if not already present] Generating implicit copyin(update,params,outDataParams) [if not already present] 356, Loop is parallelizable 358, Loop is parallelizable Generating implicit firstprivate(minValueAcc,maxValueAcc) 369, Generating copyin(accPtr[:outSize]) [if not already present] Generating copyout(output[:outSize]) [if not already present] 381, Generating implicit firstprivate(outSize) Generating NVIDIA GPU code 385, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 381, Generating implicit copyin(tidlLayer,outDataParams) [if not already present] 385, Generating implicit firstprivate(minValueOutput,shift,outAcc,maxValueOutput,cScale) 403, Generating implicit firstprivate(outSize) Generating NVIDIA GPU code 405, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 429, Generating copyout(data[:inIndicesNumCols+((inDataPitch*(inIndicesNumRows-1))+(((numInChannels-1)*inDataChPitch)+(numOutChannels*((numTotRoi-1)*inDataChPitch))))]) [if not already present] Generating copyin(update[:inIndicesNumCols+((inUpdatePitch*(inIndicesNumRows-1))+((inUpdateChPitch*(numInChannels-1))+(numInChannels*((numTotRoi-1)*inUpdateChPitch))))],indices[:inIndicesNumCols+((inIndicesPitch*(inIndicesNumRows-1))+(((inIndicesNumRows-1)*inIndicesChPitch)+(numInChannels*(inIndicesChPitch*(numTotRoi-1)))))]) [if not already present] Generating implicit firstprivate(inIndicesNumCols,inIndicesNumRows,numTotRoi,numInChannels) Generating NVIDIA GPU code 429, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 431, /* blockIdx.x threadIdx.x collapsed */ 433, /* blockIdx.x threadIdx.x collapsed */ 435, /* blockIdx.x threadIdx.x collapsed */ 435, Generating implicit firstprivate(inIndicesChPitch,inDataPitch,inUpdatePitch,inUpdateChPitch,inIndicesPitch,updateVal,status,inDataChPitch,axis,index,numOutChannels) 483, Generating copyout(output[:(outPitch*(outNumRows-1))+((outChPitch*(numOutChannels-1))+(numOutChannels*(outChPitch*(numTotRoi-1))))+1]) [if not already present] Generating copyin(data[:(outNumCols*(outNumRows-1))+((outNumRows*(outNumCols*(numOutChannels-1)))+(numOutChannels*(outNumRows*(outNumCols*(numTotRoi-1)))))+1]) [if not already present] Generating implicit firstprivate(numTotRoi,numOutChannels,outNumRows) Generating NVIDIA GPU code 483, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 485, /* blockIdx.x threadIdx.x collapsed */ 487, /* blockIdx.x threadIdx.x collapsed */ 487, Generating implicit firstprivate(outPitch,outNumCols,outChPitch) int _INTERNAL_26_src_tidl_scatterElements_c_9c1616d6::TIDL_refScatterElements(short*, int*, short*, short*, short*, TIDL_Obj*, int, sTIDL_ScatterElementsParams_t*, sTIDL_AlgLayer_t*, sTIDL_Layer_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*): 250, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,h,minValueAcc,c,w,targetIndex,n,outChPitch,outNumCols,outChs,outNumRows,outBatches,outPitch,outRoiPitch,maxValueAcc,inIndicesNumRows) Generating NVIDIA GPU code 250, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 250, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+4]) [if not already present] Generating implicit copyin(params,update) [if not already present] 275, Generating copyin(data[:cpSize]) [if not already present] Generating copyout(accPtr[:cpSize]) [if not already present] Generating implicit firstprivate(cpSize) Generating NVIDIA GPU code 279, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 285, Generating copyin(data[:cpSize]) [if not already present] Generating copyout(accPtr[:cpSize]) [if not already present] Generating implicit firstprivate(cpSize) Generating NVIDIA GPU code 288, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 297, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,h,indDim,dataDim,n,outChPitch,outChs,outNumRows,outBatches,outPitch,outRoiPitch,updateIndex,targetIndex) Generating NVIDIA GPU code 297, #pragma acc loop gang /* blockIdx.x */ 328, #pragma acc loop vector(128) /* threadIdx.x */ 297, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+3]) [if not already present] Generating implicit copyin(update,params,outDataParams) [if not already present] 328, Loop is parallelizable Generating implicit firstprivate(minValueAcc,maxValueAcc) 341, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,n,outChPitch,outBatches,outChs,outRoiPitch,updateIndex,targetIndex) Generating NVIDIA GPU code 341, #pragma acc loop gang /* blockIdx.x */ 356, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ 358, /* threadIdx.x collapsed */ 341, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+2]) [if not already present] Generating implicit copyin(update,params,outDataParams) [if not already present] 356, Loop is parallelizable 358, Loop is parallelizable Generating implicit firstprivate(minValueAcc,maxValueAcc) 369, Generating copyin(accPtr[:outSize]) [if not already present] Generating copyout(output[:outSize]) [if not already present] 381, Generating implicit firstprivate(outSize) Generating NVIDIA GPU code 385, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 381, Generating implicit copyin(tidlLayer,outDataParams) [if not already present] 385, Generating implicit firstprivate(minValueOutput,shift,outAcc,maxValueOutput,cScale) 403, Generating implicit firstprivate(outSize) Generating NVIDIA GPU code 405, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 429, Generating copyout(data[:inIndicesNumCols+((inDataPitch*(inIndicesNumRows-1))+(((numInChannels-1)*inDataChPitch)+(numOutChannels*((numTotRoi-1)*inDataChPitch))))]) [if not already present] Generating copyin(update[:inIndicesNumCols+((inUpdatePitch*(inIndicesNumRows-1))+((inUpdateChPitch*(numInChannels-1))+(numInChannels*((numTotRoi-1)*inUpdateChPitch))))],indices[:inIndicesNumCols+((inIndicesPitch*(inIndicesNumRows-1))+(((inIndicesNumRows-1)*inIndicesChPitch)+(numInChannels*(inIndicesChPitch*(numTotRoi-1)))))]) [if not already present] Generating implicit firstprivate(inIndicesNumCols,inIndicesNumRows,numTotRoi,numInChannels) Generating NVIDIA GPU code 429, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 431, /* blockIdx.x threadIdx.x collapsed */ 433, /* blockIdx.x threadIdx.x collapsed */ 435, /* blockIdx.x threadIdx.x collapsed */ 435, Generating implicit firstprivate(inIndicesChPitch,inDataPitch,inUpdatePitch,inUpdateChPitch,inIndicesPitch,updateVal,status,inDataChPitch,axis,index,numOutChannels) 483, Generating copyout(output[:(outPitch*(outNumRows-1))+((outChPitch*(numOutChannels-1))+(numOutChannels*(outChPitch*(numTotRoi-1))))+1]) [if not already present] Generating copyin(data[:(outNumCols*(outNumRows-1))+((outNumRows*(outNumCols*(numOutChannels-1)))+(numOutChannels*(outNumRows*(outNumCols*(numTotRoi-1)))))+1]) [if not already present] Generating implicit firstprivate(numTotRoi,numOutChannels,outNumRows) Generating NVIDIA GPU code 483, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 485, /* blockIdx.x threadIdx.x collapsed */ 487, /* blockIdx.x threadIdx.x collapsed */ 487, Generating implicit firstprivate(outPitch,outNumCols,outChPitch) int _INTERNAL_26_src_tidl_scatterElements_c_9c1616d6::TIDL_refScatterElements(unsigned short*, int*, unsigned short*, unsigned short*, unsigned int*, TIDL_Obj*, int, sTIDL_ScatterElementsParams_t*, sTIDL_AlgLayer_t*, sTIDL_Layer_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*): 250, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,h,minValueAcc,c,w,targetIndex,n,outChPitch,outNumCols,outChs,outNumRows,outBatches,outPitch,outRoiPitch,maxValueAcc,inIndicesNumRows) Generating NVIDIA GPU code 250, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 250, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+4]) [if not already present] Generating implicit copyin(params,update) [if not already present] 275, Generating copyin(data[:cpSize]) [if not already present] Generating copyout(accPtr[:cpSize]) [if not already present] Generating implicit firstprivate(cpSize) Generating NVIDIA GPU code 279, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 285, Generating copyout(accPtr[:cpSize]) [if not already present] Generating copyin(data[:cpSize]) [if not already present] Generating implicit firstprivate(cpSize) Generating NVIDIA GPU code 288, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 297, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,h,indDim,dataDim,n,outChPitch,outChs,outNumRows,outBatches,outPitch,outRoiPitch,updateIndex,targetIndex) Generating NVIDIA GPU code 297, #pragma acc loop gang /* blockIdx.x */ 328, #pragma acc loop vector(128) /* threadIdx.x */ 297, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+3]) [if not already present] Generating implicit copyin(update,params,outDataParams) [if not already present] 328, Loop is parallelizable Generating implicit firstprivate(minValueAcc,maxValueAcc) 341, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,n,outChPitch,outBatches,outChs,outRoiPitch,updateIndex,targetIndex) Generating NVIDIA GPU code 341, #pragma acc loop gang /* blockIdx.x */ 356, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ 358, /* threadIdx.x collapsed */ 341, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+2]) [if not already present] Generating implicit copyin(update,params,outDataParams) [if not already present] 356, Loop is parallelizable 358, Loop is parallelizable Generating implicit firstprivate(minValueAcc,maxValueAcc) 369, Generating copyin(accPtr[:outSize]) [if not already present] Generating copyout(output[:outSize]) [if not already present] 381, Generating implicit firstprivate(outSize) Generating NVIDIA GPU code 385, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 381, Generating implicit copyin(tidlLayer,outDataParams) [if not already present] 385, Generating implicit firstprivate(minValueOutput,shift,outAcc,maxValueOutput,cScale) 403, Generating implicit firstprivate(outSize) Generating NVIDIA GPU code 405, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 429, Generating copyout(data[:inIndicesNumCols+((inDataPitch*(inIndicesNumRows-1))+(((numInChannels-1)*inDataChPitch)+(numOutChannels*((numTotRoi-1)*inDataChPitch))))]) [if not already present] Generating copyin(update[:inIndicesNumCols+((inUpdatePitch*(inIndicesNumRows-1))+((inUpdateChPitch*(numInChannels-1))+(numInChannels*((numTotRoi-1)*inUpdateChPitch))))],indices[:inIndicesNumCols+((inIndicesPitch*(inIndicesNumRows-1))+(((inIndicesNumRows-1)*inIndicesChPitch)+(numInChannels*(inIndicesChPitch*(numTotRoi-1)))))]) [if not already present] Generating implicit firstprivate(inIndicesNumCols,inIndicesNumRows,numTotRoi,numInChannels) Generating NVIDIA GPU code 429, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 431, /* blockIdx.x threadIdx.x collapsed */ 433, /* blockIdx.x threadIdx.x collapsed */ 435, /* blockIdx.x threadIdx.x collapsed */ 435, Generating implicit firstprivate(inIndicesChPitch,inDataPitch,inUpdatePitch,inUpdateChPitch,inIndicesPitch,updateVal,status,inDataChPitch,axis,index,numOutChannels) 483, Generating copyout(output[:(outPitch*(outNumRows-1))+((outChPitch*(numOutChannels-1))+(numOutChannels*(outChPitch*(numTotRoi-1))))+1]) [if not already present] Generating copyin(data[:(outNumCols*(outNumRows-1))+((outNumRows*(outNumCols*(numOutChannels-1)))+(numOutChannels*(outNumRows*(outNumCols*(numTotRoi-1)))))+1]) [if not already present] Generating implicit firstprivate(numTotRoi,numOutChannels,outNumRows) Generating NVIDIA GPU code 483, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 485, /* blockIdx.x threadIdx.x collapsed */ 487, /* blockIdx.x threadIdx.x collapsed */ 487, Generating implicit firstprivate(outPitch,outNumCols,outChPitch) int _INTERNAL_26_src_tidl_scatterElements_c_9c1616d6::TIDL_refScatterElements(unsigned short*, int*, unsigned short*, unsigned short*, unsigned short*, TIDL_Obj*, int, sTIDL_ScatterElementsParams_t*, sTIDL_AlgLayer_t*, sTIDL_Layer_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*): 250, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,h,minValueAcc,c,w,targetIndex,n,outChPitch,outNumCols,outChs,outNumRows,outBatches,outPitch,outRoiPitch,maxValueAcc,inIndicesNumRows) Generating NVIDIA GPU code 250, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 250, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+4]) [if not already present] Generating implicit copyin(params,update) [if not already present] 275, Generating copyin(data[:cpSize]) [if not already present] Generating copyout(accPtr[:cpSize]) [if not already present] Generating implicit firstprivate(cpSize) Generating NVIDIA GPU code 279, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 285, Generating copyin(data[:cpSize]) [if not already present] Generating copyout(accPtr[:cpSize]) [if not already present] Generating implicit firstprivate(cpSize) Generating NVIDIA GPU code 288, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 297, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,h,indDim,dataDim,n,outChPitch,outChs,outNumRows,outBatches,outPitch,outRoiPitch,updateIndex,targetIndex) Generating NVIDIA GPU code 297, #pragma acc loop gang /* blockIdx.x */ 328, #pragma acc loop vector(128) /* threadIdx.x */ 297, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+3]) [if not already present] Generating implicit copyin(update,params,outDataParams) [if not already present] 328, Loop is parallelizable Generating implicit firstprivate(minValueAcc,maxValueAcc) 341, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,n,outChPitch,outBatches,outChs,outRoiPitch,updateIndex,targetIndex) Generating NVIDIA GPU code 341, #pragma acc loop gang /* blockIdx.x */ 356, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ 358, /* threadIdx.x collapsed */ 341, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+2]) [if not already present] Generating implicit copyin(update,params,outDataParams) [if not already present] 356, Loop is parallelizable 358, Loop is parallelizable Generating implicit firstprivate(minValueAcc,maxValueAcc) 369, Generating copyin(accPtr[:outSize]) [if not already present] Generating copyout(output[:outSize]) [if not already present] 381, Generating implicit firstprivate(outSize) Generating NVIDIA GPU code 385, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 381, Generating implicit copyin(tidlLayer,outDataParams) [if not already present] 385, Generating implicit firstprivate(minValueOutput,shift,outAcc,maxValueOutput,cScale) 403, Generating implicit firstprivate(outSize) Generating NVIDIA GPU code 405, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 429, Generating copyout(data[:inIndicesNumCols+((inDataPitch*(inIndicesNumRows-1))+(((numInChannels-1)*inDataChPitch)+(numOutChannels*((numTotRoi-1)*inDataChPitch))))]) [if not already present] Generating copyin(update[:inIndicesNumCols+((inUpdatePitch*(inIndicesNumRows-1))+((inUpdateChPitch*(numInChannels-1))+(numInChannels*((numTotRoi-1)*inUpdateChPitch))))],indices[:inIndicesNumCols+((inIndicesPitch*(inIndicesNumRows-1))+(((inIndicesNumRows-1)*inIndicesChPitch)+(numInChannels*(inIndicesChPitch*(numTotRoi-1)))))]) [if not already present] Generating implicit firstprivate(inIndicesNumCols,inIndicesNumRows,numTotRoi,numInChannels) Generating NVIDIA GPU code 429, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 431, /* blockIdx.x threadIdx.x collapsed */ 433, /* blockIdx.x threadIdx.x collapsed */ 435, /* blockIdx.x threadIdx.x collapsed */ 435, Generating implicit firstprivate(inIndicesChPitch,inDataPitch,inUpdatePitch,inUpdateChPitch,inIndicesPitch,updateVal,status,inDataChPitch,axis,index,numOutChannels) 483, Generating copyout(output[:(outPitch*(outNumRows-1))+((outChPitch*(numOutChannels-1))+(numOutChannels*(outChPitch*(numTotRoi-1))))+1]) [if not already present] Generating copyin(data[:(outNumCols*(outNumRows-1))+((outNumRows*(outNumCols*(numOutChannels-1)))+(numOutChannels*(outNumRows*(outNumCols*(numTotRoi-1)))))+1]) [if not already present] Generating implicit firstprivate(numTotRoi,numOutChannels,outNumRows) Generating NVIDIA GPU code 483, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 485, /* blockIdx.x threadIdx.x collapsed */ 487, /* blockIdx.x threadIdx.x collapsed */ 487, Generating implicit firstprivate(outPitch,outNumCols,outChPitch) int _INTERNAL_26_src_tidl_scatterElements_c_9c1616d6::TIDL_refScatterElements(float*, float*, float*, float*, float*, TIDL_Obj*, int, sTIDL_ScatterElementsParams_t*, sTIDL_AlgLayer_t*, sTIDL_Layer_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*): 250, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,h,minValueAcc,c,w,targetIndex,n,outChPitch,outNumCols,outChs,outNumRows,outBatches,outPitch,outRoiPitch,maxValueAcc,inIndicesNumRows) Generating NVIDIA GPU code 250, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 250, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+4]) [if not already present] Generating implicit copyin(params,update) [if not already present] 275, Generating copyin(data[:cpSize]) [if not already present] Generating copyout(accPtr[:cpSize]) [if not already present] Generating implicit firstprivate(cpSize) Generating NVIDIA GPU code 279, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 285, Generating copyin(data[:cpSize]) [if not already present] Generating copyout(accPtr[:cpSize]) [if not already present] Generating implicit firstprivate(cpSize) Generating NVIDIA GPU code 288, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 297, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,h,indDim,dataDim,n,outChPitch,outChs,outNumRows,outBatches,outPitch,outRoiPitch,updateIndex,targetIndex) Generating NVIDIA GPU code 297, #pragma acc loop gang /* blockIdx.x */ 328, #pragma acc loop vector(128) /* threadIdx.x */ 297, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+3]) [if not already present] Generating implicit copyin(update,params,outDataParams) [if not already present] 328, Loop is parallelizable Generating implicit firstprivate(minValueAcc,maxValueAcc) 341, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,n,outChPitch,outBatches,outChs,outRoiPitch,updateIndex,targetIndex) Generating NVIDIA GPU code 341, #pragma acc loop gang /* blockIdx.x */ 356, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ 358, /* threadIdx.x collapsed */ 341, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+2]) [if not already present] Generating implicit copyin(update,params,outDataParams) [if not already present] 356, Loop is parallelizable 358, Loop is parallelizable Generating implicit firstprivate(minValueAcc,maxValueAcc) 369, Generating copyin(accPtr[:outSize]) [if not already present] Generating copyout(output[:outSize]) [if not already present] 381, Generating implicit firstprivate(outSize) Generating NVIDIA GPU code 385, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 381, Generating implicit copyin(tidlLayer,outDataParams) [if not already present] 385, Generating implicit firstprivate(minValueOutput,shift,outAcc,maxValueOutput,cScale) 403, Generating implicit firstprivate(outSize) Generating NVIDIA GPU code 405, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 429, Generating copyout(data[:inIndicesNumCols+((inDataPitch*(inIndicesNumRows-1))+(((numInChannels-1)*inDataChPitch)+(numOutChannels*((numTotRoi-1)*inDataChPitch))))]) [if not already present] Generating copyin(update[:inIndicesNumCols+((inUpdatePitch*(inIndicesNumRows-1))+((inUpdateChPitch*(numInChannels-1))+(numInChannels*((numTotRoi-1)*inUpdateChPitch))))],indices[:inIndicesNumCols+((inIndicesPitch*(inIndicesNumRows-1))+(((inIndicesNumRows-1)*inIndicesChPitch)+(numInChannels*(inIndicesChPitch*(numTotRoi-1)))))]) [if not already present] Generating implicit firstprivate(inIndicesNumCols,inIndicesNumRows,numTotRoi,numInChannels) Generating NVIDIA GPU code 429, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 431, /* blockIdx.x threadIdx.x collapsed */ 433, /* blockIdx.x threadIdx.x collapsed */ 435, /* blockIdx.x threadIdx.x collapsed */ 435, Generating implicit firstprivate(inIndicesChPitch,inDataPitch,inUpdatePitch,inUpdateChPitch,inIndicesPitch,updateVal,status,inDataChPitch,axis,index,numOutChannels) 483, Generating copyout(output[:(outPitch*(outNumRows-1))+((outChPitch*(numOutChannels-1))+(numOutChannels*(outChPitch*(numTotRoi-1))))+1]) [if not already present] Generating copyin(data[:(outNumCols*(outNumRows-1))+((outNumRows*(outNumCols*(numOutChannels-1)))+(numOutChannels*(outNumRows*(outNumCols*(numTotRoi-1)))))+1]) [if not already present] Generating implicit firstprivate(numTotRoi,numOutChannels,outNumRows) Generating NVIDIA GPU code 483, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 485, /* blockIdx.x threadIdx.x collapsed */ 487, /* blockIdx.x threadIdx.x collapsed */ 487, Generating implicit firstprivate(outPitch,outNumCols,outChPitch) int _INTERNAL_26_src_tidl_scatterElements_c_9c1616d6::TIDL_refScatterElements(float*, int*, float*, float*, float*, TIDL_Obj*, int, sTIDL_ScatterElementsParams_t*, sTIDL_AlgLayer_t*, sTIDL_Layer_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*, sTIDL_DataParams_t const*): 250, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,h,minValueAcc,c,w,targetIndex,n,outChPitch,outNumCols,outChs,outNumRows,outBatches,outPitch,outRoiPitch,maxValueAcc,inIndicesNumRows) Generating NVIDIA GPU code 250, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 250, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+4]) [if not already present] Generating implicit copyin(params,update) [if not already present] 275, Generating copyin(data[:cpSize]) [if not already present] Generating copyout(accPtr[:cpSize]) [if not already present] Generating implicit firstprivate(cpSize) Generating NVIDIA GPU code 279, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 285, Generating copyin(data[:cpSize]) [if not already present] Generating copyout(accPtr[:cpSize]) [if not already present] Generating implicit firstprivate(cpSize) Generating NVIDIA GPU code 288, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 297, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,h,indDim,dataDim,n,outChPitch,outChs,outNumRows,outBatches,outPitch,outRoiPitch,updateIndex,targetIndex) Generating NVIDIA GPU code 297, #pragma acc loop gang /* blockIdx.x */ 328, #pragma acc loop vector(128) /* threadIdx.x */ 297, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+3]) [if not already present] Generating implicit copyin(update,params,outDataParams) [if not already present] 328, Loop is parallelizable Generating implicit firstprivate(minValueAcc,maxValueAcc) 341, Generating implicit copyin(accPtr) [if not already present] Generating implicit firstprivate(inIndicesNumCols,c,inIndicesNumRows,n,outChPitch,outBatches,outChs,outRoiPitch,updateIndex,targetIndex) Generating NVIDIA GPU code 341, #pragma acc loop gang /* blockIdx.x */ 356, #pragma acc loop vector(128) collapse(2) /* threadIdx.x */ 358, /* threadIdx.x collapsed */ 341, Generating copyin(indices[:inIndicesNumCols*(inIndicesNumRows-1)+2]) [if not already present] Generating implicit copyin(update,params,outDataParams) [if not already present] 356, Loop is parallelizable 358, Loop is parallelizable Generating implicit firstprivate(minValueAcc,maxValueAcc) 369, Generating copyin(accPtr[:outSize]) [if not already present] Generating copyout(output[:outSize]) [if not already present] 381, Generating implicit firstprivate(outSize) Generating NVIDIA GPU code 385, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 381, Generating implicit copyin(tidlLayer,outDataParams) [if not already present] 385, Generating implicit firstprivate(minValueOutput,shift,outAcc,maxValueOutput,cScale) 403, Generating implicit firstprivate(outSize) Generating NVIDIA GPU code 405, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 429, Generating copyout(data[:inIndicesNumCols+((inDataPitch*(inIndicesNumRows-1))+(((numInChannels-1)*inDataChPitch)+(numOutChannels*((numTotRoi-1)*inDataChPitch))))]) [if not already present] Generating copyin(update[:inIndicesNumCols+((inUpdatePitch*(inIndicesNumRows-1))+((inUpdateChPitch*(numInChannels-1))+(numInChannels*((numTotRoi-1)*inUpdateChPitch))))],indices[:inIndicesNumCols+((inIndicesPitch*(inIndicesNumRows-1))+(((inIndicesNumRows-1)*inIndicesChPitch)+(numInChannels*(inIndicesChPitch*(numTotRoi-1)))))]) [if not already present] Generating implicit firstprivate(inIndicesNumCols,inIndicesNumRows,numTotRoi,numInChannels) Generating NVIDIA GPU code 429, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 431, /* blockIdx.x threadIdx.x collapsed */ 433, /* blockIdx.x threadIdx.x collapsed */ 435, /* blockIdx.x threadIdx.x collapsed */ 435, Generating implicit firstprivate(inIndicesChPitch,inDataPitch,inUpdatePitch,inUpdateChPitch,inIndicesPitch,updateVal,status,inDataChPitch,axis,index,numOutChannels) 483, Generating copyout(output[:(outPitch*(outNumRows-1))+((outChPitch*(numOutChannels-1))+(numOutChannels*(outChPitch*(numTotRoi-1))))+1]) [if not already present] Generating copyin(data[:(outNumCols*(outNumRows-1))+((outNumRows*(outNumCols*(numOutChannels-1)))+(numOutChannels*(outNumRows*(outNumCols*(numTotRoi-1)))))+1]) [if not already present] Generating implicit firstprivate(numTotRoi,numOutChannels,outNumRows) Generating NVIDIA GPU code 483, #pragma acc loop gang, vector(128) collapse(3) /* blockIdx.x threadIdx.x */ 485, /* blockIdx.x threadIdx.x collapsed */ 487, /* blockIdx.x threadIdx.x collapsed */ 487, Generating implicit firstprivate(outPitch,outNumCols,outChPitch) /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvaccvlke-0Mp_EK_.gpu -computecap 86 -ptx /tmp/nvaccLlkeVdjvoll7.ptx -o /tmp/nvaccnlkeNHXK1Kei.bin -ftz -cuda12020 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_scatterElements.c -sm 86 /tmp/nvaccnlkeNHXK1Kei.bin -compute 86 /tmp/nvaccLlkeVdjvoll7.ptx -o /tmp/nvacc1lkeFUDL8kjy.fat NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++JikePT5gjRTV.ll -S -o /tmp/nvc++7ikeXANBtCw_.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++7ikeXANBtCw_.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_scatterElements.obj Unlinking /tmp/nvc++ZikezUpqLNVx.il Unlinking /tmp/nvc++likeHHT3Chi_.s Unlinking /tmp/nvc++JikePT5gjRTV.ll Unlinking /tmp/nvc++7ikeXANBtCw_.llvm compiling src/tidl_shuffleChannel.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_shuffleChannel.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++gvkesqjKl2DY.il src/tidl_shuffleChannel.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_shuffleChannel.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_shuffleChannel.c -il /tmp/nvc++gvkesqjKl2DY.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++gvkesqjKl2DY.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_shuffleChannel.obj' -asm /tmp/nvc++MvkeYMTvSPoV.ll void TIDL_refShuffleChannel(float const*, float*, int, int, int, int, int, int, int, int, int, int, int, int, int): 123, Generating copy(pOut[:((height-1)*outLinePitch)+((numGroups*((NiPerG-1)*outChPitch))+(((numGroups-1)*outChPitch)+(((numROIs-1)*outROIPitch)+outPtrOffset)))+1]) [if not already present] Generating copyin(pIn[:(inLinePitch*(height-1))+((inChPitch*(NiPerG-1))+((NiPerG*(inChPitch*(numGroups-1)))+((inROIPitch*(numROIs-1))+inPtrOffset)))+1]) [if not already present] Generating implicit firstprivate(NiPerG,height,numROIs,numGroups) Generating NVIDIA GPU code 123, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 125, /* blockIdx.x threadIdx.x collapsed */ 127, /* blockIdx.x threadIdx.x collapsed */ 129, /* blockIdx.x threadIdx.x collapsed */ 129, Generating implicit firstprivate(inChPitch,elemSize,inLinePitch,inPtrOffset,outChPitch,inROIPitch,outLinePitch,outPtrOffset,width,outROIPitch) void TIDL_refShuffleChannel(unsigned char const*, unsigned char*, int, int, int, int, int, int, int, int, int, int, int, int, int): 123, Generating copy(pOut[:((height-1)*outLinePitch)+((numGroups*((NiPerG-1)*outChPitch))+(((numGroups-1)*outChPitch)+(((numROIs-1)*outROIPitch)+outPtrOffset)))+1]) [if not already present] Generating copyin(pIn[:(inLinePitch*(height-1))+((inChPitch*(NiPerG-1))+((NiPerG*(inChPitch*(numGroups-1)))+((inROIPitch*(numROIs-1))+inPtrOffset)))+1]) [if not already present] Generating implicit firstprivate(NiPerG,height,numROIs,numGroups) Generating NVIDIA GPU code 123, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 125, /* blockIdx.x threadIdx.x collapsed */ 127, /* blockIdx.x threadIdx.x collapsed */ 129, /* blockIdx.x threadIdx.x collapsed */ 129, Generating implicit firstprivate(inChPitch,elemSize,inLinePitch,inPtrOffset,outChPitch,inROIPitch,outLinePitch,outPtrOffset,width,outROIPitch) void TIDL_refShuffleChannel(unsigned short const*, unsigned short*, int, int, int, int, int, int, int, int, int, int, int, int, int): 123, Generating copy(pOut[:((height-1)*outLinePitch)+((numGroups*((NiPerG-1)*outChPitch))+(((numGroups-1)*outChPitch)+(((numROIs-1)*outROIPitch)+outPtrOffset)))+1]) [if not already present] Generating copyin(pIn[:(inLinePitch*(height-1))+((inChPitch*(NiPerG-1))+((NiPerG*(inChPitch*(numGroups-1)))+((inROIPitch*(numROIs-1))+inPtrOffset)))+1]) [if not already present] Generating implicit firstprivate(NiPerG,height,numROIs,numGroups) Generating NVIDIA GPU code 123, #pragma acc loop gang, vector(128) collapse(4) /* blockIdx.x threadIdx.x */ 125, /* blockIdx.x threadIdx.x collapsed */ 127, /* blockIdx.x threadIdx.x collapsed */ 129, /* blockIdx.x threadIdx.x collapsed */ 129, Generating implicit firstprivate(inChPitch,elemSize,inLinePitch,inPtrOffset,outChPitch,inROIPitch,outLinePitch,outPtrOffset,width,outROIPitch) /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvacc6ykeUZw8jBKW.gpu -computecap 86 -ptx /tmp/nvaccAykeoD3YPxgS.ptx -o /tmp/nvaccQyke_gZ_6u5R.bin -ftz -cuda12020 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_shuffleChannel.c -sm 86 /tmp/nvaccQyke_gZ_6u5R.bin -compute 86 /tmp/nvaccAykeoD3YPxgS.ptx -o /tmp/nvacc6ykeU_lsR9dI.fat NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++MvkeYMTvSPoV.ll -S -o /tmp/nvc++wvkecDAolXg5.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++wvkecDAolXg5.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_shuffleChannel.obj Unlinking /tmp/nvc++gvkesqjKl2DY.il Unlinking /tmp/nvc++2vkeIstGHQ-l.s Unlinking /tmp/nvc++MvkeYMTvSPoV.ll Unlinking /tmp/nvc++wvkecDAolXg5.llvm compiling src/tidl_slice.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_slice.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++YIkewfBgboDR.il src/tidl_slice.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_slice.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_slice.c -il /tmp/nvc++YIkewfBgboDR.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++YIkewfBgboDR.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_slice.obj' -asm /tmp/nvc++sIke2WrAIh5F.ll void TIDL_refSlice(float const*, float*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 124, Generating copyin(pIn[:outWidth+((inLinePitch*(outHeight-1))+((inChPitch*(numChs-1))+((inROIPitch*(numROIs-1))+inPtrOffset)))]) [if not already present] Generating copy(pOut[:outWidth+(((outHeight-1)*outLinePitch)+(((numChs-1)*outChPitch)+(((numROIs-1)*outROIPitch)+outPtrOffset)))]) [if not already present] Generating implicit firstprivate(outHeight,outWidth,numROIs,numDim1,numChs,numDim2) Generating NVIDIA GPU code 124, #pragma acc loop gang, vector(128) collapse(6) /* blockIdx.x threadIdx.x */ 126, /* blockIdx.x threadIdx.x collapsed */ 128, /* blockIdx.x threadIdx.x collapsed */ 130, /* blockIdx.x threadIdx.x collapsed */ 132, /* blockIdx.x threadIdx.x collapsed */ 134, /* blockIdx.x threadIdx.x collapsed */ 134, Generating implicit firstprivate(inDim2Pitch,inDim1Pitch,inChPitch,inLinePitch,inPtrOffset,outDim2Pitch,inROIPitch,outDim1Pitch,outChPitch,outLinePitch,outROIPitch,outPtrOffset) void TIDL_refSlice(unsigned char const*, unsigned char*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 124, Generating copy(pOut[:outWidth+(((outHeight-1)*outLinePitch)+(((numChs-1)*outChPitch)+(((numROIs-1)*outROIPitch)+outPtrOffset)))]) [if not already present] Generating copyin(pIn[:outWidth+((inLinePitch*(outHeight-1))+((inChPitch*(numChs-1))+((inROIPitch*(numROIs-1))+inPtrOffset)))]) [if not already present] Generating implicit firstprivate(outHeight,outWidth,numROIs,numDim1,numChs,numDim2) Generating NVIDIA GPU code 124, #pragma acc loop gang, vector(128) collapse(6) /* blockIdx.x threadIdx.x */ 126, /* blockIdx.x threadIdx.x collapsed */ 128, /* blockIdx.x threadIdx.x collapsed */ 130, /* blockIdx.x threadIdx.x collapsed */ 132, /* blockIdx.x threadIdx.x collapsed */ 134, /* blockIdx.x threadIdx.x collapsed */ 134, Generating implicit firstprivate(inDim2Pitch,inDim1Pitch,inChPitch,inLinePitch,inPtrOffset,outDim2Pitch,inROIPitch,outDim1Pitch,outChPitch,outLinePitch,outROIPitch,outPtrOffset) void TIDL_refSlice(unsigned short const*, unsigned short*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int): 124, Generating copyin(pIn[:outWidth+((inLinePitch*(outHeight-1))+((inChPitch*(numChs-1))+((inROIPitch*(numROIs-1))+inPtrOffset)))]) [if not already present] Generating copy(pOut[:outWidth+(((outHeight-1)*outLinePitch)+(((numChs-1)*outChPitch)+(((numROIs-1)*outROIPitch)+outPtrOffset)))]) [if not already present] Generating implicit firstprivate(outHeight,outWidth,numROIs,numDim1,numChs,numDim2) Generating NVIDIA GPU code 124, #pragma acc loop gang, vector(128) collapse(6) /* blockIdx.x threadIdx.x */ 126, /* blockIdx.x threadIdx.x collapsed */ 128, /* blockIdx.x threadIdx.x collapsed */ 130, /* blockIdx.x threadIdx.x collapsed */ 132, /* blockIdx.x threadIdx.x collapsed */ 134, /* blockIdx.x threadIdx.x collapsed */ 134, Generating implicit firstprivate(inDim2Pitch,inDim1Pitch,inChPitch,inLinePitch,inPtrOffset,outDim2Pitch,inROIPitch,outDim1Pitch,outChPitch,outLinePitch,outROIPitch,outPtrOffset) /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -usenvvm -nvvm70 -reloc /tmp/nvaccOLke4r0CR1C0.gpu -computecap 86 -ptx /tmp/nvaccOLke4NkgR3Fq.ptx -o /tmp/nvaccOLke4MFRO0EW.bin -ftz -cuda12020 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/nvdd -dcuda /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -reloc -cuda12020 -fat src/tidl_slice.c -sm 86 /tmp/nvaccOLke4MFRO0EW.bin -compute 86 /tmp/nvaccOLke4NkgR3Fq.ptx -o /tmp/nvaccOLke4DCBjyvW.fat NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++sIke2WrAIh5F.ll -S -o /tmp/nvc++IIkeM3F9GyqY.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++IIkeM3F9GyqY.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_slice.obj Unlinking /tmp/nvc++YIkewfBgboDR.il Unlinking /tmp/nvc++cIkegWZ6yfxY.s Unlinking /tmp/nvc++sIke2WrAIh5F.ll Unlinking /tmp/nvc++IIkeM3F9GyqY.llvm compiling src/tidl_softmax.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_softmax.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++0VkeCHKPxaXr.il src/tidl_softmax.c "src/tidl_softmax.c", line 306: warning: variable "maxIndex" was set but never used [set_but_not_used] int32_t maxIndex = 0; ^ Remark: individual warnings can be suppressed with "--diag_suppress " /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_softmax.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_softmax.c -il /tmp/nvc++0VkeCHKPxaXr.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++0VkeCHKPxaXr.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_softmax.obj' -asm /tmp/nvc++0VkeCuoBug2A.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++0VkeCuoBug2A.ll -S -o /tmp/nvc++uVke83aFMJEE.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++uVke83aFMJEE.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_softmax.obj Unlinking /tmp/nvc++0VkeCHKPxaXr.il Unlinking /tmp/nvc++uVke8Sia_9gn.s Unlinking /tmp/nvc++0VkeCuoBug2A.ll Unlinking /tmp/nvc++uVke83aFMJEE.llvm compiling src/tidl_squeeze.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_squeeze.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++43keOrNw7iqF.il src/tidl_squeeze.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_squeeze.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_squeeze.c -il /tmp/nvc++43keOrNw7iqF.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++43keOrNw7iqF.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_squeeze.obj' -asm /tmp/nvc++43keOtTF4jIB.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++43keOtTF4jIB.ll -S -o /tmp/nvc++43keOWx5QnI0.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++43keOWx5QnI0.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_squeeze.obj Unlinking /tmp/nvc++43keOrNw7iqF.il Unlinking /tmp/nvc++43keOEHNYObI.s Unlinking /tmp/nvc++43keOtTF4jIB.ll Unlinking /tmp/nvc++43keOWx5QnI0.llvm compiling src/tidl_transpose.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/tidl_transpose.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++7-keXrgkqPA7.il src/tidl_transpose.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/tidl_transpose.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/tidl_transpose.c -il /tmp/nvc++7-keXrgkqPA7.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++7-keXrgkqPA7.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_transpose.obj' -asm /tmp/nvc++R-kebF8Cco9g.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++R-kebF8Cco9g.ll -S -o /tmp/nvc++d-kejxaMj5QL.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++d-kejxaMj5QL.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_transpose.obj Unlinking /tmp/nvc++7-keXrgkqPA7.il Unlinking /tmp/nvc++t-ke5FyeHmHk.s Unlinking /tmp/nvc++R-kebF8Cco9g.ll Unlinking /tmp/nvc++d-kejxaMj5QL.llvm compiling src/workload_ref_exec.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk src/workload_ref_exec.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I./inc -I./inc/dummy -I../inc -I../../common -I../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++_hle6PX8ULVx.il src/workload_ref_exec.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 src/workload_ref_exec.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn src/workload_ref_exec.c -il /tmp/nvc++_hle6PX8ULVx.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 0 0x1000000 -x 2 0x100000 -x 0 0x2000000 -x 161 16384 -x 162 16384 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++_hle6PX8ULVx.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -Minfo=accel -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ./inc -I ./inc/dummy -I ../inc -I ../../common -I ../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/workload_ref_exec.obj' -asm /tmp/nvc++EhleAc2ypyzY.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++EhleAc2ypyzY.ll -S -o /tmp/nvc++ohleQvMgT4jT.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++ohleQvMgT4jT.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/workload_ref_exec.obj Unlinking /tmp/nvc++_hle6PX8ULVx.il Unlinking /tmp/nvc++Uhlek5_zv0hJ.s Unlinking /tmp/nvc++EhleAc2ypyzY.ll Unlinking /tmp/nvc++ohleQvMgT4jT.llvm r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/printv.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_alg.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_alg_utils.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_argmax.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_batchNorm.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_batchReshape.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_colorConversion.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_commonUtils.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_concat.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_const.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_conv2d_base.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_crop.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_custom_int.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_dataConvert.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_deconv2d.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_depthToSpace.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_detectionOutput.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_detectionOutput_score.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_device_functions.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_eltWise.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_flatten.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_function_mapping.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_gatherLayer.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_innerProduct.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_layerNorm.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_odOutputReformat.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_pad.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_pooling.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_preEmption.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_reduce.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_reshape.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_resize.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_roiPooling.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_scatterElements.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_shuffleChannel.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_slice.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_softmax.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_squeeze.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/tidl_transpose.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/./src/workload_ref_exec.obj make[1]: Leaving directory '/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/ti_dl/algo' make -C ./ti_dl/algo/src/avx -f makefile make[1]: Entering directory '/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/ti_dl/algo/src/avx' compiling tidl_avx.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk tidl_avx.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__AVX2__ -D__FMA__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I../../inc -I../../../inc -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/packages -I/packages/ti/mathlib -I../../../../common -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/inc -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I../../../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv -I../../../../common -I../../../../common/c6xsim --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++QFle_E0oZnQy.il tidl_avx.c "tidl_avx.c", line 306: warning: integer conversion resulted in a change of sign [integer_sign_change] mask[j]=1<<31; ^ Remark: individual warnings can be suppressed with "--diag_suppress " "tidl_avx.c", line 237: warning: variable "i2" was declared but never referenced [declared_but_not_referenced] int32_t i0, i2, i3, i4, i5, i6, i7, j; ^ "tidl_avx.c", line 242: warning: variable "inData" was declared but never referenced [declared_but_not_referenced] float inData; ^ "tidl_avx.c", line 243: warning: variable "acc" was declared but never referenced [declared_but_not_referenced] float acc; ^ "tidl_avx.c", line 246: warning: variable "lmin" was declared but never referenced [declared_but_not_referenced] float lmin = *min; ^ "tidl_avx.c", line 247: warning: variable "lmax" was declared but never referenced [declared_but_not_referenced] float lmax = *max; ^ "tidl_avx.c", line 168: warning: integer conversion resulted in a change of sign [integer_sign_change] mask[j]=1<<31; ^ "tidl_avx.c", line 99: warning: variable "i2" was declared but never referenced [declared_but_not_referenced] int32_t i0, i2, i3, i4, i5, i6, i7, j; ^ "tidl_avx.c", line 104: warning: variable "inData" was declared but never referenced [declared_but_not_referenced] float inData; ^ "tidl_avx.c", line 105: warning: variable "acc" was declared but never referenced [declared_but_not_referenced] float acc; ^ "tidl_avx.c", line 108: warning: variable "lmin" was declared but never referenced [declared_but_not_referenced] float lmin = *min; ^ "tidl_avx.c", line 109: warning: variable "lmax" was declared but never referenced [declared_but_not_referenced] float lmax = *max; ^ "tidl_avx.c", line 1555: warning: variable "rem" was declared but never referenced [declared_but_not_referenced] int rem = (inWidth)%8; //Leftovers ^ "tidl_avx.c", line 1534: warning: variable "outOffset" was declared but never referenced [declared_but_not_referenced] int outOffset,outAcc; ^ "tidl_avx.c", line 1534: warning: variable "outAcc" was declared but never referenced [declared_but_not_referenced] int outOffset,outAcc; ^ "tidl_avx.c", line 1535: warning: variable "i3" was declared but never referenced [declared_but_not_referenced] int i1,i2,i3,ik; ^ "tidl_avx.c", line 1535: warning: variable "ik" was declared but never referenced [declared_but_not_referenced] int i1,i2,i3,ik; ^ "tidl_avx.c", line 1419: warning: variable "i3" was declared but never referenced [declared_but_not_referenced] int i1,i2,i3; ^ "tidl_avx.c", line 376: warning: variable "i2" was declared but never referenced [declared_but_not_referenced] int32_t i0, i2, i3, i4, i5, i6, i7, j; ^ detected during instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1727 "tidl_avx.c", line 719: warning: variable "i2" was declared but never referenced [declared_but_not_referenced] int32_t i0, i2, i3, i4, i5, i6, i7, j; ^ detected during instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1727 "tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1652 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1727 "tidl_avx.c", line 1067: warning: variable "i2" was declared but never referenced [declared_but_not_referenced] int32_t i0, i2, i3, i4, i5, i6, i7, j; ^ detected during instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1727 "tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1657 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1727 "tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1670 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1727 "tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1675 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1727 "tidl_avx.c", line 548: warning: variable "i2" was declared but never referenced [declared_but_not_referenced] int32_t i0, i2, i3, i4, i5, i6, i7, j; ^ detected during instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1727 "tidl_avx.c", line 894: warning: variable "i2" was declared but never referenced [declared_but_not_referenced] int32_t i0, i2, i3, i4, i5, i6, i7, j; ^ detected during instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1727 "tidl_avx.c", line 970: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1689 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1727 "tidl_avx.c", line 1242: warning: variable "i2" was declared but never referenced [declared_but_not_referenced] int32_t i0, i2, i3, i4, i5, i6, i7, j; ^ detected during instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1727 "tidl_avx.c", line 1318: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1694 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1727 "tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=int8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1652 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1745 "tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=int8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1657 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1745 "tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=int8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1670 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1745 "tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=int8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1675 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1745 "tidl_avx.c", line 970: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1689 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1745 "tidl_avx.c", line 1318: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1694 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1745 "tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=uint16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1652 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1837 "tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=uint16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1657 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1837 "tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=uint16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1670 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1837 "tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=uint16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1675 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1837 "tidl_avx.c", line 970: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1689 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1837 "tidl_avx.c", line 1318: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1694 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1837 "tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=int16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1652 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1855 "tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=int16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1657 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1855 "tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=int16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1670 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1855 "tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=int16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1675 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1855 "tidl_avx.c", line 970: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1689 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1855 "tidl_avx.c", line 1318: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1694 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int16_t, Tacc=int32_t]" at line 1855 "tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=int8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1652 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1875 "tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=int8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1657 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1875 "tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=int8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1670 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1875 "tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=int8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1675 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1875 "tidl_avx.c", line 970: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1689 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1875 "tidl_avx.c", line 1318: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1694 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1875 "tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=uint8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1652 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1894 "tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=uint8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1657 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1894 "tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=uint8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1670 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1894 "tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=uint8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1675 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1894 "tidl_avx.c", line 970: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1689 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1894 "tidl_avx.c", line 1318: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1694 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint8_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1894 "tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=uint16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1652 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1913 "tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=uint16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1657 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1913 "tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=uint16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1670 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1913 "tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=uint16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1675 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1913 "tidl_avx.c", line 970: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1689 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1913 "tidl_avx.c", line 1318: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1694 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=uint16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1913 "tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=int16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1652 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1932 "tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=1, Tin=int16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1657 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1932 "tidl_avx.c", line 795: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=int16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1670 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1932 "tidl_avx.c", line 1143: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bit(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Ksize=3, Tin=int16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1675 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1932 "tidl_avx.c", line 970: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1689 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1932 "tidl_avx.c", line 1318: warning: shift count is too large [shift_count_too_large] mask[j]=((Tacc)1)<<63; ^ detected during: instantiation of "void TIDL_refConv2dKernelAvxIn8bitProc16bitGen(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1694 instantiation of "void TIDL_refConv2dKernelAvxProc(Tin *, Tw *, Tb *, Tacc *, Tacc *, Tacc *, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t, int32_t) [with Tin=int16_t, Tw=int8_t, Tb=int32_t, Tacc=int32_t]" at line 1932 /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 tidl_avx.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn tidl_avx.c -il /tmp/nvc++QFle_E0oZnQy.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++QFle_E0oZnQy.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ../../inc -I ../../../inc -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /packages -I /packages/ti/mathlib -I ../../../../common -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/inc -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I ../../../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -I ../../../../common -I ../../../../common/c6xsim -mavx -msse4.2 -msse4.1 -mssse3 -msse3 -msse2 -msse -mxsave -mavx2 -mavx -msse4.2 -msse4.1 -mssse3 -msse3 -msse2 -msse -mxsave -mfma -mavx -msse4.2 -msse4.1 -mssse3 -msse3 -msse2 -msse -mxsave -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/src/avx/./tidl_avx.obj' -asm /tmp/nvc++kFleEmB6qmN1.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++kFleEmB6qmN1.ll -S -o /tmp/nvc++AFleoW068JOW.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++AFleoW068JOW.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+avx2 -mattr=+fma -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/src/avx/./tidl_avx.obj Unlinking /tmp/nvc++QFle_E0oZnQy.il Unlinking /tmp/nvc++6FleUwyIET-r.s Unlinking /tmp/nvc++kFleEmB6qmN1.ll Unlinking /tmp/nvc++AFleoW068JOW.llvm r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/src/avx/./tidl_avx.obj make[1]: Leaving directory '/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/ti_dl/algo/src/avx' . ======== MAKING TIDL PRIV ALGO ================= make -C ./ti_dl/algo/src/priv -f makefile make[1]: Entering directory '/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/ti_dl/algo/src/priv' compiling tidl_model_play.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk tidl_model_play.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I../../inc -I../../../inc -I../../../../common -I../../../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++M5leY_tMRCOB.il tidl_model_play.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 tidl_model_play.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn tidl_model_play.c -il /tmp/nvc++M5leY_tMRCOB.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++M5leY_tMRCOB.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ../../inc -I ../../../inc -I ../../../../common -I ../../../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/src/priv/./tidl_model_play.obj' -asm /tmp/nvc++g5lesDsL2WKa.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++g5lesDsL2WKa.ll -S -o /tmp/nvc++25leIhMkOwHC.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++25leIhMkOwHC.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/src/priv/./tidl_model_play.obj Unlinking /tmp/nvc++M5leY_tMRCOB.il Unlinking /tmp/nvc++w5lec1ZJl4i-.s Unlinking /tmp/nvc++g5lesDsL2WKa.ll Unlinking /tmp/nvc++25leIhMkOwHC.llvm compiling tidl_stalgo.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk tidl_stalgo.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I../../inc -I../../../inc -I../../../../common -I../../../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++FbmeD7_JEsuG.il tidl_stalgo.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 tidl_stalgo.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn tidl_stalgo.c -il /tmp/nvc++FbmeD7_JEsuG.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++FbmeD7_JEsuG.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ../../inc -I ../../../inc -I ../../../../common -I ../../../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/src/priv/./tidl_stalgo.obj' -asm /tmp/nvc++pbmeTpPXpitt.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++pbmeTpPXpitt.ll -S -o /tmp/nvc++hbmevntIRNr-.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++hbmevntIRNr-.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/src/priv/./tidl_stalgo.obj Unlinking /tmp/nvc++FbmeD7_JEsuG.il Unlinking /tmp/nvc++xbmefxwpAIlC.s Unlinking /tmp/nvc++pbmeTpPXpitt.ll Unlinking /tmp/nvc++hbmevntIRNr-.llvm compiling tidl_stalgo_workload.c Export PGI_CURR_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 Export NVHPC_CURRENT_CUDA_VERSION=12.2.53 Export NVCOMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/23.7 Export PGI=/opt/nvidia/hpc_sdk tidl_stalgo_workload.c: /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp1 --llalign -Dunix -D__unix -D__unix__ -Dlinux -D__linux -D__linux__ -D__NO_MATH_INLINES -D__LP64__ -D__x86_64 -D__x86_64__ -D__LONG_MAX__=9223372036854775807L '-D__SIZE_TYPE__=unsigned long int' '-D__PTRDIFF_TYPE__=long int' -D__amd64 -D__amd64__ -D__k8 -D__k8__ -D__MMX__ -D__SSE__ -D__SSE2__ -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__ -D__AVX__ -D__XSAVE__ -D__XSAVEOPT__ -D__POPCNT__ -D__AES__ -D__PCLMUL__ -D__PGI -D__NVCOMPILER -D_GNU_SOURCE -D_PGCG_SOURCE --c++14 -I- -I/home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I/usr/local/include -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I./source -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I. -I../../inc -I../../../inc -I../../../../common -I../../../utils/perfsim -Isrc/tidsp/inc -Isrc/tidsp/inc_priv --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/include-stdexec --sys_include /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2/include --sys_include /usr/include/c++/11 --sys_include /usr/include/x86_64-linux-gnu/c++/11 --sys_include /usr/include/c++/11/backward --sys_include /usr/lib/gcc/x86_64-linux-gnu/11/include --sys_include /usr/local/include --sys_include /usr/include/x86_64-linux-gnu --sys_include /usr/include -D__PGLLVM__ -D__NVCOMPILER_LLVM__ -D__extension__= -D_ACCEL=201003 -D_OPENACC=201711 -DCUDA_VERSION=12020 -DPGI_TESLA_TARGET -D__C7X_UNSTABLE_API -D__C7120__ -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -D__PIC__ --preinclude _cplus_preinclude.h --preinclude_macros _cplus_macros.h --gnu_version=110400 -D__pgnu_vsn=110400 --no_fixed_bp --accel --preinclude openacc_predef.h -D_NVHPC_RDC -q -o /tmp/nvc++Njme1YbcLACW.il tidl_stalgo_workload.c /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/bin/tools/cpp2 tidl_stalgo_workload.c -opt 3 -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 -x 127 4 -x 127 17 -x 19 0x400000 -x 28 0x40000 -x 120 0x10000000 -x 70 0x8000 -x 122 1 -x 125 0x20000 -quad -vect 56 -y 34 16 -x 37 0x480000 -x 34 0x8 -y 19 8 -y 35 0 -x 42 0x30 -x 39 0x40 -x 199 10 -x 39 0x80 -x 59 4 -tp px -x 120 0x1000 -astype 0 -x 121 1 -fn tidl_stalgo_workload.c -il /tmp/nvc++Njme1YbcLACW.il -x 117 0x200 -x 123 0x80000000 -x 123 4 -x 119 0x20 -def __pgnu_vsn=110400 -x 70 0x40000000 -x 183 4 -x 121 0x800 -x 6 0x20000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -autoinl 10 -x 168 400 -x 174 128000 -x 14 0x200000 -x 14 0x400000 -x 249 160 -x 120 0x200000 -x 70 0x40000000 -x 8 0x40000000 -x 164 0x800000 -x 71 0x2000 -x 71 0x4000 -x 34 0x40000000 -x 83 0x1 -x 85 0x1 -x 15 0x1000000 -x 15 0x4 -x 206 0x02 -x 120 0x1000000 -x 73 0x04 -x 68 0x1 -x 39 4 -x 56 0x10 -x 26 0x10 -x 26 1 -x 56 0x4000 -accel tesla -accel host -x 197 0 -x 175 0 -x 203 0 -x 204 0 -x 180 0x4000400 -x 121 0xc00 -x 186 0x80 -x 180 0x4000400 -x 121 0xc00 -x 194 0x40000 -x 163 0x1 -x 186 0x80000 -cudaver 12020 -x 176 0x100 -cudacap 86 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 176 0x100 -cudacap 86 -x 189 0x8000 -y 163 0xc0000000 -x 189 0x10 -y 189 0x4000000 -cudaroot /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/cuda/12.2 -x 187 0x40000 -x 187 0x8000000 -x 60 512 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 129 2 -y 129 0x8000 -quad -x 119 0x10000000 -x 129 0x40000000 -x 56 0x2 -x 9 1 -x 42 0x14200000 -x 72 0x1 -x 136 0x11 -x 62 8 -gnuvsn 110400 -x 69 0x200 -x 123 0x400 -cmdline '+nvc++ /tmp/nvc++Njme1YbcLACW.il -tp=px -c -fast -Mvect=simd -Mflushz -Mcache_align -Mno-signed-zeros -acc -gpu=cc86 -v -D__C7X_UNSTABLE_API -D__C7120__ -std=c++14 -O3 -Mvect=simd -Mflushz -Mcache_align -Mrecip-div -Mfactorize -Mno-signed-zeros -DHOST_EMULATION -D_HOST_BUILD -DGCC_BUILD -DCORE_DSP -D_TMS320C6600 -DLITTLE_ENDIAN_HOST -DSOC_J721S2 -DBUILD_WITH_OPENACC -D__C7X_HOSTEM__ -DGLIBCXX_CHECK_MATH11_PROTO -D_FILE_OFFSET_BITS=64 -D_TMS320C6600 -I /home/seunghun/ti/ti-cgt-c7000_4.1.0.LTS/host_emulation/include/C7120/ -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/cnn_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/mmalib_09_02_00_08/ti/mmalib/src/linalg_c7xmma -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/ivision/ti/xdais -I /usr/local/include -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages -I ./source -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils -I /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/pdk_j721s2_09_02_00_30/packages/ti/drv/udma/dmautils/udma_standalone -I . -I ../../inc -I ../../../inc -I ../../../../common -I ../../../utils/perfsim -I src/tidsp/inc -I src/tidsp/inc_priv -fPIC -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/src/priv/./tidl_stalgo_workload.obj' -asm /tmp/nvc++xjmefvr7oMka.ll NVC++/x86-64 Linux 23.7-0: compilation successful /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/opt '-passes=default' -opaque-pointers -slp-vectorize-hor=true -override-aa-for-tbaa=true -mcpu=x86-64 /tmp/nvc++xjmefvr7oMka.ll -S -o /tmp/nvc++pjmeT978iBlr.llvm /opt/nvidia/hpc_sdk/Linux_x86_64/23.7/compilers/share/llvm/bin/llc /tmp/nvc++pjmeT978iBlr.llvm -march=x86-64 -mcpu=x86-64 -mattr=+mmx -mattr=+sse -mattr=+sse2 -mattr=+sse3 -mattr=+ssse3 -mattr=+sse4.1 -mattr=+sse4.2 -mattr=+avx -mattr=+xsave -mattr=+xsaveopt -mattr=+popcnt -mattr=+aes -mattr=+pclmul -O3 -opaque-pointers -non-global-value-max-name-size=4294967295 -x86-cmov-converter=0 -dwarf-directory=false --align-all-functions=6 -override-aa-for-tbaa=true -relocation-model=pic -filetype=obj --frame-pointer=none -o /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/src/priv/./tidl_stalgo_workload.obj Unlinking /tmp/nvc++Njme1YbcLACW.il Unlinking /tmp/nvc++FjmeDjWHDfP-.s Unlinking /tmp/nvc++xjmefvr7oMka.ll Unlinking /tmp/nvc++pjmeT978iBlr.llvm r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/src/priv/./tidl_model_play.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/src/priv/./tidl_stalgo.obj r - /home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/out/PC/dsp/algo/release/ti_dl/algo/src/priv/./tidl_stalgo_workload.obj make[1]: Leaving directory '/home/seunghun/strad/svnet3/src_tda4x/platforms/92_j721s2/c7x-mma-tidl/ti_dl/algo/src/priv' . ======== MAKING TIDL AND CUSTOM LIBRARIES =================