Running causal_conv1d benchmark on cuda with 24 workloads.
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 460.509us 2386.43% 460.509us 460.509us 1
+ torch_eager 10.46% 229.787us 99.65% 2.189ms 2.189ms 0.000us 0.00% 21.633us 21.633us 1
+ aten::to 0.59% 12.913us 79.38% 1.743ms 290.578us 0.000us 0.00% 14.272us 2.379us 6
+ aten::_to_copy 1.99% 43.750us 78.79% 1.731ms 288.426us 0.000us 0.00% 14.272us 2.379us 6
+ aten::copy_ 2.89% 63.562us 74.16% 1.629ms 271.469us 11.936us 61.85% 14.272us 2.379us 6
+ aten::conv1d 0.44% 9.671us 7.66% 168.306us 56.102us 0.000us 0.00% 7.361us 2.454us 3
+ aten::convolution 0.72% 15.890us 7.22% 158.635us 52.878us 0.000us 0.00% 7.361us 2.454us 3
+ aten::_convolution 1.69% 37.102us 6.50% 142.745us 47.582us 0.000us 0.00% 7.361us 2.454us 3
+ aten::_conv_depthwise2d 1.60% 35.230us 3.77% 82.773us 27.591us 7.361us 38.15% 7.361us 2.454us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.361us 38.15% 7.361us 2.454us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.272us 32.50% 6.272us 2.091us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664us 29.35% 5.664us 1.888us 3
+ Activity Buffer Request 68.26% 1.499ms 68.26% 1.499ms 1.499ms 2.336us 12.11% 2.336us 2.336us 1
+ aten::empty_strided 2.64% 57.992us 2.64% 57.992us 9.665us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 4.12% 90.443us 4.12% 90.443us 10.049us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 1.47% 32.392us 1.88% 41.212us 4.579us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.64% 14.011us 0.64% 14.011us 0.934us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.55% 12.120us 0.55% 12.120us 4.040us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.50% 10.961us 0.50% 10.961us 3.654us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.43% 9.410us 0.51% 11.220us 3.740us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.196ms
+Self CUDA time total: 19.297us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 350.557us 1795.89% 350.557us 350.557us 1
+ torch_eager 6.82% 130.236us 99.71% 1.905ms 1.905ms 0.000us 0.00% 21.632us 21.632us 1
+ aten::to 0.35% 6.597us 84.97% 1.623ms 270.580us 0.000us 0.00% 13.728us 2.288us 6
+ aten::_to_copy 1.27% 24.323us 84.63% 1.617ms 269.481us 0.000us 0.00% 13.728us 2.288us 6
+ aten::copy_ 2.68% 51.130us 81.67% 1.560ms 260.072us 11.616us 59.51% 13.728us 2.288us 6
+ aten::conv1d 0.33% 6.400us 6.43% 122.914us 40.971us 0.000us 0.00% 7.904us 2.635us 3
+ aten::convolution 0.52% 9.901us 6.10% 116.514us 38.838us 0.000us 0.00% 7.904us 2.635us 3
+ aten::_convolution 1.28% 24.410us 5.58% 106.613us 35.538us 0.000us 0.00% 7.904us 2.635us 3
+ aten::_conv_depthwise2d 1.25% 23.932us 3.35% 63.983us 21.328us 7.904us 40.49% 7.904us 2.635us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 40.49% 7.904us 2.635us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.080us 31.15% 6.080us 2.027us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.536us 28.36% 5.536us 1.845us 3
+ Activity Buffer Request 76.19% 1.456ms 76.19% 1.456ms 1.456ms 2.112us 10.82% 2.112us 2.112us 1
+ aten::empty_strided 1.68% 32.131us 1.68% 32.131us 5.355us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 3.93% 75.003us 3.93% 75.003us 8.334us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.97% 18.540us 1.29% 24.620us 2.736us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.51% 9.711us 0.51% 9.711us 0.647us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.51% 9.650us 0.51% 9.650us 3.217us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.47% 9.000us 0.47% 9.000us 3.000us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.37% 7.100us 0.45% 8.560us 2.853us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.911ms
+Self CUDA time total: 19.520us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 379.390us 2047.55% 379.390us 379.390us 1
+ torch_eager 8.20% 159.835us 99.65% 1.942ms 1.942ms 0.000us 0.00% 20.449us 20.449us 1
+ aten::to 0.37% 7.179us 83.32% 1.624ms 270.686us 0.000us 0.00% 13.536us 2.256us 6
+ aten::_to_copy 1.40% 27.213us 82.96% 1.617ms 269.489us 0.000us 0.00% 13.536us 2.256us 6
+ aten::copy_ 2.62% 51.160us 79.92% 1.558ms 259.635us 11.616us 62.69% 13.536us 2.256us 6
+ aten::conv1d 0.34% 6.560us 6.49% 126.453us 42.151us 0.000us 0.00% 6.913us 2.304us 3
+ aten::convolution 0.57% 11.119us 6.15% 119.893us 39.964us 0.000us 0.00% 6.913us 2.304us 3
+ aten::_convolution 1.29% 25.191us 5.58% 108.774us 36.258us 0.000us 0.00% 6.913us 2.304us 3
+ aten::_conv_depthwise2d 1.16% 22.580us 3.36% 65.502us 21.834us 6.913us 37.31% 6.913us 2.304us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 6.913us 37.31% 6.913us 2.304us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 31.95% 5.920us 1.973us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.696us 30.74% 5.696us 1.899us 3
+ Activity Buffer Request 74.82% 1.458ms 74.82% 1.458ms 1.458ms 1.920us 10.36% 1.920us 1.920us 1
+ aten::empty_strided 1.64% 31.911us 1.64% 31.911us 5.319us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 3.59% 70.043us 3.59% 70.043us 7.783us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 1.01% 19.612us 1.35% 26.392us 2.932us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.55% 10.750us 0.55% 10.750us 0.717us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.62% 12.182us 0.62% 12.182us 4.061us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.46% 8.910us 0.46% 8.910us 2.970us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.35% 6.890us 0.42% 8.260us 2.753us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.949ms
+Self CUDA time total: 18.529us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 340.058us 1736.41% 340.058us 340.058us 1
+ torch_eager 6.15% 129.375us 99.74% 2.097ms 2.097ms 0.000us 0.00% 21.760us 21.760us 1
+ aten::to 0.32% 6.700us 86.45% 1.818ms 303.002us 0.000us 0.00% 14.112us 2.352us 6
+ aten::_to_copy 1.17% 24.651us 86.13% 1.811ms 301.886us 0.000us 0.00% 14.112us 2.352us 6
+ aten::copy_ 2.42% 50.883us 83.54% 1.757ms 292.785us 11.936us 60.95% 14.112us 2.352us 6
+ aten::conv1d 0.30% 6.290us 5.74% 120.803us 40.268us 0.000us 0.00% 7.648us 2.549us 3
+ aten::convolution 0.48% 10.020us 5.45% 114.513us 38.171us 0.000us 0.00% 7.648us 2.549us 3
+ aten::_convolution 1.15% 24.209us 4.97% 104.493us 34.831us 0.000us 0.00% 7.648us 2.549us 3
+ aten::_conv_depthwise2d 1.00% 21.080us 2.93% 61.691us 20.564us 7.648us 39.05% 7.648us 2.549us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.648us 39.05% 7.648us 2.549us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.208us 31.70% 6.208us 2.069us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.728us 29.25% 5.728us 1.909us 3
+ Activity Buffer Request 71.15% 1.496ms 71.15% 1.496ms 1.496ms 2.176us 11.11% 2.176us 2.176us 1
+ aten::empty_strided 1.42% 29.951us 1.42% 29.951us 4.992us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 10.98% 230.807us 10.98% 230.807us 25.645us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.94% 19.863us 1.21% 25.543us 2.838us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.46% 9.630us 0.46% 9.630us 0.642us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.50% 10.541us 0.50% 10.541us 3.514us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.42% 8.810us 0.42% 8.810us 2.937us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.35% 7.411us 0.44% 9.201us 3.067us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.103ms
+Self CUDA time total: 19.584us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 339.070us 1381.53% 339.070us 339.070us 1
+ torch_eager 6.44% 132.135us 99.72% 2.045ms 2.045ms 0.000us 0.00% 26.814us 26.814us 1
+ aten::to 0.33% 6.722us 86.08% 1.765ms 294.155us 0.000us 0.00% 15.262us 2.544us 6
+ aten::_to_copy 1.20% 24.702us 85.75% 1.758ms 293.035us 0.000us 0.00% 15.262us 2.544us 6
+ aten::copy_ 2.39% 49.030us 83.04% 1.702ms 283.750us 12.991us 52.93% 15.262us 2.544us 6
+ aten::conv1d 0.29% 5.850us 5.78% 118.603us 39.534us 0.000us 0.00% 11.552us 3.851us 3
+ aten::convolution 0.55% 11.220us 5.50% 112.753us 37.584us 0.000us 0.00% 11.552us 3.851us 3
+ aten::_convolution 1.18% 24.170us 4.95% 101.533us 33.844us 0.000us 0.00% 11.552us 3.851us 3
+ aten::_conv_depthwise2d 1.08% 22.212us 2.99% 61.273us 20.424us 11.552us 47.07% 11.552us 3.851us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 11.552us 47.07% 11.552us 3.851us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.655us 27.12% 6.655us 2.218us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.336us 25.82% 6.336us 2.112us 3
+ Activity Buffer Request 71.25% 1.461ms 71.25% 1.461ms 1.461ms 2.271us 9.25% 2.271us 2.271us 1
+ aten::empty_strided 1.51% 31.010us 1.51% 31.010us 5.168us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 10.41% 213.527us 10.41% 213.527us 23.725us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.89% 18.350us 1.15% 23.660us 2.629us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.45% 9.131us 0.45% 9.131us 0.609us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.46% 9.481us 0.46% 9.481us 3.160us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.43% 8.760us 0.43% 8.760us 2.920us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.27% 5.520us 0.33% 6.850us 2.283us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.050ms
+Self CUDA time total: 24.543us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 339.129us 1305.15% 339.129us 339.129us 1
+ torch_eager 6.29% 128.886us 99.74% 2.043ms 2.043ms 0.000us 0.00% 28.224us 28.224us 1
+ aten::to 0.34% 6.902us 86.10% 1.763ms 293.882us 0.000us 0.00% 15.168us 2.528us 6
+ aten::_to_copy 1.23% 25.190us 85.76% 1.756ms 292.731us 0.000us 0.00% 15.168us 2.528us 6
+ aten::copy_ 2.41% 49.270us 83.08% 1.701ms 283.571us 12.928us 49.75% 15.168us 2.528us 6
+ aten::conv1d 0.31% 6.370us 5.92% 121.333us 40.444us 0.000us 0.00% 13.056us 4.352us 3
+ aten::convolution 0.49% 10.120us 5.61% 114.963us 38.321us 0.000us 0.00% 13.056us 4.352us 3
+ aten::_convolution 1.25% 25.500us 5.12% 104.843us 34.948us 0.000us 0.00% 13.056us 4.352us 3
+ aten::_conv_depthwise2d 1.08% 22.212us 3.04% 62.243us 20.748us 13.056us 50.25% 13.056us 4.352us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 13.056us 50.25% 13.056us 4.352us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.592us 25.37% 6.592us 2.197us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.336us 24.38% 6.336us 2.112us 3
+ Activity Buffer Request 71.41% 1.463ms 71.41% 1.463ms 1.463ms 2.240us 8.62% 2.240us 2.240us 1
+ aten::empty_strided 1.45% 29.770us 1.45% 29.770us 4.962us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 10.25% 209.968us 10.25% 209.968us 23.330us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.92% 18.870us 1.21% 24.780us 2.753us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.47% 9.601us 0.47% 9.601us 0.640us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.51% 10.510us 0.51% 10.510us 3.503us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.45% 9.181us 0.45% 9.181us 3.060us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.32% 6.640us 0.40% 8.140us 2.713us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.048ms
+Self CUDA time total: 25.984us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 362.270us 942.63% 362.270us 362.270us 1
+ torch_eager 7.50% 163.876us 99.75% 2.180ms 2.180ms 0.000us 0.00% 40.993us 40.993us 1
+ aten::conv1d 0.34% 7.388us 5.94% 129.794us 43.265us 0.000us 0.00% 22.464us 7.488us 3
+ aten::convolution 0.56% 12.301us 5.60% 122.406us 40.802us 0.000us 0.00% 22.464us 7.488us 3
+ aten::_convolution 1.18% 25.829us 5.04% 110.105us 36.702us 0.000us 0.00% 22.464us 7.488us 3
+ aten::_conv_depthwise2d 1.07% 23.371us 2.94% 64.311us 21.437us 22.464us 58.45% 22.464us 7.488us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.464us 58.45% 22.464us 7.488us 3
+ aten::to 0.36% 7.830us 84.95% 1.856ms 309.406us 0.000us 0.00% 18.529us 3.088us 6
+ aten::_to_copy 1.44% 31.560us 84.59% 1.849ms 308.101us 0.000us 0.00% 18.529us 3.088us 6
+ aten::copy_ 2.41% 52.633us 81.64% 1.784ms 297.326us 15.968us 41.55% 18.529us 3.088us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.609us 22.40% 8.609us 2.870us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.359us 19.15% 7.359us 2.453us 3
+ Activity Buffer Request 65.39% 1.429ms 65.39% 1.429ms 1.429ms 2.561us 6.66% 2.561us 2.561us 1
+ aten::empty_strided 1.51% 33.091us 1.51% 33.091us 5.515us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 14.87% 325.052us 14.87% 325.052us 36.117us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 1.00% 21.833us 1.21% 26.523us 2.947us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.39% 8.492us 0.39% 8.492us 0.566us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.44% 9.570us 0.44% 9.570us 3.190us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.40% 8.750us 0.40% 8.750us 2.917us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.37% 7.980us 0.45% 9.772us 3.257us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.185ms
+Self CUDA time total: 38.432us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 339.836us 827.74% 339.836us 339.836us 1
+ torch_eager 6.54% 141.434us 99.74% 2.158ms 2.158ms 0.000us 0.00% 43.648us 43.648us 1
+ aten::conv1d 0.28% 6.090us 5.53% 119.574us 39.858us 0.000us 0.00% 25.407us 8.469us 3
+ aten::convolution 0.46% 9.939us 5.25% 113.484us 37.828us 0.000us 0.00% 25.407us 8.469us 3
+ aten::_convolution 1.12% 24.214us 4.79% 103.545us 34.515us 0.000us 0.00% 25.407us 8.469us 3
+ aten::_conv_depthwise2d 1.05% 22.612us 2.94% 63.593us 21.198us 25.407us 61.88% 25.407us 8.469us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 25.407us 61.88% 25.407us 8.469us 3
+ aten::to 0.29% 6.201us 86.38% 1.869ms 311.424us 0.000us 0.00% 18.241us 3.040us 6
+ aten::_to_copy 1.18% 25.424us 86.09% 1.862ms 310.391us 0.000us 0.00% 18.241us 3.040us 6
+ aten::copy_ 2.40% 51.862us 83.52% 1.807ms 301.107us 15.649us 38.12% 18.241us 3.040us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.320us 20.27% 8.320us 2.773us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.329us 17.85% 7.329us 2.443us 3
+ Activity Buffer Request 68.07% 1.472ms 68.07% 1.472ms 1.472ms 2.592us 6.31% 2.592us 2.592us 1
+ aten::empty_strided 1.40% 30.280us 1.40% 30.280us 5.047us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 14.06% 304.169us 14.06% 304.169us 33.797us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.84% 18.230us 1.08% 23.418us 2.602us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.40% 8.619us 0.40% 8.619us 0.575us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.48% 10.370us 0.48% 10.370us 3.457us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.41% 8.770us 0.41% 8.770us 2.923us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.26% 5.659us 0.32% 6.990us 2.330us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.163ms
+Self CUDA time total: 41.056us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 338.560us 329.80% 338.560us 338.560us 1
+ torch_eager 6.25% 131.427us 99.74% 2.098ms 2.098ms 0.000us 0.00% 108.608us 108.608us 1
+ aten::conv1d 0.29% 6.110us 5.71% 120.083us 40.028us 0.000us 0.00% 70.496us 23.499us 3
+ aten::convolution 0.47% 9.940us 5.42% 113.973us 37.991us 0.000us 0.00% 70.496us 23.499us 3
+ aten::_convolution 1.11% 23.441us 4.94% 104.033us 34.678us 0.000us 0.00% 70.496us 23.499us 3
+ aten::_conv_depthwise2d 1.04% 21.830us 2.93% 61.652us 20.551us 70.496us 68.67% 70.496us 23.499us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 70.496us 68.67% 70.496us 23.499us 3
+ aten::to 0.30% 6.292us 86.43% 1.818ms 303.059us 0.000us 0.00% 38.112us 6.352us 6
+ aten::_to_copy 1.17% 24.539us 86.13% 1.812ms 302.010us 0.000us 0.00% 38.112us 6.352us 6
+ aten::copy_ 2.47% 51.869us 83.58% 1.758ms 293.072us 32.160us 31.33% 38.112us 6.352us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.568us 17.11% 17.568us 5.856us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.592us 14.21% 14.592us 4.864us 3
+ Activity Buffer Request 67.63% 1.423ms 67.63% 1.423ms 1.423ms 5.952us 5.80% 5.952us 5.952us 1
+ aten::empty_strided 1.38% 29.091us 1.38% 29.091us 4.849us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 14.47% 304.542us 14.47% 304.542us 33.838us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.91% 19.049us 1.17% 24.579us 2.731us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.43% 9.070us 0.43% 9.070us 0.605us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.49% 10.351us 0.49% 10.351us 3.450us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.41% 8.621us 0.41% 8.621us 2.874us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.38% 8.050us 0.45% 9.470us 3.157us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.104ms
+Self CUDA time total: 102.656us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 340.578us 301.93% 340.578us 340.578us 1
+ torch_eager 6.29% 133.214us 99.74% 2.113ms 2.113ms 0.000us 0.00% 118.752us 118.752us 1
+ aten::conv1d 0.31% 6.499us 5.66% 119.974us 39.991us 0.000us 0.00% 80.576us 26.859us 3
+ aten::convolution 0.47% 9.880us 5.36% 113.475us 37.825us 0.000us 0.00% 80.576us 26.859us 3
+ aten::_convolution 1.21% 25.730us 4.89% 103.595us 34.532us 0.000us 0.00% 80.576us 26.859us 3
+ aten::_conv_depthwise2d 1.01% 21.361us 2.87% 60.832us 20.277us 80.576us 71.43% 80.576us 26.859us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 80.576us 71.43% 80.576us 26.859us 3
+ aten::to 0.33% 7.060us 86.42% 1.831ms 305.149us 0.000us 0.00% 38.176us 6.363us 6
+ aten::_to_copy 1.15% 24.352us 86.09% 1.824ms 303.972us 0.000us 0.00% 38.176us 6.363us 6
+ aten::copy_ 2.34% 49.642us 83.57% 1.770ms 295.075us 32.224us 28.57% 38.176us 6.363us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.664us 15.66% 17.664us 5.888us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.560us 12.91% 14.560us 4.853us 3
+ Activity Buffer Request 68.62% 1.454ms 68.62% 1.454ms 1.454ms 5.952us 5.28% 5.952us 5.952us 1
+ aten::empty_strided 1.37% 29.031us 1.37% 29.031us 4.838us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 13.59% 287.970us 13.59% 287.970us 31.997us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.89% 18.772us 1.17% 24.871us 2.763us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.45% 9.520us 0.45% 9.520us 0.635us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.46% 9.850us 0.46% 9.850us 3.283us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.41% 8.670us 0.41% 8.670us 2.890us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.32% 6.821us 0.38% 8.112us 2.704us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.119ms
+Self CUDA time total: 112.800us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 6.32% 133.665us 99.60% 2.106ms 2.106ms 0.000us 0.00% 433.181us 433.181us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 423.869us 107.93% 423.869us 423.869us 1
+ aten::conv1d 0.30% 6.441us 5.98% 126.475us 42.158us 0.000us 0.00% 252.190us 84.063us 3
+ aten::convolution 0.49% 10.391us 5.68% 120.034us 40.011us 0.000us 0.00% 252.190us 84.063us 3
+ aten::_convolution 1.19% 25.110us 5.19% 109.643us 36.548us 0.000us 0.00% 252.190us 84.063us 3
+ aten::_conv_depthwise2d 1.07% 22.550us 3.14% 66.363us 22.121us 252.190us 64.21% 252.190us 84.063us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 252.190us 64.21% 252.190us 84.063us 3
+ aten::to 0.33% 6.989us 85.86% 1.815ms 302.520us 0.000us 0.00% 180.991us 30.165us 6
+ aten::_to_copy 1.18% 24.921us 85.53% 1.808ms 301.355us 0.000us 0.00% 180.991us 30.165us 6
+ aten::copy_ 2.39% 50.532us 82.93% 1.753ms 292.204us 140.543us 35.79% 180.991us 30.165us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 100.768us 25.66% 100.768us 33.589us 3
+ Activity Buffer Request 67.47% 1.426ms 67.47% 1.426ms 1.426ms 40.448us 10.30% 40.448us 40.448us 1
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 39.775us 10.13% 39.775us 13.258us 3
+ aten::empty_strided 1.42% 29.990us 1.42% 29.990us 4.998us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 14.15% 299.142us 14.15% 299.142us 33.238us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.92% 19.400us 1.21% 25.500us 2.833us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.49% 10.430us 0.49% 10.430us 0.695us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.55% 11.580us 0.55% 11.580us 3.860us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.44% 9.361us 0.44% 9.361us 3.120us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.34% 7.110us 0.42% 8.900us 2.967us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.114ms
+Self CUDA time total: 392.733us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 6.65% 143.166us 97.03% 2.090ms 2.090ms 0.000us 0.00% 486.301us 486.301us 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 477.853us 106.88% 477.853us 477.853us 1
+ aten::conv1d 0.33% 7.110us 5.88% 126.575us 42.192us 0.000us 0.00% 298.557us 99.519us 3
+ aten::convolution 0.51% 11.062us 5.55% 119.465us 39.822us 0.000us 0.00% 298.557us 99.519us 3
+ aten::_convolution 1.16% 25.071us 5.03% 108.403us 36.134us 0.000us 0.00% 298.557us 99.519us 3
+ aten::_conv_depthwise2d 1.05% 22.671us 3.05% 65.592us 21.864us 298.557us 66.78% 298.557us 99.519us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 298.557us 66.78% 298.557us 99.519us 3
+ aten::to 0.33% 7.030us 83.12% 1.790ms 298.407us 0.000us 0.00% 187.744us 31.291us 6
+ aten::_to_copy 1.22% 26.183us 82.80% 1.783ms 297.235us 0.000us 0.00% 187.744us 31.291us 6
+ aten::copy_ 2.41% 51.979us 80.11% 1.726ms 287.603us 148.544us 33.22% 187.744us 31.291us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 108.768us 24.33% 108.768us 36.256us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 39.776us 8.90% 39.776us 13.259us 3
+ Activity Buffer Request 66.10% 1.424ms 66.10% 1.424ms 1.424ms 39.200us 8.77% 39.200us 39.200us 1
+ aten::empty_strided 1.47% 31.611us 1.47% 31.611us 5.268us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 12.61% 271.569us 12.61% 271.569us 30.174us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.93% 19.971us 1.21% 26.011us 2.890us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.45% 9.711us 0.45% 9.711us 0.647us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.47% 10.061us 0.47% 10.061us 3.354us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.51% 11.040us 0.51% 11.040us 3.680us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.28% 5.950us 0.34% 7.400us 2.467us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.154ms
+Self CUDA time total: 447.101us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 355.165us 1897.25% 355.165us 355.165us 1
+ torch_eager 15.24% 136.376us 99.32% 888.600us 888.600us 0.000us 0.00% 20.608us 20.608us 1
+ aten::to 0.80% 7.121us 66.93% 598.831us 99.805us 0.000us 0.00% 13.376us 2.229us 6
+ aten::_to_copy 2.95% 26.380us 66.13% 591.710us 98.618us 0.000us 0.00% 13.376us 2.229us 6
+ aten::copy_ 5.90% 52.793us 59.34% 530.948us 88.491us 11.488us 61.37% 13.376us 2.229us 6
+ aten::conv1d 0.68% 6.050us 13.88% 124.163us 41.388us 0.000us 0.00% 7.232us 2.411us 3
+ aten::convolution 1.23% 10.987us 13.20% 118.113us 39.371us 0.000us 0.00% 7.232us 2.411us 3
+ aten::_convolution 2.78% 24.854us 11.97% 107.126us 35.709us 0.000us 0.00% 7.232us 2.411us 3
+ aten::_conv_depthwise2d 2.73% 24.470us 7.32% 65.481us 21.827us 7.232us 38.63% 7.232us 2.411us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.232us 38.63% 7.232us 2.411us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 31.62% 5.920us 1.973us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.568us 29.74% 5.568us 1.856us 3
+ Activity Buffer Request 26.68% 238.708us 26.68% 238.708us 238.708us 1.888us 10.09% 1.888us 1.888us 1
+ aten::empty_strided 3.84% 34.382us 3.84% 34.382us 5.730us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 29.10% 260.398us 29.10% 260.398us 28.933us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.02% 18.071us 2.57% 22.961us 2.551us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.97% 8.709us 0.97% 8.709us 0.581us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.22% 10.910us 1.22% 10.910us 3.637us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.02% 9.150us 1.02% 9.150us 3.050us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.75% 6.751us 0.92% 8.220us 2.740us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 894.710us
+Self CUDA time total: 18.720us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 323.578us 1674.05% 323.578us 323.578us 1
+ torch_eager 14.45% 120.436us 99.39% 828.559us 828.559us 0.000us 0.00% 21.217us 21.217us 1
+ aten::to 0.75% 6.271us 67.77% 564.939us 94.156us 0.000us 0.00% 13.377us 2.230us 6
+ aten::_to_copy 2.76% 22.992us 67.02% 558.668us 93.111us 0.000us 0.00% 13.377us 2.230us 6
+ aten::copy_ 5.96% 49.722us 60.74% 506.327us 84.388us 11.489us 59.44% 13.377us 2.230us 6
+ aten::conv1d 0.75% 6.211us 13.83% 115.254us 38.418us 0.000us 0.00% 7.840us 2.613us 3
+ aten::convolution 1.19% 9.930us 13.08% 109.043us 36.348us 0.000us 0.00% 7.840us 2.613us 3
+ aten::_convolution 2.77% 23.131us 11.89% 99.113us 33.038us 0.000us 0.00% 7.840us 2.613us 3
+ aten::_conv_depthwise2d 2.53% 21.092us 7.21% 60.132us 20.044us 7.840us 40.56% 7.840us 2.613us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.840us 40.56% 7.840us 2.613us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.857us 30.30% 5.857us 1.952us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.632us 29.14% 5.632us 1.877us 3
+ Activity Buffer Request 27.26% 227.207us 27.26% 227.207us 227.207us 1.888us 9.77% 1.888us 1.888us 1
+ aten::empty_strided 3.52% 29.349us 3.52% 29.349us 4.891us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 29.92% 249.418us 29.92% 249.418us 27.713us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.13% 17.749us 2.80% 23.370us 2.597us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.11% 9.261us 1.11% 9.261us 0.617us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.16% 9.660us 1.16% 9.660us 3.220us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.12% 9.360us 1.12% 9.360us 3.120us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.70% 5.810us 0.88% 7.370us 2.457us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 833.619us
+Self CUDA time total: 19.329us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 326.394us 1677.60% 326.394us 326.394us 1
+ torch_eager 14.78% 122.914us 99.34% 825.919us 825.919us 0.000us 0.00% 21.632us 21.632us 1
+ aten::to 0.79% 6.552us 67.16% 558.381us 93.064us 0.000us 0.00% 14.368us 2.395us 6
+ aten::_to_copy 2.94% 24.430us 66.37% 551.829us 91.971us 0.000us 0.00% 14.368us 2.395us 6
+ aten::copy_ 5.83% 48.462us 59.95% 498.427us 83.071us 12.192us 62.66% 14.368us 2.395us 6
+ aten::conv1d 0.71% 5.939us 14.00% 116.404us 38.801us 0.000us 0.00% 7.264us 2.421us 3
+ aten::convolution 1.18% 9.811us 13.29% 110.465us 36.822us 0.000us 0.00% 7.264us 2.421us 3
+ aten::_convolution 2.85% 23.732us 12.11% 100.654us 33.551us 0.000us 0.00% 7.264us 2.421us 3
+ aten::_conv_depthwise2d 2.52% 20.910us 7.24% 60.232us 20.077us 7.264us 37.34% 7.264us 2.421us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.264us 37.34% 7.264us 2.421us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.304us 32.40% 6.304us 2.101us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.888us 30.26% 5.888us 1.963us 3
+ Activity Buffer Request 26.68% 221.788us 26.68% 221.788us 221.788us 2.176us 11.18% 2.176us 2.176us 1
+ aten::empty_strided 3.48% 28.972us 3.48% 28.972us 4.829us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 30.05% 249.819us 30.05% 249.819us 27.758us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.04% 16.929us 2.67% 22.200us 2.467us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.07% 8.901us 1.07% 8.901us 0.593us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.15% 9.570us 1.15% 9.570us 3.190us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.98% 8.110us 0.98% 8.110us 2.703us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.86% 7.190us 1.02% 8.500us 2.833us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 831.399us
+Self CUDA time total: 19.456us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 356.696us 1774.96% 356.696us 356.696us 1
+ torch_eager 13.86% 123.804us 99.36% 887.440us 887.440us 0.000us 0.00% 22.272us 22.272us 1
+ aten::to 0.71% 6.320us 66.62% 595.061us 99.177us 0.000us 0.00% 14.368us 2.395us 6
+ aten::_to_copy 2.82% 25.151us 65.92% 588.741us 98.124us 0.000us 0.00% 14.368us 2.395us 6
+ aten::copy_ 5.73% 51.172us 59.67% 532.958us 88.826us 12.192us 60.67% 14.368us 2.395us 6
+ aten::conv1d 0.70% 6.210us 15.70% 140.195us 46.732us 0.000us 0.00% 7.904us 2.635us 3
+ aten::convolution 1.11% 9.881us 15.00% 133.985us 44.662us 0.000us 0.00% 7.904us 2.635us 3
+ aten::_convolution 2.74% 24.510us 13.89% 124.104us 41.368us 0.000us 0.00% 7.904us 2.635us 3
+ aten::_conv_depthwise2d 2.70% 24.090us 9.26% 82.742us 27.581us 7.904us 39.33% 7.904us 2.635us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 39.33% 7.904us 2.635us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.240us 31.05% 6.240us 2.080us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.952us 29.62% 5.952us 1.984us 3
+ Activity Buffer Request 28.94% 258.459us 28.94% 258.459us 258.459us 2.176us 10.83% 2.176us 2.176us 1
+ aten::empty_strided 3.43% 30.632us 3.43% 30.632us 5.105us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 29.46% 263.129us 29.46% 263.129us 29.237us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 1.97% 17.620us 2.61% 23.310us 2.590us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.07% 9.580us 1.07% 9.580us 0.639us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.09% 9.720us 1.09% 9.720us 3.240us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.02% 9.130us 1.02% 9.130us 3.043us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.75% 6.702us 0.94% 8.422us 2.807us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 893.171us
+Self CUDA time total: 20.096us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 332.730us 926.72% 332.730us 332.730us 1
+ torch_eager 14.27% 126.064us 99.42% 878.341us 878.341us 0.000us 0.00% 38.496us 38.496us 1
+ aten::conv1d 0.64% 5.671us 13.39% 118.255us 39.418us 0.000us 0.00% 20.096us 6.699us 3
+ aten::convolution 1.11% 9.840us 12.74% 112.584us 37.528us 0.000us 0.00% 20.096us 6.699us 3
+ aten::_convolution 2.79% 24.681us 11.63% 102.744us 34.248us 0.000us 0.00% 20.096us 6.699us 3
+ aten::_conv_depthwise2d 2.42% 21.390us 7.02% 62.061us 20.687us 20.096us 55.97% 20.096us 6.699us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 20.096us 55.97% 20.096us 6.699us 3
+ aten::to 0.72% 6.320us 68.61% 606.182us 101.030us 0.000us 0.00% 18.400us 3.067us 6
+ aten::_to_copy 2.82% 24.900us 67.90% 599.862us 99.977us 0.000us 0.00% 18.400us 3.067us 6
+ aten::copy_ 5.62% 49.645us 61.77% 545.702us 90.950us 15.808us 44.03% 18.400us 3.067us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.448us 23.53% 8.448us 2.816us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 20.50% 7.360us 2.453us 3
+ Activity Buffer Request 29.42% 259.919us 29.42% 259.919us 259.919us 2.592us 7.22% 2.592us 2.592us 1
+ aten::empty_strided 3.31% 29.260us 3.31% 29.260us 4.877us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 29.15% 257.559us 29.15% 257.559us 28.618us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.02% 17.842us 2.68% 23.662us 2.629us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.05% 9.271us 1.05% 9.271us 0.618us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.19% 10.540us 1.19% 10.540us 3.513us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.99% 8.710us 0.99% 8.710us 2.903us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.65% 5.719us 0.80% 7.050us 2.350us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 883.481us
+Self CUDA time total: 35.904us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 337.888us 888.80% 337.888us 337.888us 1
+ torch_eager 6.31% 128.615us 99.74% 2.033ms 2.033ms 0.000us 0.00% 40.576us 40.576us 1
+ aten::conv1d 0.31% 6.349us 5.98% 121.885us 40.628us 0.000us 0.00% 22.304us 7.435us 3
+ aten::convolution 0.53% 10.852us 5.67% 115.536us 38.512us 0.000us 0.00% 22.304us 7.435us 3
+ aten::_convolution 1.24% 25.291us 5.14% 104.684us 34.895us 0.000us 0.00% 22.304us 7.435us 3
+ aten::_conv_depthwise2d 1.08% 22.031us 3.01% 61.431us 20.477us 22.304us 58.67% 22.304us 7.435us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.304us 58.67% 22.304us 7.435us 3
+ aten::to 0.34% 6.829us 86.09% 1.755ms 292.477us 0.000us 0.00% 18.272us 3.045us 6
+ aten::_to_copy 1.20% 24.424us 85.75% 1.748ms 291.339us 0.000us 0.00% 18.272us 3.045us 6
+ aten::copy_ 2.48% 50.501us 83.10% 1.694ms 282.331us 15.712us 41.33% 18.272us 3.045us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.320us 21.89% 8.320us 2.773us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.392us 19.44% 7.392us 2.464us 3
+ Activity Buffer Request 69.75% 1.422ms 69.75% 1.422ms 1.422ms 2.560us 6.73% 2.560us 2.560us 1
+ aten::empty_strided 1.45% 29.621us 1.45% 29.621us 4.937us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 11.90% 242.506us 11.90% 242.506us 26.945us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.92% 18.701us 1.17% 23.851us 2.650us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.43% 8.710us 0.43% 8.710us 0.581us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.48% 9.800us 0.48% 9.800us 3.267us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.43% 8.710us 0.43% 8.710us 2.903us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.35% 7.191us 0.42% 8.621us 2.874us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 2.038ms
+Self CUDA time total: 38.016us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 362.972us 567.16% 362.972us 362.972us 1
+ torch_eager 14.84% 128.544us 99.34% 860.680us 860.680us 0.000us 0.00% 68.061us 68.061us 1
+ aten::conv1d 0.70% 6.079us 16.52% 143.165us 47.722us 0.000us 0.00% 41.728us 13.909us 3
+ aten::convolution 3.42% 29.613us 15.82% 137.086us 45.695us 0.000us 0.00% 41.728us 13.909us 3
+ aten::_convolution 2.86% 24.759us 12.40% 107.473us 35.824us 0.000us 0.00% 41.728us 13.909us 3
+ aten::_conv_depthwise2d 2.59% 22.439us 7.67% 66.492us 22.164us 41.728us 65.20% 41.728us 13.909us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 41.728us 65.20% 41.728us 13.909us 3
+ aten::to 0.77% 6.631us 64.71% 560.621us 93.437us 0.000us 0.00% 26.333us 4.389us 6
+ aten::_to_copy 2.80% 24.253us 63.94% 553.990us 92.332us 0.000us 0.00% 26.333us 4.389us 6
+ aten::copy_ 5.80% 50.240us 57.50% 498.196us 83.033us 22.270us 34.80% 26.333us 4.389us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.903us 18.60% 11.903us 3.968us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.367us 16.20% 10.367us 3.456us 3
+ Activity Buffer Request 26.05% 225.728us 26.05% 225.728us 225.728us 4.063us 6.35% 4.063us 4.063us 1
+ aten::empty_strided 3.64% 31.541us 3.64% 31.541us 5.257us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 28.31% 245.279us 28.31% 245.279us 27.253us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.11% 18.263us 2.74% 23.752us 2.639us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.06% 9.199us 1.06% 9.199us 0.613us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.26% 10.941us 1.26% 10.941us 3.647us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.16% 10.061us 1.16% 10.061us 3.354us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.66% 5.740us 0.85% 7.330us 2.443us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 866.380us
+Self CUDA time total: 63.998us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 357.311us 512.91% 357.311us 357.311us 1
+ torch_eager 20.96% 191.619us 99.38% 908.662us 908.662us 0.000us 0.00% 73.696us 73.696us 1
+ aten::conv1d 0.63% 5.760us 15.23% 139.294us 46.431us 0.000us 0.00% 47.296us 15.765us 3
+ aten::convolution 2.87% 26.271us 14.60% 133.534us 44.511us 0.000us 0.00% 47.296us 15.765us 3
+ aten::_convolution 2.77% 25.360us 11.73% 107.263us 35.754us 0.000us 0.00% 47.296us 15.765us 3
+ aten::_conv_depthwise2d 2.38% 21.722us 7.17% 65.523us 21.841us 47.296us 67.89% 47.296us 15.765us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 47.296us 67.89% 47.296us 15.765us 3
+ aten::to 0.73% 6.650us 60.08% 549.318us 91.553us 0.000us 0.00% 26.400us 4.400us 6
+ aten::_to_copy 2.63% 24.032us 59.35% 542.668us 90.445us 0.000us 0.00% 26.400us 4.400us 6
+ aten::copy_ 5.57% 50.922us 53.46% 488.786us 81.464us 22.368us 32.11% 26.400us 4.400us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.872us 17.04% 11.872us 3.957us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.496us 15.07% 10.496us 3.499us 3
+ Activity Buffer Request 23.91% 218.617us 23.91% 218.617us 218.617us 4.032us 5.79% 4.032us 4.032us 1
+ aten::empty_strided 3.26% 29.850us 3.26% 29.850us 4.975us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 26.57% 242.937us 26.57% 242.937us 26.993us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.04% 18.652us 2.65% 24.251us 2.695us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.01% 9.230us 1.01% 9.230us 0.615us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.08% 9.870us 1.08% 9.870us 3.290us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.12% 10.241us 1.12% 10.241us 3.414us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.63% 5.780us 0.80% 7.270us 2.423us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 914.323us
+Self CUDA time total: 69.664us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 348.092us 187.26% 348.092us 348.092us 1
+ torch_eager 14.76% 124.374us 99.29% 836.558us 836.558us 0.000us 0.00% 195.870us 195.870us 1
+ aten::conv1d 0.70% 5.900us 14.42% 121.504us 40.501us 0.000us 0.00% 133.406us 44.469us 3
+ aten::convolution 1.14% 9.610us 13.72% 115.604us 38.535us 0.000us 0.00% 133.406us 44.469us 3
+ aten::_convolution 2.88% 24.263us 12.58% 105.994us 35.331us 0.000us 0.00% 133.406us 44.469us 3
+ aten::_conv_depthwise2d 2.73% 23.010us 7.80% 65.750us 21.917us 133.406us 71.77% 133.406us 44.469us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 133.406us 71.77% 133.406us 44.469us 3
+ aten::to 0.74% 6.220us 66.83% 563.060us 93.843us 0.000us 0.00% 62.464us 10.411us 6
+ aten::_to_copy 2.83% 23.861us 66.09% 556.840us 92.807us 0.000us 0.00% 62.464us 10.411us 6
+ aten::copy_ 6.03% 50.810us 59.73% 503.287us 83.881us 52.480us 28.23% 62.464us 10.411us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.600us 15.92% 29.600us 9.867us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.880us 12.31% 22.880us 7.627us 3
+ Activity Buffer Request 25.69% 216.468us 25.69% 216.468us 216.468us 9.984us 5.37% 9.984us 9.984us 1
+ aten::empty_strided 3.52% 29.692us 3.52% 29.692us 4.949us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 30.59% 257.739us 30.59% 257.739us 28.638us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.08% 17.540us 2.73% 23.000us 2.556us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.12% 9.412us 1.12% 9.412us 0.627us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.20% 10.110us 1.20% 10.110us 3.370us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.29% 10.900us 1.29% 10.900us 3.633us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.68% 5.719us 0.88% 7.451us 2.484us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 842.539us
+Self CUDA time total: 185.886us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 348.403us 166.18% 348.403us 348.403us 1
+ torch_eager 14.60% 122.924us 99.33% 836.209us 836.209us 0.000us 0.00% 223.383us 223.383us 1
+ aten::conv1d 0.69% 5.779us 14.01% 117.955us 39.318us 0.000us 0.00% 153.883us 51.294us 3
+ aten::convolution 1.25% 10.491us 13.32% 112.176us 37.392us 0.000us 0.00% 153.883us 51.294us 3
+ aten::_convolution 2.91% 24.484us 12.08% 101.685us 33.895us 0.000us 0.00% 153.883us 51.294us 3
+ aten::_conv_depthwise2d 2.49% 20.928us 7.14% 60.070us 20.023us 153.883us 73.40% 153.883us 51.294us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 153.883us 73.40% 153.883us 51.294us 3
+ aten::to 0.73% 6.179us 67.37% 567.200us 94.533us 0.000us 0.00% 69.500us 11.583us 6
+ aten::_to_copy 2.75% 23.132us 66.64% 561.021us 93.504us 0.000us 0.00% 69.500us 11.583us 6
+ aten::copy_ 5.91% 49.740us 60.39% 508.377us 84.729us 55.773us 26.60% 69.500us 11.583us 6
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 32.927us 15.71% 32.927us 10.976us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.846us 10.90% 22.846us 7.615us 3
+ Activity Buffer Request 29.09% 244.869us 29.09% 244.869us 244.869us 13.727us 6.55% 13.727us 13.727us 1
+ aten::empty_strided 3.51% 29.512us 3.51% 29.512us 4.919us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 27.84% 234.420us 27.84% 234.420us 26.047us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 2.13% 17.973us 2.77% 23.320us 2.591us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 1.09% 9.167us 1.09% 9.167us 0.611us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 1.12% 9.440us 1.12% 9.440us 3.147us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 1.07% 9.050us 1.07% 9.050us 3.017us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.85% 7.121us 1.02% 8.601us 2.867us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 841.880us
+Self CUDA time total: 209.656us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 7.22% 135.785us 57.39% 1.079ms 1.079ms 0.000us 0.00% 1.518ms 1.518ms 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.419ms 100.41% 1.419ms 1.419ms 1
+ aten::to 0.37% 6.901us 40.86% 768.526us 128.088us 0.000us 0.00% 823.221us 137.204us 6
+ aten::_to_copy 1.63% 30.742us 40.49% 761.625us 126.938us 0.000us 0.00% 823.221us 137.204us 6
+ aten::copy_ 2.94% 55.302us 27.81% 523.157us 87.193us 717.942us 50.81% 823.221us 137.204us 6
+ aten::conv1d 0.33% 6.280us 6.71% 126.144us 42.048us 0.000us 0.00% 695.094us 231.698us 3
+ aten::convolution 0.57% 10.750us 6.37% 119.864us 39.955us 0.000us 0.00% 695.094us 231.698us 3
+ aten::_convolution 1.35% 25.400us 5.80% 109.114us 36.371us 0.000us 0.00% 695.094us 231.698us 3
+ aten::_conv_depthwise2d 1.19% 22.332us 3.55% 66.763us 22.254us 695.094us 49.19% 695.094us 231.698us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 695.094us 49.19% 695.094us 231.698us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 411.706us 29.14% 411.706us 137.235us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 306.236us 21.67% 306.236us 102.079us 3
+ Activity Buffer Request 12.99% 244.238us 12.99% 244.238us 244.238us 105.279us 7.45% 105.279us 105.279us 1
+ aten::empty_strided 2.17% 40.811us 11.04% 207.726us 34.621us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 13.13% 246.997us 13.13% 246.997us 27.444us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 1.97% 37.133us 2.36% 44.413us 4.935us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.58% 10.889us 0.58% 10.889us 0.726us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.53% 10.051us 0.53% 10.051us 3.350us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.58% 11.000us 0.58% 11.000us 3.667us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.34% 6.350us 0.41% 7.700us 2.567us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 1.881ms
+Self CUDA time total: 1.413ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4
+======================================================================
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+ torch_eager 4.25% 132.984us 66.63% 2.083ms 2.083ms 0.000us 0.00% 1.503ms 1.503ms 1
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.434ms 100.41% 1.434ms 1.434ms 1
+ aten::to 0.21% 6.470us 57.53% 1.798ms 299.656us 0.000us 0.00% 765.147us 127.524us 6
+ aten::_to_copy 0.80% 25.009us 57.32% 1.791ms 298.577us 0.000us 0.00% 765.147us 127.524us 6
+ aten::copy_ 1.51% 47.155us 55.55% 1.736ms 289.360us 690.492us 48.35% 765.147us 127.524us 6
+ aten::conv1d 0.20% 6.231us 3.91% 122.325us 40.775us 0.000us 0.00% 737.724us 245.908us 3
+ aten::convolution 0.32% 9.920us 3.71% 116.094us 38.698us 0.000us 0.00% 737.724us 245.908us 3
+ aten::_convolution 0.82% 25.623us 3.40% 106.174us 35.391us 0.000us 0.00% 737.724us 245.908us 3
+ aten::_conv_depthwise2d 0.70% 21.899us 1.98% 62.011us 20.670us 737.724us 51.65% 737.724us 245.908us 3
+void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 737.724us 51.65% 737.724us 245.908us 3
+void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 398.046us 27.87% 398.046us 132.682us 3
+void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 292.446us 20.48% 292.446us 97.482us 3
+ Activity Buffer Request 47.19% 1.475ms 47.19% 1.475ms 1.475ms 74.655us 5.23% 74.655us 74.655us 1
+ aten::empty_strided 0.97% 30.293us 0.97% 30.293us 5.049us 0.000us 0.00% 0.000us 0.000us 6
+ cudaLaunchKernel 7.52% 235.026us 7.52% 235.026us 26.114us 0.000us 0.00% 0.000us 0.000us 9
+ aten::unsqueeze 0.60% 18.740us 0.79% 24.820us 2.758us 0.000us 0.00% 0.000us 0.000us 9
+ aten::as_strided 0.32% 10.019us 0.32% 10.019us 0.668us 0.000us 0.00% 0.000us 0.000us 15
+ aten::empty 0.32% 9.882us 0.32% 9.882us 3.294us 0.000us 0.00% 0.000us 0.000us 3
+ aten::resize_ 0.29% 9.220us 0.29% 9.220us 3.073us 0.000us 0.00% 0.000us 0.000us 3
+ aten::squeeze 0.24% 7.471us 0.29% 9.160us 3.053us 0.000us 0.00% 0.000us 0.000us 3
+------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
+Self CPU time total: 3.125ms
+Self CUDA time total: 1.428ms
+
+
+impl wl p50(ms) ok
+torch_eager cuda_B2_D2048_S128_W2 0.08 True
+torch_eager cuda_B2_D2048_S128_W4 0.09 True
+torch_eager cuda_B2_D2048_S2048_W2 0.15 True
+torch_eager cuda_B2_D2048_S2048_W4 0.16 True
+torch_eager cuda_B2_D2048_S512_W2 0.08 True
+torch_eager cuda_B2_D2048_S512_W4 0.08 True
+torch_eager cuda_B2_D64_S128_W2 0.07 True
+torch_eager cuda_B2_D64_S128_W4 0.09 True
+torch_eager cuda_B2_D64_S2048_W2 0.09 True
+torch_eager cuda_B2_D64_S2048_W4 0.08 True
+torch_eager cuda_B2_D64_S512_W2 0.09 True
+torch_eager cuda_B2_D64_S512_W4 0.09 True
+torch_eager cuda_B4_D2048_S128_W2 0.09 True
+torch_eager cuda_B4_D2048_S128_W4 0.08 True
+torch_eager cuda_B4_D2048_S2048_W2 0.49 True
+torch_eager cuda_B4_D2048_S2048_W4 0.50 True
+torch_eager cuda_B4_D2048_S512_W2 0.09 True
+torch_eager cuda_B4_D2048_S512_W4 0.10 True
+torch_eager cuda_B4_D64_S128_W2 0.08 True
+torch_eager cuda_B4_D64_S128_W4 0.08 True
+torch_eager cuda_B4_D64_S2048_W2 0.08 True
+torch_eager cuda_B4_D64_S2048_W4 0.09 True
+torch_eager cuda_B4_D64_S512_W2 0.08 True
+torch_eager cuda_B4_D64_S512_W4 0.08 True
+