+

PyTorch Native - Causal Conv1D

+

GPU Info

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: nv | 0.23s + | + +Raw +GitHub +
+
+
+
import subprocess
# Run `nvidia-smi` and print the text it wrote to stdout.
+print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
+ +
+
+
+
+
Wed Oct 29 00:36:08 2025       
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
+|-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
+| N/A   30C    P0             87W /  350W |       0MiB /  46068MiB |     18%      Default |
+|                                         |                        |                  N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes:                                                                              |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
+|        ID   ID                                                               Usage      |
+|=========================================================================================|
+|  No running processes found                                                             |
++-----------------------------------------------------------------------------------------+
+
+
+
+
+ +

Causal Conv1D Benchmark (PyTorch Native)

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: benchmark | 7.30s + | + +Raw +GitHub +
+
+
+
# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "numpy",
+#     "torch==2.8.0",
+#     "kernels-benchmark-tools",
+# ]
+#
+# [tool.uv.sources]
+# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
+# ///
+import torch
+import torch.nn.functional as F
+import sys
+from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
+
+
def torch_causal_conv1d(input_tensor, weight, bias):
    """Depthwise causal 1-D convolution built on ``F.conv1d``.

    The convolution is evaluated in ``weight.dtype`` and the result is cast
    back to ``input_tensor.dtype``.

    Args:
        input_tensor: Tensor in ``F.conv1d`` layout; the last axis is the
            sequence and the channel axis must have ``weight.shape[0]``
            entries (one group per channel).
        weight: 2-D tensor of shape ``(dim, width)``, one filter per channel.
        bias: Per-channel bias of shape ``(dim,)`` or ``None``. It may be in
            a different dtype than ``weight``; it is cast before use.

    Returns:
        Tensor with the same shape and dtype as ``input_tensor``. Output
        position ``t`` depends only on input positions ``<= t``.
    """
    compute_dtype = weight.dtype
    dim, width = weight.shape[0], weight.shape[1]
    seqlen = input_tensor.shape[-1]

    x = input_tensor.to(compute_dtype)
    # F.conv1d requires input, weight and bias to share a dtype. The input is
    # already aligned to the weight, so align the bias the same way instead of
    # failing on mixed-precision callers.
    if bias is not None:
        bias = bias.to(compute_dtype)

    # weight (dim, width) -> (dim, 1, width) with groups=dim makes this a
    # depthwise convolution. padding=width - 1 pads both ends of the sequence.
    out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)

    # Keep only the first seqlen positions: those see left padding only, which
    # is what makes the result causal. The trailing width - 1 outputs are dropped.
    out = out[..., :seqlen]

    return out.to(input_tensor.dtype)
+
+
# Pass torch_causal_conv1d to the benchmark harness as the "torch_eager"
# implementation of the CAUSAL_CONV1D kernel type, tagged as PyTorch / eager.
# NOTE(review): workload selection, timing and profiling presumably happen
# inside kernels_benchmark_tools.run_benchmark — confirm against that package.
+run_benchmark(
+    kernel_type=KernelTypeEnum.CAUSAL_CONV1D,
+    impl_name="torch_eager",
+    impl_tags={"family": "pytorch", "backend": "eager"},
+    impl_func=torch_causal_conv1d,
+)
+
+ +
+
+
+
+
Running causal_conv1d benchmark on cuda with 24 workloads.
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     460.509us      2386.43%     460.509us     460.509us             1  
+                                            torch_eager        10.46%     229.787us        99.65%       2.189ms       2.189ms       0.000us         0.00%      21.633us      21.633us             1  
+                                               aten::to         0.59%      12.913us        79.38%       1.743ms     290.578us       0.000us         0.00%      14.272us       2.379us             6  
+                                         aten::_to_copy         1.99%      43.750us        78.79%       1.731ms     288.426us       0.000us         0.00%      14.272us       2.379us             6  
+                                            aten::copy_         2.89%      63.562us        74.16%       1.629ms     271.469us      11.936us        61.85%      14.272us       2.379us             6  
+                                           aten::conv1d         0.44%       9.671us         7.66%     168.306us      56.102us       0.000us         0.00%       7.361us       2.454us             3  
+                                      aten::convolution         0.72%      15.890us         7.22%     158.635us      52.878us       0.000us         0.00%       7.361us       2.454us             3  
+                                     aten::_convolution         1.69%      37.102us         6.50%     142.745us      47.582us       0.000us         0.00%       7.361us       2.454us             3  
+                                aten::_conv_depthwise2d         1.60%      35.230us         3.77%      82.773us      27.591us       7.361us        38.15%       7.361us       2.454us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.361us        38.15%       7.361us       2.454us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.272us        32.50%       6.272us       2.091us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.664us        29.35%       5.664us       1.888us             3  
+                                Activity Buffer Request        68.26%       1.499ms        68.26%       1.499ms       1.499ms       2.336us        12.11%       2.336us       2.336us             1  
+                                    aten::empty_strided         2.64%      57.992us         2.64%      57.992us       9.665us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         4.12%      90.443us         4.12%      90.443us      10.049us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.47%      32.392us         1.88%      41.212us       4.579us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.64%      14.011us         0.64%      14.011us       0.934us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.55%      12.120us         0.55%      12.120us       4.040us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.50%      10.961us         0.50%      10.961us       3.654us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.43%       9.410us         0.51%      11.220us       3.740us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.196ms
+Self CUDA time total: 19.297us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     350.557us      1795.89%     350.557us     350.557us             1  
+                                            torch_eager         6.82%     130.236us        99.71%       1.905ms       1.905ms       0.000us         0.00%      21.632us      21.632us             1  
+                                               aten::to         0.35%       6.597us        84.97%       1.623ms     270.580us       0.000us         0.00%      13.728us       2.288us             6  
+                                         aten::_to_copy         1.27%      24.323us        84.63%       1.617ms     269.481us       0.000us         0.00%      13.728us       2.288us             6  
+                                            aten::copy_         2.68%      51.130us        81.67%       1.560ms     260.072us      11.616us        59.51%      13.728us       2.288us             6  
+                                           aten::conv1d         0.33%       6.400us         6.43%     122.914us      40.971us       0.000us         0.00%       7.904us       2.635us             3  
+                                      aten::convolution         0.52%       9.901us         6.10%     116.514us      38.838us       0.000us         0.00%       7.904us       2.635us             3  
+                                     aten::_convolution         1.28%      24.410us         5.58%     106.613us      35.538us       0.000us         0.00%       7.904us       2.635us             3  
+                                aten::_conv_depthwise2d         1.25%      23.932us         3.35%      63.983us      21.328us       7.904us        40.49%       7.904us       2.635us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us        40.49%       7.904us       2.635us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.080us        31.15%       6.080us       2.027us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.536us        28.36%       5.536us       1.845us             3  
+                                Activity Buffer Request        76.19%       1.456ms        76.19%       1.456ms       1.456ms       2.112us        10.82%       2.112us       2.112us             1  
+                                    aten::empty_strided         1.68%      32.131us         1.68%      32.131us       5.355us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.93%      75.003us         3.93%      75.003us       8.334us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.97%      18.540us         1.29%      24.620us       2.736us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.51%       9.711us         0.51%       9.711us       0.647us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.51%       9.650us         0.51%       9.650us       3.217us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.47%       9.000us         0.47%       9.000us       3.000us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.37%       7.100us         0.45%       8.560us       2.853us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.911ms
+Self CUDA time total: 19.520us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     379.390us      2047.55%     379.390us     379.390us             1  
+                                            torch_eager         8.20%     159.835us        99.65%       1.942ms       1.942ms       0.000us         0.00%      20.449us      20.449us             1  
+                                               aten::to         0.37%       7.179us        83.32%       1.624ms     270.686us       0.000us         0.00%      13.536us       2.256us             6  
+                                         aten::_to_copy         1.40%      27.213us        82.96%       1.617ms     269.489us       0.000us         0.00%      13.536us       2.256us             6  
+                                            aten::copy_         2.62%      51.160us        79.92%       1.558ms     259.635us      11.616us        62.69%      13.536us       2.256us             6  
+                                           aten::conv1d         0.34%       6.560us         6.49%     126.453us      42.151us       0.000us         0.00%       6.913us       2.304us             3  
+                                      aten::convolution         0.57%      11.119us         6.15%     119.893us      39.964us       0.000us         0.00%       6.913us       2.304us             3  
+                                     aten::_convolution         1.29%      25.191us         5.58%     108.774us      36.258us       0.000us         0.00%       6.913us       2.304us             3  
+                                aten::_conv_depthwise2d         1.16%      22.580us         3.36%      65.502us      21.834us       6.913us        37.31%       6.913us       2.304us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       6.913us        37.31%       6.913us       2.304us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        31.95%       5.920us       1.973us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.696us        30.74%       5.696us       1.899us             3  
+                                Activity Buffer Request        74.82%       1.458ms        74.82%       1.458ms       1.458ms       1.920us        10.36%       1.920us       1.920us             1  
+                                    aten::empty_strided         1.64%      31.911us         1.64%      31.911us       5.319us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.59%      70.043us         3.59%      70.043us       7.783us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.01%      19.612us         1.35%      26.392us       2.932us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.55%      10.750us         0.55%      10.750us       0.717us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.62%      12.182us         0.62%      12.182us       4.061us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.46%       8.910us         0.46%       8.910us       2.970us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.35%       6.890us         0.42%       8.260us       2.753us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.949ms
+Self CUDA time total: 18.529us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     340.058us      1736.41%     340.058us     340.058us             1  
+                                            torch_eager         6.15%     129.375us        99.74%       2.097ms       2.097ms       0.000us         0.00%      21.760us      21.760us             1  
+                                               aten::to         0.32%       6.700us        86.45%       1.818ms     303.002us       0.000us         0.00%      14.112us       2.352us             6  
+                                         aten::_to_copy         1.17%      24.651us        86.13%       1.811ms     301.886us       0.000us         0.00%      14.112us       2.352us             6  
+                                            aten::copy_         2.42%      50.883us        83.54%       1.757ms     292.785us      11.936us        60.95%      14.112us       2.352us             6  
+                                           aten::conv1d         0.30%       6.290us         5.74%     120.803us      40.268us       0.000us         0.00%       7.648us       2.549us             3  
+                                      aten::convolution         0.48%      10.020us         5.45%     114.513us      38.171us       0.000us         0.00%       7.648us       2.549us             3  
+                                     aten::_convolution         1.15%      24.209us         4.97%     104.493us      34.831us       0.000us         0.00%       7.648us       2.549us             3  
+                                aten::_conv_depthwise2d         1.00%      21.080us         2.93%      61.691us      20.564us       7.648us        39.05%       7.648us       2.549us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.648us        39.05%       7.648us       2.549us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.208us        31.70%       6.208us       2.069us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.728us        29.25%       5.728us       1.909us             3  
+                                Activity Buffer Request        71.15%       1.496ms        71.15%       1.496ms       1.496ms       2.176us        11.11%       2.176us       2.176us             1  
+                                    aten::empty_strided         1.42%      29.951us         1.42%      29.951us       4.992us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.98%     230.807us        10.98%     230.807us      25.645us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.94%      19.863us         1.21%      25.543us       2.838us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.46%       9.630us         0.46%       9.630us       0.642us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.50%      10.541us         0.50%      10.541us       3.514us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.42%       8.810us         0.42%       8.810us       2.937us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.35%       7.411us         0.44%       9.201us       3.067us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.103ms
+Self CUDA time total: 19.584us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     339.070us      1381.53%     339.070us     339.070us             1  
+                                            torch_eager         6.44%     132.135us        99.72%       2.045ms       2.045ms       0.000us         0.00%      26.814us      26.814us             1  
+                                               aten::to         0.33%       6.722us        86.08%       1.765ms     294.155us       0.000us         0.00%      15.262us       2.544us             6  
+                                         aten::_to_copy         1.20%      24.702us        85.75%       1.758ms     293.035us       0.000us         0.00%      15.262us       2.544us             6  
+                                            aten::copy_         2.39%      49.030us        83.04%       1.702ms     283.750us      12.991us        52.93%      15.262us       2.544us             6  
+                                           aten::conv1d         0.29%       5.850us         5.78%     118.603us      39.534us       0.000us         0.00%      11.552us       3.851us             3  
+                                      aten::convolution         0.55%      11.220us         5.50%     112.753us      37.584us       0.000us         0.00%      11.552us       3.851us             3  
+                                     aten::_convolution         1.18%      24.170us         4.95%     101.533us      33.844us       0.000us         0.00%      11.552us       3.851us             3  
+                                aten::_conv_depthwise2d         1.08%      22.212us         2.99%      61.273us      20.424us      11.552us        47.07%      11.552us       3.851us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      11.552us        47.07%      11.552us       3.851us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.655us        27.12%       6.655us       2.218us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.336us        25.82%       6.336us       2.112us             3  
+                                Activity Buffer Request        71.25%       1.461ms        71.25%       1.461ms       1.461ms       2.271us         9.25%       2.271us       2.271us             1  
+                                    aten::empty_strided         1.51%      31.010us         1.51%      31.010us       5.168us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.41%     213.527us        10.41%     213.527us      23.725us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.89%      18.350us         1.15%      23.660us       2.629us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.45%       9.131us         0.45%       9.131us       0.609us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.46%       9.481us         0.46%       9.481us       3.160us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.43%       8.760us         0.43%       8.760us       2.920us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.27%       5.520us         0.33%       6.850us       2.283us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.050ms
+Self CUDA time total: 24.543us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     339.129us      1305.15%     339.129us     339.129us             1  
+                                            torch_eager         6.29%     128.886us        99.74%       2.043ms       2.043ms       0.000us         0.00%      28.224us      28.224us             1  
+                                               aten::to         0.34%       6.902us        86.10%       1.763ms     293.882us       0.000us         0.00%      15.168us       2.528us             6  
+                                         aten::_to_copy         1.23%      25.190us        85.76%       1.756ms     292.731us       0.000us         0.00%      15.168us       2.528us             6  
+                                            aten::copy_         2.41%      49.270us        83.08%       1.701ms     283.571us      12.928us        49.75%      15.168us       2.528us             6  
+                                           aten::conv1d         0.31%       6.370us         5.92%     121.333us      40.444us       0.000us         0.00%      13.056us       4.352us             3  
+                                      aten::convolution         0.49%      10.120us         5.61%     114.963us      38.321us       0.000us         0.00%      13.056us       4.352us             3  
+                                     aten::_convolution         1.25%      25.500us         5.12%     104.843us      34.948us       0.000us         0.00%      13.056us       4.352us             3  
+                                aten::_conv_depthwise2d         1.08%      22.212us         3.04%      62.243us      20.748us      13.056us        50.25%      13.056us       4.352us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      13.056us        50.25%      13.056us       4.352us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.592us        25.37%       6.592us       2.197us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.336us        24.38%       6.336us       2.112us             3  
+                                Activity Buffer Request        71.41%       1.463ms        71.41%       1.463ms       1.463ms       2.240us         8.62%       2.240us       2.240us             1  
+                                    aten::empty_strided         1.45%      29.770us         1.45%      29.770us       4.962us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        10.25%     209.968us        10.25%     209.968us      23.330us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.92%      18.870us         1.21%      24.780us       2.753us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.47%       9.601us         0.47%       9.601us       0.640us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.51%      10.510us         0.51%      10.510us       3.503us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.45%       9.181us         0.45%       9.181us       3.060us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.32%       6.640us         0.40%       8.140us       2.713us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.048ms
+Self CUDA time total: 25.984us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     362.270us       942.63%     362.270us     362.270us             1  
+                                            torch_eager         7.50%     163.876us        99.75%       2.180ms       2.180ms       0.000us         0.00%      40.993us      40.993us             1  
+                                           aten::conv1d         0.34%       7.388us         5.94%     129.794us      43.265us       0.000us         0.00%      22.464us       7.488us             3  
+                                      aten::convolution         0.56%      12.301us         5.60%     122.406us      40.802us       0.000us         0.00%      22.464us       7.488us             3  
+                                     aten::_convolution         1.18%      25.829us         5.04%     110.105us      36.702us       0.000us         0.00%      22.464us       7.488us             3  
+                                aten::_conv_depthwise2d         1.07%      23.371us         2.94%      64.311us      21.437us      22.464us        58.45%      22.464us       7.488us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.464us        58.45%      22.464us       7.488us             3  
+                                               aten::to         0.36%       7.830us        84.95%       1.856ms     309.406us       0.000us         0.00%      18.529us       3.088us             6  
+                                         aten::_to_copy         1.44%      31.560us        84.59%       1.849ms     308.101us       0.000us         0.00%      18.529us       3.088us             6  
+                                            aten::copy_         2.41%      52.633us        81.64%       1.784ms     297.326us      15.968us        41.55%      18.529us       3.088us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.609us        22.40%       8.609us       2.870us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.359us        19.15%       7.359us       2.453us             3  
+                                Activity Buffer Request        65.39%       1.429ms        65.39%       1.429ms       1.429ms       2.561us         6.66%       2.561us       2.561us             1  
+                                    aten::empty_strided         1.51%      33.091us         1.51%      33.091us       5.515us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        14.87%     325.052us        14.87%     325.052us      36.117us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.00%      21.833us         1.21%      26.523us       2.947us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.39%       8.492us         0.39%       8.492us       0.566us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.44%       9.570us         0.44%       9.570us       3.190us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.40%       8.750us         0.40%       8.750us       2.917us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.37%       7.980us         0.45%       9.772us       3.257us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.185ms
+Self CUDA time total: 38.432us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     339.836us       827.74%     339.836us     339.836us             1  
+                                            torch_eager         6.54%     141.434us        99.74%       2.158ms       2.158ms       0.000us         0.00%      43.648us      43.648us             1  
+                                           aten::conv1d         0.28%       6.090us         5.53%     119.574us      39.858us       0.000us         0.00%      25.407us       8.469us             3  
+                                      aten::convolution         0.46%       9.939us         5.25%     113.484us      37.828us       0.000us         0.00%      25.407us       8.469us             3  
+                                     aten::_convolution         1.12%      24.214us         4.79%     103.545us      34.515us       0.000us         0.00%      25.407us       8.469us             3  
+                                aten::_conv_depthwise2d         1.05%      22.612us         2.94%      63.593us      21.198us      25.407us        61.88%      25.407us       8.469us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      25.407us        61.88%      25.407us       8.469us             3  
+                                               aten::to         0.29%       6.201us        86.38%       1.869ms     311.424us       0.000us         0.00%      18.241us       3.040us             6  
+                                         aten::_to_copy         1.18%      25.424us        86.09%       1.862ms     310.391us       0.000us         0.00%      18.241us       3.040us             6  
+                                            aten::copy_         2.40%      51.862us        83.52%       1.807ms     301.107us      15.649us        38.12%      18.241us       3.040us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.320us        20.27%       8.320us       2.773us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.329us        17.85%       7.329us       2.443us             3  
+                                Activity Buffer Request        68.07%       1.472ms        68.07%       1.472ms       1.472ms       2.592us         6.31%       2.592us       2.592us             1  
+                                    aten::empty_strided         1.40%      30.280us         1.40%      30.280us       5.047us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        14.06%     304.169us        14.06%     304.169us      33.797us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.84%      18.230us         1.08%      23.418us       2.602us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.40%       8.619us         0.40%       8.619us       0.575us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.48%      10.370us         0.48%      10.370us       3.457us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.41%       8.770us         0.41%       8.770us       2.923us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.26%       5.659us         0.32%       6.990us       2.330us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.163ms
+Self CUDA time total: 41.056us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     338.560us       329.80%     338.560us     338.560us             1  
+                                            torch_eager         6.25%     131.427us        99.74%       2.098ms       2.098ms       0.000us         0.00%     108.608us     108.608us             1  
+                                           aten::conv1d         0.29%       6.110us         5.71%     120.083us      40.028us       0.000us         0.00%      70.496us      23.499us             3  
+                                      aten::convolution         0.47%       9.940us         5.42%     113.973us      37.991us       0.000us         0.00%      70.496us      23.499us             3  
+                                     aten::_convolution         1.11%      23.441us         4.94%     104.033us      34.678us       0.000us         0.00%      70.496us      23.499us             3  
+                                aten::_conv_depthwise2d         1.04%      21.830us         2.93%      61.652us      20.551us      70.496us        68.67%      70.496us      23.499us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      70.496us        68.67%      70.496us      23.499us             3  
+                                               aten::to         0.30%       6.292us        86.43%       1.818ms     303.059us       0.000us         0.00%      38.112us       6.352us             6  
+                                         aten::_to_copy         1.17%      24.539us        86.13%       1.812ms     302.010us       0.000us         0.00%      38.112us       6.352us             6  
+                                            aten::copy_         2.47%      51.869us        83.58%       1.758ms     293.072us      32.160us        31.33%      38.112us       6.352us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.568us        17.11%      17.568us       5.856us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.592us        14.21%      14.592us       4.864us             3  
+                                Activity Buffer Request        67.63%       1.423ms        67.63%       1.423ms       1.423ms       5.952us         5.80%       5.952us       5.952us             1  
+                                    aten::empty_strided         1.38%      29.091us         1.38%      29.091us       4.849us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        14.47%     304.542us        14.47%     304.542us      33.838us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.91%      19.049us         1.17%      24.579us       2.731us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.43%       9.070us         0.43%       9.070us       0.605us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.49%      10.351us         0.49%      10.351us       3.450us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.41%       8.621us         0.41%       8.621us       2.874us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.38%       8.050us         0.45%       9.470us       3.157us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.104ms
+Self CUDA time total: 102.656us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     340.578us       301.93%     340.578us     340.578us             1  
+                                            torch_eager         6.29%     133.214us        99.74%       2.113ms       2.113ms       0.000us         0.00%     118.752us     118.752us             1  
+                                           aten::conv1d         0.31%       6.499us         5.66%     119.974us      39.991us       0.000us         0.00%      80.576us      26.859us             3  
+                                      aten::convolution         0.47%       9.880us         5.36%     113.475us      37.825us       0.000us         0.00%      80.576us      26.859us             3  
+                                     aten::_convolution         1.21%      25.730us         4.89%     103.595us      34.532us       0.000us         0.00%      80.576us      26.859us             3  
+                                aten::_conv_depthwise2d         1.01%      21.361us         2.87%      60.832us      20.277us      80.576us        71.43%      80.576us      26.859us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      80.576us        71.43%      80.576us      26.859us             3  
+                                               aten::to         0.33%       7.060us        86.42%       1.831ms     305.149us       0.000us         0.00%      38.176us       6.363us             6  
+                                         aten::_to_copy         1.15%      24.352us        86.09%       1.824ms     303.972us       0.000us         0.00%      38.176us       6.363us             6  
+                                            aten::copy_         2.34%      49.642us        83.57%       1.770ms     295.075us      32.224us        28.57%      38.176us       6.363us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.664us        15.66%      17.664us       5.888us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.560us        12.91%      14.560us       4.853us             3  
+                                Activity Buffer Request        68.62%       1.454ms        68.62%       1.454ms       1.454ms       5.952us         5.28%       5.952us       5.952us             1  
+                                    aten::empty_strided         1.37%      29.031us         1.37%      29.031us       4.838us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        13.59%     287.970us        13.59%     287.970us      31.997us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.89%      18.772us         1.17%      24.871us       2.763us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.45%       9.520us         0.45%       9.520us       0.635us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.46%       9.850us         0.46%       9.850us       3.283us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.41%       8.670us         0.41%       8.670us       2.890us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.32%       6.821us         0.38%       8.112us       2.704us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.119ms
+Self CUDA time total: 112.800us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         6.32%     133.665us        99.60%       2.106ms       2.106ms       0.000us         0.00%     433.181us     433.181us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     423.869us       107.93%     423.869us     423.869us             1  
+                                           aten::conv1d         0.30%       6.441us         5.98%     126.475us      42.158us       0.000us         0.00%     252.190us      84.063us             3  
+                                      aten::convolution         0.49%      10.391us         5.68%     120.034us      40.011us       0.000us         0.00%     252.190us      84.063us             3  
+                                     aten::_convolution         1.19%      25.110us         5.19%     109.643us      36.548us       0.000us         0.00%     252.190us      84.063us             3  
+                                aten::_conv_depthwise2d         1.07%      22.550us         3.14%      66.363us      22.121us     252.190us        64.21%     252.190us      84.063us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     252.190us        64.21%     252.190us      84.063us             3  
+                                               aten::to         0.33%       6.989us        85.86%       1.815ms     302.520us       0.000us         0.00%     180.991us      30.165us             6  
+                                         aten::_to_copy         1.18%      24.921us        85.53%       1.808ms     301.355us       0.000us         0.00%     180.991us      30.165us             6  
+                                            aten::copy_         2.39%      50.532us        82.93%       1.753ms     292.204us     140.543us        35.79%     180.991us      30.165us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     100.768us        25.66%     100.768us      33.589us             3  
+                                Activity Buffer Request        67.47%       1.426ms        67.47%       1.426ms       1.426ms      40.448us        10.30%      40.448us      40.448us             1  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      39.775us        10.13%      39.775us      13.258us             3  
+                                    aten::empty_strided         1.42%      29.990us         1.42%      29.990us       4.998us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        14.15%     299.142us        14.15%     299.142us      33.238us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.92%      19.400us         1.21%      25.500us       2.833us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.49%      10.430us         0.49%      10.430us       0.695us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.55%      11.580us         0.55%      11.580us       3.860us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.44%       9.361us         0.44%       9.361us       3.120us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.34%       7.110us         0.42%       8.900us       2.967us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.114ms
+Self CUDA time total: 392.733us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         6.65%     143.166us        97.03%       2.090ms       2.090ms       0.000us         0.00%     486.301us     486.301us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     477.853us       106.88%     477.853us     477.853us             1  
+                                           aten::conv1d         0.33%       7.110us         5.88%     126.575us      42.192us       0.000us         0.00%     298.557us      99.519us             3  
+                                      aten::convolution         0.51%      11.062us         5.55%     119.465us      39.822us       0.000us         0.00%     298.557us      99.519us             3  
+                                     aten::_convolution         1.16%      25.071us         5.03%     108.403us      36.134us       0.000us         0.00%     298.557us      99.519us             3  
+                                aten::_conv_depthwise2d         1.05%      22.671us         3.05%      65.592us      21.864us     298.557us        66.78%     298.557us      99.519us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     298.557us        66.78%     298.557us      99.519us             3  
+                                               aten::to         0.33%       7.030us        83.12%       1.790ms     298.407us       0.000us         0.00%     187.744us      31.291us             6  
+                                         aten::_to_copy         1.22%      26.183us        82.80%       1.783ms     297.235us       0.000us         0.00%     187.744us      31.291us             6  
+                                            aten::copy_         2.41%      51.979us        80.11%       1.726ms     287.603us     148.544us        33.22%     187.744us      31.291us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     108.768us        24.33%     108.768us      36.256us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      39.776us         8.90%      39.776us      13.259us             3  
+                                Activity Buffer Request        66.10%       1.424ms        66.10%       1.424ms       1.424ms      39.200us         8.77%      39.200us      39.200us             1  
+                                    aten::empty_strided         1.47%      31.611us         1.47%      31.611us       5.268us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        12.61%     271.569us        12.61%     271.569us      30.174us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.93%      19.971us         1.21%      26.011us       2.890us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.45%       9.711us         0.45%       9.711us       0.647us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.47%      10.061us         0.47%      10.061us       3.354us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.51%      11.040us         0.51%      11.040us       3.680us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.28%       5.950us         0.34%       7.400us       2.467us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.154ms
+Self CUDA time total: 447.101us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     355.165us      1897.25%     355.165us     355.165us             1  
+                                            torch_eager        15.24%     136.376us        99.32%     888.600us     888.600us       0.000us         0.00%      20.608us      20.608us             1  
+                                               aten::to         0.80%       7.121us        66.93%     598.831us      99.805us       0.000us         0.00%      13.376us       2.229us             6  
+                                         aten::_to_copy         2.95%      26.380us        66.13%     591.710us      98.618us       0.000us         0.00%      13.376us       2.229us             6  
+                                            aten::copy_         5.90%      52.793us        59.34%     530.948us      88.491us      11.488us        61.37%      13.376us       2.229us             6  
+                                           aten::conv1d         0.68%       6.050us        13.88%     124.163us      41.388us       0.000us         0.00%       7.232us       2.411us             3  
+                                      aten::convolution         1.23%      10.987us        13.20%     118.113us      39.371us       0.000us         0.00%       7.232us       2.411us             3  
+                                     aten::_convolution         2.78%      24.854us        11.97%     107.126us      35.709us       0.000us         0.00%       7.232us       2.411us             3  
+                                aten::_conv_depthwise2d         2.73%      24.470us         7.32%      65.481us      21.827us       7.232us        38.63%       7.232us       2.411us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.232us        38.63%       7.232us       2.411us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        31.62%       5.920us       1.973us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.568us        29.74%       5.568us       1.856us             3  
+                                Activity Buffer Request        26.68%     238.708us        26.68%     238.708us     238.708us       1.888us        10.09%       1.888us       1.888us             1  
+                                    aten::empty_strided         3.84%      34.382us         3.84%      34.382us       5.730us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        29.10%     260.398us        29.10%     260.398us      28.933us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.02%      18.071us         2.57%      22.961us       2.551us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.97%       8.709us         0.97%       8.709us       0.581us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.22%      10.910us         1.22%      10.910us       3.637us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.02%       9.150us         1.02%       9.150us       3.050us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.75%       6.751us         0.92%       8.220us       2.740us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 894.710us
+Self CUDA time total: 18.720us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     323.578us      1674.05%     323.578us     323.578us             1  
+                                            torch_eager        14.45%     120.436us        99.39%     828.559us     828.559us       0.000us         0.00%      21.217us      21.217us             1  
+                                               aten::to         0.75%       6.271us        67.77%     564.939us      94.156us       0.000us         0.00%      13.377us       2.230us             6  
+                                         aten::_to_copy         2.76%      22.992us        67.02%     558.668us      93.111us       0.000us         0.00%      13.377us       2.230us             6  
+                                            aten::copy_         5.96%      49.722us        60.74%     506.327us      84.388us      11.489us        59.44%      13.377us       2.230us             6  
+                                           aten::conv1d         0.75%       6.211us        13.83%     115.254us      38.418us       0.000us         0.00%       7.840us       2.613us             3  
+                                      aten::convolution         1.19%       9.930us        13.08%     109.043us      36.348us       0.000us         0.00%       7.840us       2.613us             3  
+                                     aten::_convolution         2.77%      23.131us        11.89%      99.113us      33.038us       0.000us         0.00%       7.840us       2.613us             3  
+                                aten::_conv_depthwise2d         2.53%      21.092us         7.21%      60.132us      20.044us       7.840us        40.56%       7.840us       2.613us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.840us        40.56%       7.840us       2.613us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.857us        30.30%       5.857us       1.952us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.632us        29.14%       5.632us       1.877us             3  
+                                Activity Buffer Request        27.26%     227.207us        27.26%     227.207us     227.207us       1.888us         9.77%       1.888us       1.888us             1  
+                                    aten::empty_strided         3.52%      29.349us         3.52%      29.349us       4.891us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        29.92%     249.418us        29.92%     249.418us      27.713us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.13%      17.749us         2.80%      23.370us       2.597us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.11%       9.261us         1.11%       9.261us       0.617us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.16%       9.660us         1.16%       9.660us       3.220us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.12%       9.360us         1.12%       9.360us       3.120us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.70%       5.810us         0.88%       7.370us       2.457us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 833.619us
+Self CUDA time total: 19.329us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     326.394us      1677.60%     326.394us     326.394us             1  
+                                            torch_eager        14.78%     122.914us        99.34%     825.919us     825.919us       0.000us         0.00%      21.632us      21.632us             1  
+                                               aten::to         0.79%       6.552us        67.16%     558.381us      93.064us       0.000us         0.00%      14.368us       2.395us             6  
+                                         aten::_to_copy         2.94%      24.430us        66.37%     551.829us      91.971us       0.000us         0.00%      14.368us       2.395us             6  
+                                            aten::copy_         5.83%      48.462us        59.95%     498.427us      83.071us      12.192us        62.66%      14.368us       2.395us             6  
+                                           aten::conv1d         0.71%       5.939us        14.00%     116.404us      38.801us       0.000us         0.00%       7.264us       2.421us             3  
+                                      aten::convolution         1.18%       9.811us        13.29%     110.465us      36.822us       0.000us         0.00%       7.264us       2.421us             3  
+                                     aten::_convolution         2.85%      23.732us        12.11%     100.654us      33.551us       0.000us         0.00%       7.264us       2.421us             3  
+                                aten::_conv_depthwise2d         2.52%      20.910us         7.24%      60.232us      20.077us       7.264us        37.34%       7.264us       2.421us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.264us        37.34%       7.264us       2.421us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.304us        32.40%       6.304us       2.101us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.888us        30.26%       5.888us       1.963us             3  
+                                Activity Buffer Request        26.68%     221.788us        26.68%     221.788us     221.788us       2.176us        11.18%       2.176us       2.176us             1  
+                                    aten::empty_strided         3.48%      28.972us         3.48%      28.972us       4.829us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        30.05%     249.819us        30.05%     249.819us      27.758us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.04%      16.929us         2.67%      22.200us       2.467us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.07%       8.901us         1.07%       8.901us       0.593us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.15%       9.570us         1.15%       9.570us       3.190us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.98%       8.110us         0.98%       8.110us       2.703us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.86%       7.190us         1.02%       8.500us       2.833us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 831.399us
+Self CUDA time total: 19.456us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     356.696us      1774.96%     356.696us     356.696us             1  
+                                            torch_eager        13.86%     123.804us        99.36%     887.440us     887.440us       0.000us         0.00%      22.272us      22.272us             1  
+                                               aten::to         0.71%       6.320us        66.62%     595.061us      99.177us       0.000us         0.00%      14.368us       2.395us             6  
+                                         aten::_to_copy         2.82%      25.151us        65.92%     588.741us      98.124us       0.000us         0.00%      14.368us       2.395us             6  
+                                            aten::copy_         5.73%      51.172us        59.67%     532.958us      88.826us      12.192us        60.67%      14.368us       2.395us             6  
+                                           aten::conv1d         0.70%       6.210us        15.70%     140.195us      46.732us       0.000us         0.00%       7.904us       2.635us             3  
+                                      aten::convolution         1.11%       9.881us        15.00%     133.985us      44.662us       0.000us         0.00%       7.904us       2.635us             3  
+                                     aten::_convolution         2.74%      24.510us        13.89%     124.104us      41.368us       0.000us         0.00%       7.904us       2.635us             3  
+                                aten::_conv_depthwise2d         2.70%      24.090us         9.26%      82.742us      27.581us       7.904us        39.33%       7.904us       2.635us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us        39.33%       7.904us       2.635us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.240us        31.05%       6.240us       2.080us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.952us        29.62%       5.952us       1.984us             3  
+                                Activity Buffer Request        28.94%     258.459us        28.94%     258.459us     258.459us       2.176us        10.83%       2.176us       2.176us             1  
+                                    aten::empty_strided         3.43%      30.632us         3.43%      30.632us       5.105us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        29.46%     263.129us        29.46%     263.129us      29.237us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.97%      17.620us         2.61%      23.310us       2.590us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.07%       9.580us         1.07%       9.580us       0.639us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.09%       9.720us         1.09%       9.720us       3.240us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.02%       9.130us         1.02%       9.130us       3.043us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.75%       6.702us         0.94%       8.422us       2.807us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 893.171us
+Self CUDA time total: 20.096us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     332.730us       926.72%     332.730us     332.730us             1  
+                                            torch_eager        14.27%     126.064us        99.42%     878.341us     878.341us       0.000us         0.00%      38.496us      38.496us             1  
+                                           aten::conv1d         0.64%       5.671us        13.39%     118.255us      39.418us       0.000us         0.00%      20.096us       6.699us             3  
+                                      aten::convolution         1.11%       9.840us        12.74%     112.584us      37.528us       0.000us         0.00%      20.096us       6.699us             3  
+                                     aten::_convolution         2.79%      24.681us        11.63%     102.744us      34.248us       0.000us         0.00%      20.096us       6.699us             3  
+                                aten::_conv_depthwise2d         2.42%      21.390us         7.02%      62.061us      20.687us      20.096us        55.97%      20.096us       6.699us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      20.096us        55.97%      20.096us       6.699us             3  
+                                               aten::to         0.72%       6.320us        68.61%     606.182us     101.030us       0.000us         0.00%      18.400us       3.067us             6  
+                                         aten::_to_copy         2.82%      24.900us        67.90%     599.862us      99.977us       0.000us         0.00%      18.400us       3.067us             6  
+                                            aten::copy_         5.62%      49.645us        61.77%     545.702us      90.950us      15.808us        44.03%      18.400us       3.067us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.448us        23.53%       8.448us       2.816us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.360us        20.50%       7.360us       2.453us             3  
+                                Activity Buffer Request        29.42%     259.919us        29.42%     259.919us     259.919us       2.592us         7.22%       2.592us       2.592us             1  
+                                    aten::empty_strided         3.31%      29.260us         3.31%      29.260us       4.877us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        29.15%     257.559us        29.15%     257.559us      28.618us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.02%      17.842us         2.68%      23.662us       2.629us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.05%       9.271us         1.05%       9.271us       0.618us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.19%      10.540us         1.19%      10.540us       3.513us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.99%       8.710us         0.99%       8.710us       2.903us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.65%       5.719us         0.80%       7.050us       2.350us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 883.481us
+Self CUDA time total: 35.904us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     337.888us       888.80%     337.888us     337.888us             1  
+                                            torch_eager         6.31%     128.615us        99.74%       2.033ms       2.033ms       0.000us         0.00%      40.576us      40.576us             1  
+                                           aten::conv1d         0.31%       6.349us         5.98%     121.885us      40.628us       0.000us         0.00%      22.304us       7.435us             3  
+                                      aten::convolution         0.53%      10.852us         5.67%     115.536us      38.512us       0.000us         0.00%      22.304us       7.435us             3  
+                                     aten::_convolution         1.24%      25.291us         5.14%     104.684us      34.895us       0.000us         0.00%      22.304us       7.435us             3  
+                                aten::_conv_depthwise2d         1.08%      22.031us         3.01%      61.431us      20.477us      22.304us        58.67%      22.304us       7.435us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.304us        58.67%      22.304us       7.435us             3  
+                                               aten::to         0.34%       6.829us        86.09%       1.755ms     292.477us       0.000us         0.00%      18.272us       3.045us             6  
+                                         aten::_to_copy         1.20%      24.424us        85.75%       1.748ms     291.339us       0.000us         0.00%      18.272us       3.045us             6  
+                                            aten::copy_         2.48%      50.501us        83.10%       1.694ms     282.331us      15.712us        41.33%      18.272us       3.045us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.320us        21.89%       8.320us       2.773us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.392us        19.44%       7.392us       2.464us             3  
+                                Activity Buffer Request        69.75%       1.422ms        69.75%       1.422ms       1.422ms       2.560us         6.73%       2.560us       2.560us             1  
+                                    aten::empty_strided         1.45%      29.621us         1.45%      29.621us       4.937us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        11.90%     242.506us        11.90%     242.506us      26.945us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.92%      18.701us         1.17%      23.851us       2.650us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.43%       8.710us         0.43%       8.710us       0.581us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.48%       9.800us         0.48%       9.800us       3.267us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.43%       8.710us         0.43%       8.710us       2.903us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.35%       7.191us         0.42%       8.621us       2.874us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 2.038ms
+Self CUDA time total: 38.016us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     362.972us       567.16%     362.972us     362.972us             1  
+                                            torch_eager        14.84%     128.544us        99.34%     860.680us     860.680us       0.000us         0.00%      68.061us      68.061us             1  
+                                           aten::conv1d         0.70%       6.079us        16.52%     143.165us      47.722us       0.000us         0.00%      41.728us      13.909us             3  
+                                      aten::convolution         3.42%      29.613us        15.82%     137.086us      45.695us       0.000us         0.00%      41.728us      13.909us             3  
+                                     aten::_convolution         2.86%      24.759us        12.40%     107.473us      35.824us       0.000us         0.00%      41.728us      13.909us             3  
+                                aten::_conv_depthwise2d         2.59%      22.439us         7.67%      66.492us      22.164us      41.728us        65.20%      41.728us      13.909us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      41.728us        65.20%      41.728us      13.909us             3  
+                                               aten::to         0.77%       6.631us        64.71%     560.621us      93.437us       0.000us         0.00%      26.333us       4.389us             6  
+                                         aten::_to_copy         2.80%      24.253us        63.94%     553.990us      92.332us       0.000us         0.00%      26.333us       4.389us             6  
+                                            aten::copy_         5.80%      50.240us        57.50%     498.196us      83.033us      22.270us        34.80%      26.333us       4.389us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.903us        18.60%      11.903us       3.968us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.367us        16.20%      10.367us       3.456us             3  
+                                Activity Buffer Request        26.05%     225.728us        26.05%     225.728us     225.728us       4.063us         6.35%       4.063us       4.063us             1  
+                                    aten::empty_strided         3.64%      31.541us         3.64%      31.541us       5.257us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        28.31%     245.279us        28.31%     245.279us      27.253us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.11%      18.263us         2.74%      23.752us       2.639us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.06%       9.199us         1.06%       9.199us       0.613us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.26%      10.941us         1.26%      10.941us       3.647us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.16%      10.061us         1.16%      10.061us       3.354us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.66%       5.740us         0.85%       7.330us       2.443us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 866.380us
+Self CUDA time total: 63.998us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     357.311us       512.91%     357.311us     357.311us             1  
+                                            torch_eager        20.96%     191.619us        99.38%     908.662us     908.662us       0.000us         0.00%      73.696us      73.696us             1  
+                                           aten::conv1d         0.63%       5.760us        15.23%     139.294us      46.431us       0.000us         0.00%      47.296us      15.765us             3  
+                                      aten::convolution         2.87%      26.271us        14.60%     133.534us      44.511us       0.000us         0.00%      47.296us      15.765us             3  
+                                     aten::_convolution         2.77%      25.360us        11.73%     107.263us      35.754us       0.000us         0.00%      47.296us      15.765us             3  
+                                aten::_conv_depthwise2d         2.38%      21.722us         7.17%      65.523us      21.841us      47.296us        67.89%      47.296us      15.765us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      47.296us        67.89%      47.296us      15.765us             3  
+                                               aten::to         0.73%       6.650us        60.08%     549.318us      91.553us       0.000us         0.00%      26.400us       4.400us             6  
+                                         aten::_to_copy         2.63%      24.032us        59.35%     542.668us      90.445us       0.000us         0.00%      26.400us       4.400us             6  
+                                            aten::copy_         5.57%      50.922us        53.46%     488.786us      81.464us      22.368us        32.11%      26.400us       4.400us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.872us        17.04%      11.872us       3.957us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.496us        15.07%      10.496us       3.499us             3  
+                                Activity Buffer Request        23.91%     218.617us        23.91%     218.617us     218.617us       4.032us         5.79%       4.032us       4.032us             1  
+                                    aten::empty_strided         3.26%      29.850us         3.26%      29.850us       4.975us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        26.57%     242.937us        26.57%     242.937us      26.993us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.04%      18.652us         2.65%      24.251us       2.695us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.01%       9.230us         1.01%       9.230us       0.615us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.08%       9.870us         1.08%       9.870us       3.290us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.12%      10.241us         1.12%      10.241us       3.414us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.63%       5.780us         0.80%       7.270us       2.423us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 914.323us
+Self CUDA time total: 69.664us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     348.092us       187.26%     348.092us     348.092us             1  
+                                            torch_eager        14.76%     124.374us        99.29%     836.558us     836.558us       0.000us         0.00%     195.870us     195.870us             1  
+                                           aten::conv1d         0.70%       5.900us        14.42%     121.504us      40.501us       0.000us         0.00%     133.406us      44.469us             3  
+                                      aten::convolution         1.14%       9.610us        13.72%     115.604us      38.535us       0.000us         0.00%     133.406us      44.469us             3  
+                                     aten::_convolution         2.88%      24.263us        12.58%     105.994us      35.331us       0.000us         0.00%     133.406us      44.469us             3  
+                                aten::_conv_depthwise2d         2.73%      23.010us         7.80%      65.750us      21.917us     133.406us        71.77%     133.406us      44.469us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     133.406us        71.77%     133.406us      44.469us             3  
+                                               aten::to         0.74%       6.220us        66.83%     563.060us      93.843us       0.000us         0.00%      62.464us      10.411us             6  
+                                         aten::_to_copy         2.83%      23.861us        66.09%     556.840us      92.807us       0.000us         0.00%      62.464us      10.411us             6  
+                                            aten::copy_         6.03%      50.810us        59.73%     503.287us      83.881us      52.480us        28.23%      62.464us      10.411us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.600us        15.92%      29.600us       9.867us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.880us        12.31%      22.880us       7.627us             3  
+                                Activity Buffer Request        25.69%     216.468us        25.69%     216.468us     216.468us       9.984us         5.37%       9.984us       9.984us             1  
+                                    aten::empty_strided         3.52%      29.692us         3.52%      29.692us       4.949us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        30.59%     257.739us        30.59%     257.739us      28.638us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.08%      17.540us         2.73%      23.000us       2.556us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.12%       9.412us         1.12%       9.412us       0.627us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.20%      10.110us         1.20%      10.110us       3.370us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.29%      10.900us         1.29%      10.900us       3.633us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.68%       5.719us         0.88%       7.451us       2.484us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 842.539us
+Self CUDA time total: 185.886us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     348.403us       166.18%     348.403us     348.403us             1  
+                                            torch_eager        14.60%     122.924us        99.33%     836.209us     836.209us       0.000us         0.00%     223.383us     223.383us             1  
+                                           aten::conv1d         0.69%       5.779us        14.01%     117.955us      39.318us       0.000us         0.00%     153.883us      51.294us             3  
+                                      aten::convolution         1.25%      10.491us        13.32%     112.176us      37.392us       0.000us         0.00%     153.883us      51.294us             3  
+                                     aten::_convolution         2.91%      24.484us        12.08%     101.685us      33.895us       0.000us         0.00%     153.883us      51.294us             3  
+                                aten::_conv_depthwise2d         2.49%      20.928us         7.14%      60.070us      20.023us     153.883us        73.40%     153.883us      51.294us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     153.883us        73.40%     153.883us      51.294us             3  
+                                               aten::to         0.73%       6.179us        67.37%     567.200us      94.533us       0.000us         0.00%      69.500us      11.583us             6  
+                                         aten::_to_copy         2.75%      23.132us        66.64%     561.021us      93.504us       0.000us         0.00%      69.500us      11.583us             6  
+                                            aten::copy_         5.91%      49.740us        60.39%     508.377us      84.729us      55.773us        26.60%      69.500us      11.583us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      32.927us        15.71%      32.927us      10.976us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.846us        10.90%      22.846us       7.615us             3  
+                                Activity Buffer Request        29.09%     244.869us        29.09%     244.869us     244.869us      13.727us         6.55%      13.727us      13.727us             1  
+                                    aten::empty_strided         3.51%      29.512us         3.51%      29.512us       4.919us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        27.84%     234.420us        27.84%     234.420us      26.047us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.13%      17.973us         2.77%      23.320us       2.591us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.09%       9.167us         1.09%       9.167us       0.611us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.12%       9.440us         1.12%       9.440us       3.147us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.07%       9.050us         1.07%       9.050us       3.017us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.85%       7.121us         1.02%       8.601us       2.867us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 841.880us
+Self CUDA time total: 209.656us
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         7.22%     135.785us        57.39%       1.079ms       1.079ms       0.000us         0.00%       1.518ms       1.518ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.419ms       100.41%       1.419ms       1.419ms             1  
+                                               aten::to         0.37%       6.901us        40.86%     768.526us     128.088us       0.000us         0.00%     823.221us     137.204us             6  
+                                         aten::_to_copy         1.63%      30.742us        40.49%     761.625us     126.938us       0.000us         0.00%     823.221us     137.204us             6  
+                                            aten::copy_         2.94%      55.302us        27.81%     523.157us      87.193us     717.942us        50.81%     823.221us     137.204us             6  
+                                           aten::conv1d         0.33%       6.280us         6.71%     126.144us      42.048us       0.000us         0.00%     695.094us     231.698us             3  
+                                      aten::convolution         0.57%      10.750us         6.37%     119.864us      39.955us       0.000us         0.00%     695.094us     231.698us             3  
+                                     aten::_convolution         1.35%      25.400us         5.80%     109.114us      36.371us       0.000us         0.00%     695.094us     231.698us             3  
+                                aten::_conv_depthwise2d         1.19%      22.332us         3.55%      66.763us      22.254us     695.094us        49.19%     695.094us     231.698us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     695.094us        49.19%     695.094us     231.698us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     411.706us        29.14%     411.706us     137.235us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     306.236us        21.67%     306.236us     102.079us             3  
+                                Activity Buffer Request        12.99%     244.238us        12.99%     244.238us     244.238us     105.279us         7.45%     105.279us     105.279us             1  
+                                    aten::empty_strided         2.17%      40.811us        11.04%     207.726us      34.621us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        13.13%     246.997us        13.13%     246.997us      27.444us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.97%      37.133us         2.36%      44.413us       4.935us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.58%      10.889us         0.58%      10.889us       0.726us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.53%      10.051us         0.53%      10.051us       3.350us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.58%      11.000us         0.58%      11.000us       3.667us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.34%       6.350us         0.41%       7.700us       2.567us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 1.881ms
+Self CUDA time total: 1.413ms
+
+
+
+======================================================================
+PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4
+======================================================================
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+                                            torch_eager         4.25%     132.984us        66.63%       2.083ms       2.083ms       0.000us         0.00%       1.503ms       1.503ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.434ms       100.41%       1.434ms       1.434ms             1  
+                                               aten::to         0.21%       6.470us        57.53%       1.798ms     299.656us       0.000us         0.00%     765.147us     127.524us             6  
+                                         aten::_to_copy         0.80%      25.009us        57.32%       1.791ms     298.577us       0.000us         0.00%     765.147us     127.524us             6  
+                                            aten::copy_         1.51%      47.155us        55.55%       1.736ms     289.360us     690.492us        48.35%     765.147us     127.524us             6  
+                                           aten::conv1d         0.20%       6.231us         3.91%     122.325us      40.775us       0.000us         0.00%     737.724us     245.908us             3  
+                                      aten::convolution         0.32%       9.920us         3.71%     116.094us      38.698us       0.000us         0.00%     737.724us     245.908us             3  
+                                     aten::_convolution         0.82%      25.623us         3.40%     106.174us      35.391us       0.000us         0.00%     737.724us     245.908us             3  
+                                aten::_conv_depthwise2d         0.70%      21.899us         1.98%      62.011us      20.670us     737.724us        51.65%     737.724us     245.908us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     737.724us        51.65%     737.724us     245.908us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     398.046us        27.87%     398.046us     132.682us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     292.446us        20.48%     292.446us      97.482us             3  
+                                Activity Buffer Request        47.19%       1.475ms        47.19%       1.475ms       1.475ms      74.655us         5.23%      74.655us      74.655us             1  
+                                    aten::empty_strided         0.97%      30.293us         0.97%      30.293us       5.049us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         7.52%     235.026us         7.52%     235.026us      26.114us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.60%      18.740us         0.79%      24.820us       2.758us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.32%      10.019us         0.32%      10.019us       0.668us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.32%       9.882us         0.32%       9.882us       3.294us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.29%       9.220us         0.29%       9.220us       3.073us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.24%       7.471us         0.29%       9.160us       3.053us       0.000us         0.00%       0.000us       0.000us             3  
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
+Self CPU time total: 3.125ms
+Self CUDA time total: 1.428ms
+
+
+impl                     wl                  p50(ms)  ok
+torch_eager              cuda_B2_D2048_S128_W2     0.08  True
+torch_eager              cuda_B2_D2048_S128_W4     0.09  True
+torch_eager              cuda_B2_D2048_S2048_W2     0.15  True
+torch_eager              cuda_B2_D2048_S2048_W4     0.16  True
+torch_eager              cuda_B2_D2048_S512_W2     0.08  True
+torch_eager              cuda_B2_D2048_S512_W4     0.08  True
+torch_eager              cuda_B2_D64_S128_W2     0.07  True
+torch_eager              cuda_B2_D64_S128_W4     0.09  True
+torch_eager              cuda_B2_D64_S2048_W2     0.09  True
+torch_eager              cuda_B2_D64_S2048_W4     0.08  True
+torch_eager              cuda_B2_D64_S512_W2     0.09  True
+torch_eager              cuda_B2_D64_S512_W4     0.09  True
+torch_eager              cuda_B4_D2048_S128_W2     0.09  True
+torch_eager              cuda_B4_D2048_S128_W4     0.08  True
+torch_eager              cuda_B4_D2048_S2048_W2     0.49  True
+torch_eager              cuda_B4_D2048_S2048_W4     0.50  True
+torch_eager              cuda_B4_D2048_S512_W2     0.09  True
+torch_eager              cuda_B4_D2048_S512_W4     0.10  True
+torch_eager              cuda_B4_D64_S128_W2     0.08  True
+torch_eager              cuda_B4_D64_S128_W4     0.08  True
+torch_eager              cuda_B4_D64_S2048_W2     0.08  True
+torch_eager              cuda_B4_D64_S2048_W4     0.09  True
+torch_eager              cuda_B4_D64_S512_W2     0.08  True
+torch_eager              cuda_B4_D64_S512_W4     0.08  True
+
+
+
▶ UV Install Logs
+ +
+
+

Artifacts:

+causal_conv1d.jsonl +
+
+
+