diff --git "a/causal_conv1d/impls/torch_causal_conv1d.html" "b/causal_conv1d/impls/torch_causal_conv1d.html" --- "a/causal_conv1d/impls/torch_causal_conv1d.html" +++ "b/causal_conv1d/impls/torch_causal_conv1d.html" @@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.23s +Cell: nv | 0.25s | Raw @@ -3887,7 +3887,7 @@ Cell: nv | 0.23s
-
Wed Oct 29 00:36:08 2025       
+
Wed Oct 29 04:14:16 2025       
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
 |-----------------------------------------+------------------------+----------------------+
@@ -3896,7 +3896,7 @@ Cell: nv | 0.23s
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
-| N/A   30C    P0             87W /  350W |       0MiB /  46068MiB |     18%      Default |
+| N/A   35C    P0            121W /  350W |       0MiB /  46068MiB |    100%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 
@@ -3920,7 +3920,7 @@ Cell: nv | 0.23s
 ▼ output
  ▶ uv-logs
  | 
-Cell: benchmark | 7.30s
+Cell: benchmark | 7.31s
  | 
 
 Raw
@@ -3982,29 +3982,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     460.509us      2386.43%     460.509us     460.509us             1  
-                                            torch_eager        10.46%     229.787us        99.65%       2.189ms       2.189ms       0.000us         0.00%      21.633us      21.633us             1  
-                                               aten::to         0.59%      12.913us        79.38%       1.743ms     290.578us       0.000us         0.00%      14.272us       2.379us             6  
-                                         aten::_to_copy         1.99%      43.750us        78.79%       1.731ms     288.426us       0.000us         0.00%      14.272us       2.379us             6  
-                                            aten::copy_         2.89%      63.562us        74.16%       1.629ms     271.469us      11.936us        61.85%      14.272us       2.379us             6  
-                                           aten::conv1d         0.44%       9.671us         7.66%     168.306us      56.102us       0.000us         0.00%       7.361us       2.454us             3  
-                                      aten::convolution         0.72%      15.890us         7.22%     158.635us      52.878us       0.000us         0.00%       7.361us       2.454us             3  
-                                     aten::_convolution         1.69%      37.102us         6.50%     142.745us      47.582us       0.000us         0.00%       7.361us       2.454us             3  
-                                aten::_conv_depthwise2d         1.60%      35.230us         3.77%      82.773us      27.591us       7.361us        38.15%       7.361us       2.454us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.361us        38.15%       7.361us       2.454us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.272us        32.50%       6.272us       2.091us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.664us        29.35%       5.664us       1.888us             3  
-                                Activity Buffer Request        68.26%       1.499ms        68.26%       1.499ms       1.499ms       2.336us        12.11%       2.336us       2.336us             1  
-                                    aten::empty_strided         2.64%      57.992us         2.64%      57.992us       9.665us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         4.12%      90.443us         4.12%      90.443us      10.049us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         1.47%      32.392us         1.88%      41.212us       4.579us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.64%      14.011us         0.64%      14.011us       0.934us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.55%      12.120us         0.55%      12.120us       4.040us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.50%      10.961us         0.50%      10.961us       3.654us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.43%       9.410us         0.51%      11.220us       3.740us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     467.679us      2403.78%     467.679us     467.679us             1  
+                                            torch_eager        10.81%     233.756us        99.62%       2.155ms       2.155ms       0.000us         0.00%      21.792us      21.792us             1  
+                                               aten::to         0.55%      11.919us        78.64%       1.701ms     283.539us       0.000us         0.00%      14.304us       2.384us             6  
+                                         aten::_to_copy         2.04%      44.223us        78.09%       1.689ms     281.553us       0.000us         0.00%      14.304us       2.384us             6  
+                                            aten::copy_         3.07%      66.360us        73.27%       1.585ms     264.169us      11.968us        61.51%      14.304us       2.384us             6  
+                                           aten::conv1d         0.40%       8.600us         7.96%     172.134us      57.378us       0.000us         0.00%       7.488us       2.496us             3  
+                                      aten::convolution         0.76%      16.533us         7.56%     163.534us      54.511us       0.000us         0.00%       7.488us       2.496us             3  
+                                     aten::_convolution         1.65%      35.660us         6.80%     147.001us      49.000us       0.000us         0.00%       7.488us       2.496us             3  
+                                aten::_conv_depthwise2d         1.78%      38.520us         4.15%      89.871us      29.957us       7.488us        38.49%       7.488us       2.496us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.488us        38.49%       7.488us       2.496us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.272us        32.24%       6.272us       2.091us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.696us        29.28%       5.696us       1.899us             3  
+                                Activity Buffer Request        67.06%       1.451ms        67.06%       1.451ms       1.451ms       2.336us        12.01%       2.336us       2.336us             1  
+                                    aten::empty_strided         2.78%      60.080us         2.78%      60.080us      10.013us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         4.39%      95.004us         4.39%      95.004us      10.556us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.49%      32.209us         1.86%      40.319us       4.480us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.61%      13.180us         0.61%      13.180us       0.879us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.57%      12.310us         0.57%      12.310us       4.103us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.56%      12.130us         0.56%      12.130us       4.043us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.40%       8.601us         0.48%      10.281us       3.427us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.196ms
-Self CUDA time total: 19.297us
+Self CPU time total: 2.163ms
+Self CUDA time total: 19.456us
 
 
 
@@ -4014,29 +4014,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     350.557us      1795.89%     350.557us     350.557us             1  
-                                            torch_eager         6.82%     130.236us        99.71%       1.905ms       1.905ms       0.000us         0.00%      21.632us      21.632us             1  
-                                               aten::to         0.35%       6.597us        84.97%       1.623ms     270.580us       0.000us         0.00%      13.728us       2.288us             6  
-                                         aten::_to_copy         1.27%      24.323us        84.63%       1.617ms     269.481us       0.000us         0.00%      13.728us       2.288us             6  
-                                            aten::copy_         2.68%      51.130us        81.67%       1.560ms     260.072us      11.616us        59.51%      13.728us       2.288us             6  
-                                           aten::conv1d         0.33%       6.400us         6.43%     122.914us      40.971us       0.000us         0.00%       7.904us       2.635us             3  
-                                      aten::convolution         0.52%       9.901us         6.10%     116.514us      38.838us       0.000us         0.00%       7.904us       2.635us             3  
-                                     aten::_convolution         1.28%      24.410us         5.58%     106.613us      35.538us       0.000us         0.00%       7.904us       2.635us             3  
-                                aten::_conv_depthwise2d         1.25%      23.932us         3.35%      63.983us      21.328us       7.904us        40.49%       7.904us       2.635us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us        40.49%       7.904us       2.635us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     345.183us      1768.26%     345.183us     345.183us             1  
+                                            torch_eager         6.79%     129.712us        99.69%       1.904ms       1.904ms       0.000us         0.00%      21.633us      21.633us             1  
+                                               aten::to         0.35%       6.752us        84.92%       1.622ms     270.359us       0.000us         0.00%      13.697us       2.283us             6  
+                                         aten::_to_copy         1.29%      24.629us        84.57%       1.615ms     269.234us       0.000us         0.00%      13.697us       2.283us             6  
+                                            aten::copy_         2.57%      49.181us        81.04%       1.548ms     258.016us      11.585us        59.35%      13.697us       2.283us             6  
+                                           aten::conv1d         0.34%       6.520us         6.51%     124.283us      41.428us       0.000us         0.00%       7.936us       2.645us             3  
+                                      aten::convolution         0.52%       9.860us         6.16%     117.763us      39.254us       0.000us         0.00%       7.936us       2.645us             3  
+                                     aten::_convolution         1.28%      24.503us         5.65%     107.903us      35.968us       0.000us         0.00%       7.936us       2.645us             3  
+                                aten::_conv_depthwise2d         1.17%      22.379us         3.49%      66.751us      22.250us       7.936us        40.65%       7.936us       2.645us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.936us        40.65%       7.936us       2.645us             3  
 void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.080us        31.15%       6.080us       2.027us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.536us        28.36%       5.536us       1.845us             3  
-                                Activity Buffer Request        76.19%       1.456ms        76.19%       1.456ms       1.456ms       2.112us        10.82%       2.112us       2.112us             1  
-                                    aten::empty_strided         1.68%      32.131us         1.68%      32.131us       5.355us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.93%      75.003us         3.93%      75.003us       8.334us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.97%      18.540us         1.29%      24.620us       2.736us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.51%       9.711us         0.51%       9.711us       0.647us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.51%       9.650us         0.51%       9.650us       3.217us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.47%       9.000us         0.47%       9.000us       3.000us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.37%       7.100us         0.45%       8.560us       2.853us       0.000us         0.00%       0.000us       0.000us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.505us        28.20%       5.505us       1.835us             3  
+                                Activity Buffer Request        75.86%       1.449ms        75.86%       1.449ms       1.449ms       2.112us        10.82%       2.112us       2.112us             1  
+                                    aten::empty_strided         2.23%      42.682us         2.23%      42.682us       7.114us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.91%      74.643us         3.91%      74.643us       8.294us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.96%      18.249us         1.26%      24.119us       2.680us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.50%       9.600us         0.50%       9.600us       0.640us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.51%       9.750us         0.51%       9.750us       3.250us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.51%       9.801us         0.51%       9.801us       3.267us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.31%       5.930us         0.39%       7.450us       2.483us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.911ms
-Self CUDA time total: 19.520us
+Self CPU time total: 1.910ms
+Self CUDA time total: 19.521us
 
 
 
@@ -4046,29 +4046,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     379.390us      2047.55%     379.390us     379.390us             1  
-                                            torch_eager         8.20%     159.835us        99.65%       1.942ms       1.942ms       0.000us         0.00%      20.449us      20.449us             1  
-                                               aten::to         0.37%       7.179us        83.32%       1.624ms     270.686us       0.000us         0.00%      13.536us       2.256us             6  
-                                         aten::_to_copy         1.40%      27.213us        82.96%       1.617ms     269.489us       0.000us         0.00%      13.536us       2.256us             6  
-                                            aten::copy_         2.62%      51.160us        79.92%       1.558ms     259.635us      11.616us        62.69%      13.536us       2.256us             6  
-                                           aten::conv1d         0.34%       6.560us         6.49%     126.453us      42.151us       0.000us         0.00%       6.913us       2.304us             3  
-                                      aten::convolution         0.57%      11.119us         6.15%     119.893us      39.964us       0.000us         0.00%       6.913us       2.304us             3  
-                                     aten::_convolution         1.29%      25.191us         5.58%     108.774us      36.258us       0.000us         0.00%       6.913us       2.304us             3  
-                                aten::_conv_depthwise2d         1.16%      22.580us         3.36%      65.502us      21.834us       6.913us        37.31%       6.913us       2.304us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       6.913us        37.31%       6.913us       2.304us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        31.95%       5.920us       1.973us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.696us        30.74%       5.696us       1.899us             3  
-                                Activity Buffer Request        74.82%       1.458ms        74.82%       1.458ms       1.458ms       1.920us        10.36%       1.920us       1.920us             1  
-                                    aten::empty_strided         1.64%      31.911us         1.64%      31.911us       5.319us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         3.59%      70.043us         3.59%      70.043us       7.783us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         1.01%      19.612us         1.35%      26.392us       2.932us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.55%      10.750us         0.55%      10.750us       0.717us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.62%      12.182us         0.62%      12.182us       4.061us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.46%       8.910us         0.46%       8.910us       2.970us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.35%       6.890us         0.42%       8.260us       2.753us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     350.236us      1887.05%     350.236us     350.236us             1  
+                                            torch_eager         6.95%     131.684us        99.72%       1.889ms       1.889ms       0.000us         0.00%      20.481us      20.481us             1  
+                                               aten::to         0.32%       5.979us        84.80%       1.606ms     267.646us       0.000us         0.00%      13.570us       2.262us             6  
+                                         aten::_to_copy         1.26%      23.830us        84.48%       1.600ms     266.649us       0.000us         0.00%      13.570us       2.262us             6  
+                                            aten::copy_         3.12%      59.102us        81.66%       1.546ms     257.734us      11.649us        62.76%      13.570us       2.262us             6  
+                                           aten::conv1d         0.33%       6.189us         6.49%     122.822us      40.941us       0.000us         0.00%       6.911us       2.304us             3  
+                                      aten::convolution         0.53%      10.011us         6.16%     116.633us      38.878us       0.000us         0.00%       6.911us       2.304us             3  
+                                     aten::_convolution         1.28%      24.209us         5.63%     106.622us      35.541us       0.000us         0.00%       6.911us       2.304us             3  
+                                aten::_conv_depthwise2d         1.23%      23.239us         3.44%      65.172us      21.724us       6.911us        37.24%       6.911us       2.304us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       6.911us        37.24%       6.911us       2.304us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.921us        31.90%       5.921us       1.974us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.728us        30.86%       5.728us       1.909us             3  
+                                Activity Buffer Request        75.86%       1.437ms        75.86%       1.437ms       1.437ms       1.921us        10.35%       1.921us       1.921us             1  
+                                    aten::empty_strided         1.57%      29.661us         1.57%      29.661us       4.944us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         3.93%      74.492us         3.93%      74.492us       8.277us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.98%      18.470us         1.28%      24.221us       2.691us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.50%       9.492us         0.50%       9.492us       0.633us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.52%       9.761us         0.52%       9.761us       3.254us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.44%       8.371us         0.44%       8.371us       2.790us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.31%       5.870us         0.39%       7.390us       2.463us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.949ms
-Self CUDA time total: 18.529us
+Self CPU time total: 1.894ms
+Self CUDA time total: 18.560us
 
 
 
@@ -4078,29 +4078,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     340.058us      1736.41%     340.058us     340.058us             1  
-                                            torch_eager         6.15%     129.375us        99.74%       2.097ms       2.097ms       0.000us         0.00%      21.760us      21.760us             1  
-                                               aten::to         0.32%       6.700us        86.45%       1.818ms     303.002us       0.000us         0.00%      14.112us       2.352us             6  
-                                         aten::_to_copy         1.17%      24.651us        86.13%       1.811ms     301.886us       0.000us         0.00%      14.112us       2.352us             6  
-                                            aten::copy_         2.42%      50.883us        83.54%       1.757ms     292.785us      11.936us        60.95%      14.112us       2.352us             6  
-                                           aten::conv1d         0.30%       6.290us         5.74%     120.803us      40.268us       0.000us         0.00%       7.648us       2.549us             3  
-                                      aten::convolution         0.48%      10.020us         5.45%     114.513us      38.171us       0.000us         0.00%       7.648us       2.549us             3  
-                                     aten::_convolution         1.15%      24.209us         4.97%     104.493us      34.831us       0.000us         0.00%       7.648us       2.549us             3  
-                                aten::_conv_depthwise2d         1.00%      21.080us         2.93%      61.691us      20.564us       7.648us        39.05%       7.648us       2.549us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.648us        39.05%       7.648us       2.549us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.208us        31.70%       6.208us       2.069us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.728us        29.25%       5.728us       1.909us             3  
-                                Activity Buffer Request        71.15%       1.496ms        71.15%       1.496ms       1.496ms       2.176us        11.11%       2.176us       2.176us             1  
-                                    aten::empty_strided         1.42%      29.951us         1.42%      29.951us       4.992us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.98%     230.807us        10.98%     230.807us      25.645us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.94%      19.863us         1.21%      25.543us       2.838us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.46%       9.630us         0.46%       9.630us       0.642us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.50%      10.541us         0.50%      10.541us       3.514us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.42%       8.810us         0.42%       8.810us       2.937us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.35%       7.411us         0.44%       9.201us       3.067us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     349.473us      1781.39%     349.473us     349.473us             1  
+                                            torch_eager         6.11%     131.525us        99.75%       2.146ms       2.146ms       0.000us         0.00%      21.795us      21.795us             1  
+                                               aten::to         0.31%       6.681us        86.66%       1.864ms     310.738us       0.000us         0.00%      14.148us       2.358us             6  
+                                         aten::_to_copy         1.14%      24.510us        86.35%       1.858ms     309.625us       0.000us         0.00%      14.148us       2.358us             6  
+                                            aten::copy_         2.35%      50.532us        83.71%       1.801ms     300.153us      11.971us        61.02%      14.148us       2.358us             6  
+                                           aten::conv1d         0.29%       6.159us         5.69%     122.482us      40.827us       0.000us         0.00%       7.647us       2.549us             3  
+                                      aten::convolution         0.45%       9.650us         5.41%     116.323us      38.774us       0.000us         0.00%       7.647us       2.549us             3  
+                                     aten::_convolution         1.16%      25.049us         4.96%     106.673us      35.558us       0.000us         0.00%       7.647us       2.549us             3  
+                                aten::_conv_depthwise2d         1.06%      22.843us         3.03%      65.182us      21.727us       7.647us        38.98%       7.647us       2.549us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.647us        38.98%       7.647us       2.549us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.211us        31.66%       6.211us       2.070us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.760us        29.36%       5.760us       1.920us             3  
+                                Activity Buffer Request        68.59%       1.476ms        68.59%       1.476ms       1.476ms       2.177us        11.10%       2.177us       2.177us             1  
+                                    aten::empty_strided         1.50%      32.320us         1.50%      32.320us       5.387us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        13.84%     297.685us        13.84%     297.685us      33.076us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.81%      17.433us         1.07%      22.952us       2.550us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.42%       9.029us         0.42%       9.029us       0.602us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.47%      10.100us         0.47%      10.100us       3.367us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.44%       9.389us         0.44%       9.389us       3.130us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.30%       6.350us         0.36%       7.690us       2.563us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.103ms
-Self CUDA time total: 19.584us
+Self CPU time total: 2.151ms
+Self CUDA time total: 19.618us
 
 
 
@@ -4110,29 +4110,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     339.070us      1381.53%     339.070us     339.070us             1  
-                                            torch_eager         6.44%     132.135us        99.72%       2.045ms       2.045ms       0.000us         0.00%      26.814us      26.814us             1  
-                                               aten::to         0.33%       6.722us        86.08%       1.765ms     294.155us       0.000us         0.00%      15.262us       2.544us             6  
-                                         aten::_to_copy         1.20%      24.702us        85.75%       1.758ms     293.035us       0.000us         0.00%      15.262us       2.544us             6  
-                                            aten::copy_         2.39%      49.030us        83.04%       1.702ms     283.750us      12.991us        52.93%      15.262us       2.544us             6  
-                                           aten::conv1d         0.29%       5.850us         5.78%     118.603us      39.534us       0.000us         0.00%      11.552us       3.851us             3  
-                                      aten::convolution         0.55%      11.220us         5.50%     112.753us      37.584us       0.000us         0.00%      11.552us       3.851us             3  
-                                     aten::_convolution         1.18%      24.170us         4.95%     101.533us      33.844us       0.000us         0.00%      11.552us       3.851us             3  
-                                aten::_conv_depthwise2d         1.08%      22.212us         2.99%      61.273us      20.424us      11.552us        47.07%      11.552us       3.851us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      11.552us        47.07%      11.552us       3.851us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.655us        27.12%       6.655us       2.218us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.336us        25.82%       6.336us       2.112us             3  
-                                Activity Buffer Request        71.25%       1.461ms        71.25%       1.461ms       1.461ms       2.271us         9.25%       2.271us       2.271us             1  
-                                    aten::empty_strided         1.51%      31.010us         1.51%      31.010us       5.168us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.41%     213.527us        10.41%     213.527us      23.725us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.89%      18.350us         1.15%      23.660us       2.629us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.45%       9.131us         0.45%       9.131us       0.609us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.46%       9.481us         0.46%       9.481us       3.160us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.43%       8.760us         0.43%       8.760us       2.920us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.27%       5.520us         0.33%       6.850us       2.283us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     348.958us      1429.22%     348.958us     348.958us             1  
+                                            torch_eager         6.64%     139.433us        99.75%       2.094ms       2.094ms       0.000us         0.00%      26.656us      26.656us             1  
+                                               aten::to         0.30%       6.349us        85.85%       1.803ms     300.420us       0.000us         0.00%      15.136us       2.523us             6  
+                                         aten::_to_copy         1.17%      24.664us        85.55%       1.796ms     299.362us       0.000us         0.00%      15.136us       2.523us             6  
+                                            aten::copy_         2.46%      51.670us        82.81%       1.739ms     289.779us      12.896us        52.82%      15.136us       2.523us             6  
+                                           aten::conv1d         0.30%       6.230us         5.89%     123.663us      41.221us       0.000us         0.00%      11.520us       3.840us             3  
+                                      aten::convolution         0.49%      10.211us         5.59%     117.433us      39.144us       0.000us         0.00%      11.520us       3.840us             3  
+                                     aten::_convolution         1.21%      25.350us         5.11%     107.222us      35.741us       0.000us         0.00%      11.520us       3.840us             3  
+                                aten::_conv_depthwise2d         1.07%      22.551us         3.09%      64.932us      21.644us      11.520us        47.18%      11.520us       3.840us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      11.520us        47.18%      11.520us       3.840us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.592us        27.00%       6.592us       2.197us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.304us        25.82%       6.304us       2.101us             3  
+                                Activity Buffer Request        67.91%       1.426ms        67.91%       1.426ms       1.426ms       2.240us         9.17%       2.240us       2.240us             1  
+                                    aten::empty_strided         1.56%      32.829us         1.56%      32.829us       5.472us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        13.56%     284.686us        13.56%     284.686us      31.632us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.89%      18.631us         1.15%      24.041us       2.671us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.44%       9.180us         0.44%       9.180us       0.612us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.47%       9.871us         0.47%       9.871us       3.290us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.43%       8.930us         0.43%       8.930us       2.977us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.31%       6.480us         0.38%       7.900us       2.633us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.050ms
-Self CUDA time total: 24.543us
+Self CPU time total: 2.100ms
+Self CUDA time total: 24.416us
 
 
 
@@ -4142,29 +4142,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     339.129us      1305.15%     339.129us     339.129us             1  
-                                            torch_eager         6.29%     128.886us        99.74%       2.043ms       2.043ms       0.000us         0.00%      28.224us      28.224us             1  
-                                               aten::to         0.34%       6.902us        86.10%       1.763ms     293.882us       0.000us         0.00%      15.168us       2.528us             6  
-                                         aten::_to_copy         1.23%      25.190us        85.76%       1.756ms     292.731us       0.000us         0.00%      15.168us       2.528us             6  
-                                            aten::copy_         2.41%      49.270us        83.08%       1.701ms     283.571us      12.928us        49.75%      15.168us       2.528us             6  
-                                           aten::conv1d         0.31%       6.370us         5.92%     121.333us      40.444us       0.000us         0.00%      13.056us       4.352us             3  
-                                      aten::convolution         0.49%      10.120us         5.61%     114.963us      38.321us       0.000us         0.00%      13.056us       4.352us             3  
-                                     aten::_convolution         1.25%      25.500us         5.12%     104.843us      34.948us       0.000us         0.00%      13.056us       4.352us             3  
-                                aten::_conv_depthwise2d         1.08%      22.212us         3.04%      62.243us      20.748us      13.056us        50.25%      13.056us       4.352us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      13.056us        50.25%      13.056us       4.352us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.592us        25.37%       6.592us       2.197us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.336us        24.38%       6.336us       2.112us             3  
-                                Activity Buffer Request        71.41%       1.463ms        71.41%       1.463ms       1.463ms       2.240us         8.62%       2.240us       2.240us             1  
-                                    aten::empty_strided         1.45%      29.770us         1.45%      29.770us       4.962us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        10.25%     209.968us        10.25%     209.968us      23.330us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.92%      18.870us         1.21%      24.780us       2.753us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.47%       9.601us         0.47%       9.601us       0.640us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.51%      10.510us         0.51%      10.510us       3.503us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.45%       9.181us         0.45%       9.181us       3.060us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.32%       6.640us         0.40%       8.140us       2.713us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     350.300us      1353.19%     350.300us     350.300us             1  
+                                            torch_eager         6.16%     129.673us        99.74%       2.100ms       2.100ms       0.000us         0.00%      28.127us      28.127us             1  
+                                               aten::to         0.30%       6.400us        86.28%       1.817ms     302.813us       0.000us         0.00%      15.135us       2.522us             6  
+                                         aten::_to_copy         1.17%      24.572us        85.97%       1.810ms     301.746us       0.000us         0.00%      15.135us       2.522us             6  
+                                            aten::copy_         2.32%      48.831us        83.30%       1.754ms     292.358us      12.895us        49.81%      15.135us       2.522us             6  
+                                           aten::conv1d         0.30%       6.370us         5.91%     124.553us      41.518us       0.000us         0.00%      12.992us       4.331us             3  
+                                      aten::convolution         0.48%      10.021us         5.61%     118.183us      39.394us       0.000us         0.00%      12.992us       4.331us             3  
+                                     aten::_convolution         1.13%      23.790us         5.14%     108.162us      36.054us       0.000us         0.00%      12.992us       4.331us             3  
+                                aten::_conv_depthwise2d         1.15%      24.221us         3.16%      66.582us      22.194us      12.992us        50.19%      12.992us       4.331us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      12.992us        50.19%      12.992us       4.331us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.592us        25.46%       6.592us       2.197us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       6.303us        24.35%       6.303us       2.101us             3  
+                                Activity Buffer Request        68.95%       1.452ms        68.95%       1.452ms       1.452ms       2.240us         8.65%       2.240us       2.240us             1  
+                                    aten::empty_strided         1.51%      31.759us         1.51%      31.759us       5.293us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        13.13%     276.435us        13.13%     276.435us      30.715us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.91%      19.219us         1.21%      25.491us       2.832us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.48%      10.094us         0.48%      10.094us       0.673us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.47%       9.932us         0.47%       9.932us       3.311us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.44%       9.289us         0.44%       9.289us       3.096us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.33%       6.860us         0.40%       8.350us       2.783us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.048ms
-Self CUDA time total: 25.984us
+Self CPU time total: 2.106ms
+Self CUDA time total: 25.887us
 
 
 
@@ -4174,29 +4174,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     362.270us       942.63%     362.270us     362.270us             1  
-                                            torch_eager         7.50%     163.876us        99.75%       2.180ms       2.180ms       0.000us         0.00%      40.993us      40.993us             1  
-                                           aten::conv1d         0.34%       7.388us         5.94%     129.794us      43.265us       0.000us         0.00%      22.464us       7.488us             3  
-                                      aten::convolution         0.56%      12.301us         5.60%     122.406us      40.802us       0.000us         0.00%      22.464us       7.488us             3  
-                                     aten::_convolution         1.18%      25.829us         5.04%     110.105us      36.702us       0.000us         0.00%      22.464us       7.488us             3  
-                                aten::_conv_depthwise2d         1.07%      23.371us         2.94%      64.311us      21.437us      22.464us        58.45%      22.464us       7.488us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.464us        58.45%      22.464us       7.488us             3  
-                                               aten::to         0.36%       7.830us        84.95%       1.856ms     309.406us       0.000us         0.00%      18.529us       3.088us             6  
-                                         aten::_to_copy         1.44%      31.560us        84.59%       1.849ms     308.101us       0.000us         0.00%      18.529us       3.088us             6  
-                                            aten::copy_         2.41%      52.633us        81.64%       1.784ms     297.326us      15.968us        41.55%      18.529us       3.088us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.609us        22.40%       8.609us       2.870us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.359us        19.15%       7.359us       2.453us             3  
-                                Activity Buffer Request        65.39%       1.429ms        65.39%       1.429ms       1.429ms       2.561us         6.66%       2.561us       2.561us             1  
-                                    aten::empty_strided         1.51%      33.091us         1.51%      33.091us       5.515us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        14.87%     325.052us        14.87%     325.052us      36.117us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         1.00%      21.833us         1.21%      26.523us       2.947us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.39%       8.492us         0.39%       8.492us       0.566us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.44%       9.570us         0.44%       9.570us       3.190us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.40%       8.750us         0.40%       8.750us       2.917us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.37%       7.980us         0.45%       9.772us       3.257us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     344.670us       904.34%     344.670us     344.670us             1  
+                                            torch_eager         6.27%     130.413us        99.73%       2.076ms       2.076ms       0.000us         0.00%      40.673us      40.673us             1  
+                                           aten::conv1d         0.29%       6.011us         5.81%     120.902us      40.301us       0.000us         0.00%      22.369us       7.456us             3  
+                                      aten::convolution         0.46%       9.579us         5.52%     114.891us      38.297us       0.000us         0.00%      22.369us       7.456us             3  
+                                     aten::_convolution         1.17%      24.271us         5.06%     105.312us      35.104us       0.000us         0.00%      22.369us       7.456us             3  
+                                aten::_conv_depthwise2d         1.07%      22.281us         3.10%      64.540us      21.513us      22.369us        58.69%      22.369us       7.456us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.369us        58.69%      22.369us       7.456us             3  
+                                               aten::to         0.30%       6.240us        86.29%       1.796ms     299.368us       0.000us         0.00%      18.304us       3.051us             6  
+                                         aten::_to_copy         1.19%      24.702us        86.00%       1.790ms     298.328us       0.000us         0.00%      18.304us       3.051us             6  
+                                            aten::copy_         2.32%      48.271us        83.37%       1.735ms     289.226us      15.744us        41.31%      18.304us       3.051us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.352us        21.91%       8.352us       2.784us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.392us        19.39%       7.392us       2.464us             3  
+                                Activity Buffer Request        69.09%       1.438ms        69.09%       1.438ms       1.438ms       2.560us         6.72%       2.560us       2.560us             1  
+                                    aten::empty_strided         1.44%      29.909us         1.44%      29.909us       4.985us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        13.10%     272.705us        13.10%     272.705us      30.301us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.90%      18.821us         1.17%      24.281us       2.698us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.43%       8.910us         0.43%       8.910us       0.594us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.47%       9.769us         0.47%       9.769us       3.256us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.42%       8.830us         0.42%       8.830us       2.943us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.30%       6.180us         0.36%       7.570us       2.523us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.185ms
-Self CUDA time total: 38.432us
+Self CPU time total: 2.081ms
+Self CUDA time total: 38.113us
 
 
 
@@ -4206,29 +4206,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     339.836us       827.74%     339.836us     339.836us             1  
-                                            torch_eager         6.54%     141.434us        99.74%       2.158ms       2.158ms       0.000us         0.00%      43.648us      43.648us             1  
-                                           aten::conv1d         0.28%       6.090us         5.53%     119.574us      39.858us       0.000us         0.00%      25.407us       8.469us             3  
-                                      aten::convolution         0.46%       9.939us         5.25%     113.484us      37.828us       0.000us         0.00%      25.407us       8.469us             3  
-                                     aten::_convolution         1.12%      24.214us         4.79%     103.545us      34.515us       0.000us         0.00%      25.407us       8.469us             3  
-                                aten::_conv_depthwise2d         1.05%      22.612us         2.94%      63.593us      21.198us      25.407us        61.88%      25.407us       8.469us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      25.407us        61.88%      25.407us       8.469us             3  
-                                               aten::to         0.29%       6.201us        86.38%       1.869ms     311.424us       0.000us         0.00%      18.241us       3.040us             6  
-                                         aten::_to_copy         1.18%      25.424us        86.09%       1.862ms     310.391us       0.000us         0.00%      18.241us       3.040us             6  
-                                            aten::copy_         2.40%      51.862us        83.52%       1.807ms     301.107us      15.649us        38.12%      18.241us       3.040us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.320us        20.27%       8.320us       2.773us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.329us        17.85%       7.329us       2.443us             3  
-                                Activity Buffer Request        68.07%       1.472ms        68.07%       1.472ms       1.472ms       2.592us         6.31%       2.592us       2.592us             1  
-                                    aten::empty_strided         1.40%      30.280us         1.40%      30.280us       5.047us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        14.06%     304.169us        14.06%     304.169us      33.797us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.84%      18.230us         1.08%      23.418us       2.602us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.40%       8.619us         0.40%       8.619us       0.575us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.48%      10.370us         0.48%      10.370us       3.457us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.41%       8.770us         0.41%       8.770us       2.923us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.26%       5.659us         0.32%       6.990us       2.330us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     339.324us       829.06%     339.324us     339.324us             1  
+                                            torch_eager         6.37%     130.712us        99.71%       2.046ms       2.046ms       0.000us         0.00%      43.521us      43.521us             1  
+                                           aten::conv1d         0.29%       5.880us         5.94%     121.953us      40.651us       0.000us         0.00%      25.216us       8.405us             3  
+                                      aten::convolution         0.47%       9.711us         5.66%     116.073us      38.691us       0.000us         0.00%      25.216us       8.405us             3  
+                                     aten::_convolution         1.26%      25.911us         5.18%     106.362us      35.454us       0.000us         0.00%      25.216us       8.405us             3  
+                                aten::_conv_depthwise2d         1.09%      22.379us         3.16%      64.832us      21.611us      25.216us        61.61%      25.216us       8.405us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      25.216us        61.61%      25.216us       8.405us             3  
+                                               aten::to         0.29%       5.930us        86.07%       1.766ms     294.313us       0.000us         0.00%      18.305us       3.051us             6  
+                                         aten::_to_copy         1.14%      23.292us        85.79%       1.760ms     293.325us       0.000us         0.00%      18.305us       3.051us             6  
+                                            aten::copy_         2.44%      50.149us        83.18%       1.707ms     284.430us      15.713us        38.39%      18.305us       3.051us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.320us        20.33%       8.320us       2.773us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.393us        18.06%       7.393us       2.464us             3  
+                                Activity Buffer Request        69.04%       1.416ms        69.04%       1.416ms       1.416ms       2.592us         6.33%       2.592us       2.592us             1  
+                                    aten::empty_strided         1.47%      30.081us         1.47%      30.081us       5.013us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        12.82%     263.078us        12.82%     263.078us      29.231us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.84%      17.161us         1.08%      22.249us       2.472us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.43%       8.738us         0.43%       8.738us       0.583us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.52%      10.621us         0.52%      10.621us       3.540us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.43%       8.801us         0.43%       8.801us       2.934us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.28%       5.670us         0.35%       7.160us       2.387us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.163ms
-Self CUDA time total: 41.056us
+Self CPU time total: 2.052ms
+Self CUDA time total: 40.929us
 
 
 
@@ -4238,29 +4238,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     338.560us       329.80%     338.560us     338.560us             1  
-                                            torch_eager         6.25%     131.427us        99.74%       2.098ms       2.098ms       0.000us         0.00%     108.608us     108.608us             1  
-                                           aten::conv1d         0.29%       6.110us         5.71%     120.083us      40.028us       0.000us         0.00%      70.496us      23.499us             3  
-                                      aten::convolution         0.47%       9.940us         5.42%     113.973us      37.991us       0.000us         0.00%      70.496us      23.499us             3  
-                                     aten::_convolution         1.11%      23.441us         4.94%     104.033us      34.678us       0.000us         0.00%      70.496us      23.499us             3  
-                                aten::_conv_depthwise2d         1.04%      21.830us         2.93%      61.652us      20.551us      70.496us        68.67%      70.496us      23.499us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      70.496us        68.67%      70.496us      23.499us             3  
-                                               aten::to         0.30%       6.292us        86.43%       1.818ms     303.059us       0.000us         0.00%      38.112us       6.352us             6  
-                                         aten::_to_copy         1.17%      24.539us        86.13%       1.812ms     302.010us       0.000us         0.00%      38.112us       6.352us             6  
-                                            aten::copy_         2.47%      51.869us        83.58%       1.758ms     293.072us      32.160us        31.33%      38.112us       6.352us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.568us        17.11%      17.568us       5.856us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.592us        14.21%      14.592us       4.864us             3  
-                                Activity Buffer Request        67.63%       1.423ms        67.63%       1.423ms       1.423ms       5.952us         5.80%       5.952us       5.952us             1  
-                                    aten::empty_strided         1.38%      29.091us         1.38%      29.091us       4.849us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        14.47%     304.542us        14.47%     304.542us      33.838us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.91%      19.049us         1.17%      24.579us       2.731us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.43%       9.070us         0.43%       9.070us       0.605us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.49%      10.351us         0.49%      10.351us       3.450us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.41%       8.621us         0.41%       8.621us       2.874us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.38%       8.050us         0.45%       9.470us       3.157us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     346.587us       339.00%     346.587us     346.587us             1  
+                                            torch_eager         6.07%     126.382us        99.75%       2.075ms       2.075ms       0.000us         0.00%     108.223us     108.223us             1  
+                                           aten::conv1d         0.27%       5.689us         5.84%     121.563us      40.521us       0.000us         0.00%      70.111us      23.370us             3  
+                                      aten::convolution         0.45%       9.432us         5.57%     115.874us      38.625us       0.000us         0.00%      70.111us      23.370us             3  
+                                     aten::_convolution         1.15%      23.992us         5.12%     106.442us      35.481us       0.000us         0.00%      70.111us      23.370us             3  
+                                aten::_conv_depthwise2d         1.13%      23.510us         3.19%      66.451us      22.150us      70.111us        68.58%      70.111us      23.370us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      70.111us        68.58%      70.111us      23.370us             3  
+                                               aten::to         0.33%       6.762us        86.49%       1.799ms     299.916us       0.000us         0.00%      38.112us       6.352us             6  
+                                         aten::_to_copy         1.17%      24.419us        86.17%       1.793ms     298.789us       0.000us         0.00%      38.112us       6.352us             6  
+                                            aten::copy_         2.24%      46.671us        83.54%       1.738ms     289.665us      32.128us        31.42%      38.112us       6.352us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.568us        17.18%      17.568us       5.856us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.560us        14.24%      14.560us       4.853us             3  
+                                Activity Buffer Request        69.85%       1.453ms        69.85%       1.453ms       1.453ms       5.984us         5.85%       5.984us       5.984us             1  
+                                    aten::empty_strided         1.46%      30.330us         1.46%      30.330us       5.055us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        12.58%     261.816us        12.58%     261.816us      29.091us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.87%      18.161us         1.14%      23.661us       2.629us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.44%       9.119us         0.44%       9.119us       0.608us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.48%       9.940us         0.48%       9.940us       3.313us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.45%       9.280us         0.45%       9.280us       3.093us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.27%       5.680us         0.34%       7.119us       2.373us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.104ms
-Self CUDA time total: 102.656us
+Self CPU time total: 2.081ms
+Self CUDA time total: 102.239us
 
 
 
@@ -4270,29 +4270,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     340.578us       301.93%     340.578us     340.578us             1  
-                                            torch_eager         6.29%     133.214us        99.74%       2.113ms       2.113ms       0.000us         0.00%     118.752us     118.752us             1  
-                                           aten::conv1d         0.31%       6.499us         5.66%     119.974us      39.991us       0.000us         0.00%      80.576us      26.859us             3  
-                                      aten::convolution         0.47%       9.880us         5.36%     113.475us      37.825us       0.000us         0.00%      80.576us      26.859us             3  
-                                     aten::_convolution         1.21%      25.730us         4.89%     103.595us      34.532us       0.000us         0.00%      80.576us      26.859us             3  
-                                aten::_conv_depthwise2d         1.01%      21.361us         2.87%      60.832us      20.277us      80.576us        71.43%      80.576us      26.859us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      80.576us        71.43%      80.576us      26.859us             3  
-                                               aten::to         0.33%       7.060us        86.42%       1.831ms     305.149us       0.000us         0.00%      38.176us       6.363us             6  
-                                         aten::_to_copy         1.15%      24.352us        86.09%       1.824ms     303.972us       0.000us         0.00%      38.176us       6.363us             6  
-                                            aten::copy_         2.34%      49.642us        83.57%       1.770ms     295.075us      32.224us        28.57%      38.176us       6.363us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.664us        15.66%      17.664us       5.888us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.560us        12.91%      14.560us       4.853us             3  
-                                Activity Buffer Request        68.62%       1.454ms        68.62%       1.454ms       1.454ms       5.952us         5.28%       5.952us       5.952us             1  
-                                    aten::empty_strided         1.37%      29.031us         1.37%      29.031us       4.838us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        13.59%     287.970us        13.59%     287.970us      31.997us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.89%      18.772us         1.17%      24.871us       2.763us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.45%       9.520us         0.45%       9.520us       0.635us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.46%       9.850us         0.46%       9.850us       3.283us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.41%       8.670us         0.41%       8.670us       2.890us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.32%       6.821us         0.38%       8.112us       2.704us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     340.895us       303.69%     340.895us     340.895us             1  
+                                            torch_eager         5.96%     121.922us        99.72%       2.040ms       2.040ms       0.000us         0.00%     118.204us     118.204us             1  
+                                           aten::conv1d         0.29%       5.851us         5.96%     121.923us      40.641us       0.000us         0.00%      80.190us      26.730us             3  
+                                      aten::convolution         0.47%       9.659us         5.67%     116.072us      38.691us       0.000us         0.00%      80.190us      26.730us             3  
+                                     aten::_convolution         1.15%      23.552us         5.20%     106.413us      35.471us       0.000us         0.00%      80.190us      26.730us             3  
+                                aten::_conv_depthwise2d         1.14%      23.240us         3.13%      64.041us      21.347us      80.190us        71.44%      80.190us      26.730us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      80.190us        71.44%      80.190us      26.730us             3  
+                                               aten::to         0.30%       6.190us        86.45%       1.769ms     294.821us       0.000us         0.00%      38.014us       6.336us             6  
+                                         aten::_to_copy         1.15%      23.531us        86.15%       1.763ms     293.790us       0.000us         0.00%      38.014us       6.336us             6  
+                                            aten::copy_         2.44%      49.841us        83.49%       1.708ms     284.726us      32.062us        28.56%      38.014us       6.336us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      17.503us        15.59%      17.503us       5.834us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      14.559us        12.97%      14.559us       4.853us             3  
+                                Activity Buffer Request        69.71%       1.426ms        69.71%       1.426ms       1.426ms       5.952us         5.30%       5.952us       5.952us             1  
+                                    aten::empty_strided         1.51%      30.850us         1.51%      30.850us       5.142us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        12.43%     254.276us        12.43%     254.276us      28.253us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.92%      18.780us         1.20%      24.600us       2.733us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.47%       9.541us         0.47%       9.541us       0.636us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.49%      10.090us         0.49%      10.090us       3.363us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.42%       8.680us         0.42%       8.680us       2.893us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.34%       6.870us         0.41%       8.300us       2.767us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.119ms
-Self CUDA time total: 112.800us
+Self CPU time total: 2.046ms
+Self CUDA time total: 112.252us
 
 
 
@@ -4302,29 +4302,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         6.32%     133.665us        99.60%       2.106ms       2.106ms       0.000us         0.00%     433.181us     433.181us             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     423.869us       107.93%     423.869us     423.869us             1  
-                                           aten::conv1d         0.30%       6.441us         5.98%     126.475us      42.158us       0.000us         0.00%     252.190us      84.063us             3  
-                                      aten::convolution         0.49%      10.391us         5.68%     120.034us      40.011us       0.000us         0.00%     252.190us      84.063us             3  
-                                     aten::_convolution         1.19%      25.110us         5.19%     109.643us      36.548us       0.000us         0.00%     252.190us      84.063us             3  
-                                aten::_conv_depthwise2d         1.07%      22.550us         3.14%      66.363us      22.121us     252.190us        64.21%     252.190us      84.063us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     252.190us        64.21%     252.190us      84.063us             3  
-                                               aten::to         0.33%       6.989us        85.86%       1.815ms     302.520us       0.000us         0.00%     180.991us      30.165us             6  
-                                         aten::_to_copy         1.18%      24.921us        85.53%       1.808ms     301.355us       0.000us         0.00%     180.991us      30.165us             6  
-                                            aten::copy_         2.39%      50.532us        82.93%       1.753ms     292.204us     140.543us        35.79%     180.991us      30.165us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     100.768us        25.66%     100.768us      33.589us             3  
-                                Activity Buffer Request        67.47%       1.426ms        67.47%       1.426ms       1.426ms      40.448us        10.30%      40.448us      40.448us             1  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      39.775us        10.13%      39.775us      13.258us             3  
-                                    aten::empty_strided         1.42%      29.990us         1.42%      29.990us       4.998us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        14.15%     299.142us        14.15%     299.142us      33.238us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.92%      19.400us         1.21%      25.500us       2.833us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.49%      10.430us         0.49%      10.430us       0.695us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.55%      11.580us         0.55%      11.580us       3.860us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.44%       9.361us         0.44%       9.361us       3.120us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.34%       7.110us         0.42%       8.900us       2.967us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         6.19%     128.263us        98.22%       2.035ms       2.035ms       0.000us         0.00%     432.800us     432.800us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     422.880us       107.61%     422.880us     422.880us             1  
+                                           aten::conv1d         0.31%       6.390us         5.89%     122.123us      40.708us       0.000us         0.00%     250.976us      83.659us             3  
+                                      aten::convolution         0.46%       9.601us         5.59%     115.733us      38.578us       0.000us         0.00%     250.976us      83.659us             3  
+                                     aten::_convolution         1.21%      25.079us         5.12%     106.132us      35.377us       0.000us         0.00%     250.976us      83.659us             3  
+                                aten::_conv_depthwise2d         1.14%      23.570us         3.11%      64.391us      21.464us     250.976us        63.87%     250.976us      83.659us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     250.976us        63.87%     250.976us      83.659us             3  
+                                               aten::to         0.32%       6.560us        84.78%       1.757ms     292.786us       0.000us         0.00%     181.824us      30.304us             6  
+                                         aten::_to_copy         1.18%      24.419us        84.47%       1.750ms     291.693us       0.000us         0.00%     181.824us      30.304us             6  
+                                            aten::copy_         2.44%      50.653us        81.84%       1.696ms     282.633us     141.984us        36.13%     181.824us      30.304us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     102.336us        26.04%     102.336us      34.112us             3  
+                                Activity Buffer Request        68.54%       1.420ms        68.54%       1.420ms       1.420ms      39.840us        10.14%      39.840us      39.840us             1  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      39.648us        10.09%      39.648us      13.216us             3  
+                                    aten::empty_strided         1.44%      29.940us         1.44%      29.940us       4.990us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        11.90%     246.595us        11.90%     246.595us      27.399us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.85%      17.582us         1.11%      22.961us       2.551us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.44%       9.098us         0.44%       9.098us       0.607us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.47%       9.750us         0.47%       9.750us       3.250us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.45%       9.420us         0.45%       9.420us       3.140us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.32%       6.712us         0.39%       8.162us       2.721us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.114ms
-Self CUDA time total: 392.733us
+Self CPU time total: 2.072ms
+Self CUDA time total: 392.960us
 
 
 
@@ -4334,29 +4334,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         6.65%     143.166us        97.03%       2.090ms       2.090ms       0.000us         0.00%     486.301us     486.301us             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     477.853us       106.88%     477.853us     477.853us             1  
-                                           aten::conv1d         0.33%       7.110us         5.88%     126.575us      42.192us       0.000us         0.00%     298.557us      99.519us             3  
-                                      aten::convolution         0.51%      11.062us         5.55%     119.465us      39.822us       0.000us         0.00%     298.557us      99.519us             3  
-                                     aten::_convolution         1.16%      25.071us         5.03%     108.403us      36.134us       0.000us         0.00%     298.557us      99.519us             3  
-                                aten::_conv_depthwise2d         1.05%      22.671us         3.05%      65.592us      21.864us     298.557us        66.78%     298.557us      99.519us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     298.557us        66.78%     298.557us      99.519us             3  
-                                               aten::to         0.33%       7.030us        83.12%       1.790ms     298.407us       0.000us         0.00%     187.744us      31.291us             6  
-                                         aten::_to_copy         1.22%      26.183us        82.80%       1.783ms     297.235us       0.000us         0.00%     187.744us      31.291us             6  
-                                            aten::copy_         2.41%      51.979us        80.11%       1.726ms     287.603us     148.544us        33.22%     187.744us      31.291us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     108.768us        24.33%     108.768us      36.256us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      39.776us         8.90%      39.776us      13.259us             3  
-                                Activity Buffer Request        66.10%       1.424ms        66.10%       1.424ms       1.424ms      39.200us         8.77%      39.200us      39.200us             1  
-                                    aten::empty_strided         1.47%      31.611us         1.47%      31.611us       5.268us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        12.61%     271.569us        12.61%     271.569us      30.174us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.93%      19.971us         1.21%      26.011us       2.890us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.45%       9.711us         0.45%       9.711us       0.647us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.47%      10.061us         0.47%      10.061us       3.354us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.51%      11.040us         0.51%      11.040us       3.680us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.28%       5.950us         0.34%       7.400us       2.467us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         5.97%     128.995us        95.86%       2.073ms       2.073ms       0.000us         0.00%     487.835us     487.835us             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     481.723us       107.54%     481.723us     481.723us             1  
+                                           aten::conv1d         0.29%       6.320us         5.75%     124.323us      41.441us       0.000us         0.00%     300.092us     100.031us             3  
+                                      aten::convolution         0.47%      10.180us         5.46%     118.003us      39.334us       0.000us         0.00%     300.092us     100.031us             3  
+                                     aten::_convolution         1.09%      23.583us         4.99%     107.823us      35.941us       0.000us         0.00%     300.092us     100.031us             3  
+                                aten::_conv_depthwise2d         1.05%      22.771us         3.07%      66.451us      22.150us     300.092us        67.00%     300.092us     100.031us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     300.092us        67.00%     300.092us     100.031us             3  
+                                               aten::to         0.32%       6.900us        82.82%       1.791ms     298.496us       0.000us         0.00%     187.743us      31.290us             6  
+                                         aten::_to_copy         1.13%      24.450us        82.50%       1.784ms     297.346us       0.000us         0.00%     187.743us      31.290us             6  
+                                            aten::copy_         2.37%      51.149us        79.94%       1.729ms     288.123us     147.839us        33.00%     187.743us      31.290us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     107.872us        24.08%     107.872us      35.957us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      39.967us         8.92%      39.967us      13.322us             3  
+                                Activity Buffer Request        67.34%       1.456ms        67.34%       1.456ms       1.456ms      39.904us         8.91%      39.904us      39.904us             1  
+                                    aten::empty_strided         1.43%      30.891us         1.43%      30.891us       5.149us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        11.33%     245.015us        11.33%     245.015us      27.224us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.88%      18.930us         1.15%      24.910us       2.768us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.45%       9.739us         0.45%       9.739us       0.649us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.47%      10.080us         0.47%      10.080us       3.360us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.46%       9.870us         0.46%       9.870us       3.290us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.30%       6.460us         0.36%       7.889us       2.630us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.154ms
-Self CUDA time total: 447.101us
+Self CPU time total: 2.162ms
+Self CUDA time total: 447.931us
 
 
 
@@ -4366,29 +4366,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     355.165us      1897.25%     355.165us     355.165us             1  
-                                            torch_eager        15.24%     136.376us        99.32%     888.600us     888.600us       0.000us         0.00%      20.608us      20.608us             1  
-                                               aten::to         0.80%       7.121us        66.93%     598.831us      99.805us       0.000us         0.00%      13.376us       2.229us             6  
-                                         aten::_to_copy         2.95%      26.380us        66.13%     591.710us      98.618us       0.000us         0.00%      13.376us       2.229us             6  
-                                            aten::copy_         5.90%      52.793us        59.34%     530.948us      88.491us      11.488us        61.37%      13.376us       2.229us             6  
-                                           aten::conv1d         0.68%       6.050us        13.88%     124.163us      41.388us       0.000us         0.00%       7.232us       2.411us             3  
-                                      aten::convolution         1.23%      10.987us        13.20%     118.113us      39.371us       0.000us         0.00%       7.232us       2.411us             3  
-                                     aten::_convolution         2.78%      24.854us        11.97%     107.126us      35.709us       0.000us         0.00%       7.232us       2.411us             3  
-                                aten::_conv_depthwise2d         2.73%      24.470us         7.32%      65.481us      21.827us       7.232us        38.63%       7.232us       2.411us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.232us        38.63%       7.232us       2.411us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.920us        31.62%       5.920us       1.973us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.568us        29.74%       5.568us       1.856us             3  
-                                Activity Buffer Request        26.68%     238.708us        26.68%     238.708us     238.708us       1.888us        10.09%       1.888us       1.888us             1  
-                                    aten::empty_strided         3.84%      34.382us         3.84%      34.382us       5.730us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        29.10%     260.398us        29.10%     260.398us      28.933us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.02%      18.071us         2.57%      22.961us       2.551us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.97%       8.709us         0.97%       8.709us       0.581us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.22%      10.910us         1.22%      10.910us       3.637us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.02%       9.150us         1.02%       9.150us       3.050us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.75%       6.751us         0.92%       8.220us       2.740us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     336.383us      1799.90%     336.383us     336.383us             1  
+                                            torch_eager        14.70%     125.222us        99.35%     846.539us     846.539us       0.000us         0.00%      20.577us      20.577us             1  
+                                               aten::to         0.73%       6.218us        67.41%     574.433us      95.739us       0.000us         0.00%      13.344us       2.224us             6  
+                                         aten::_to_copy         2.79%      23.792us        66.68%     568.215us      94.702us       0.000us         0.00%      13.344us       2.224us             6  
+                                            aten::copy_         6.26%      53.310us        60.38%     514.471us      85.745us      11.456us        61.30%      13.344us       2.224us             6  
+                                           aten::conv1d         0.70%       5.960us        14.03%     119.583us      39.861us       0.000us         0.00%       7.233us       2.411us             3  
+                                      aten::convolution         1.15%       9.760us        13.33%     113.623us      37.874us       0.000us         0.00%       7.233us       2.411us             3  
+                                     aten::_convolution         2.80%      23.881us        12.19%     103.863us      34.621us       0.000us         0.00%       7.233us       2.411us             3  
+                                aten::_conv_depthwise2d         2.66%      22.671us         7.54%      64.252us      21.417us       7.233us        38.70%       7.233us       2.411us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.233us        38.70%       7.233us       2.411us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.856us        31.33%       5.856us       1.952us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.600us        29.96%       5.600us       1.867us             3  
+                                Activity Buffer Request        28.19%     240.205us        28.19%     240.205us     240.205us       1.888us        10.10%       1.888us       1.888us             1  
+                                    aten::empty_strided         3.52%      29.952us         3.52%      29.952us       4.992us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        28.71%     244.666us        28.71%     244.666us      27.185us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.06%      17.560us         2.66%      22.660us       2.518us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.01%       8.590us         1.01%       8.590us       0.573us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.10%       9.390us         1.10%       9.390us       3.130us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.00%       8.481us         1.00%       8.481us       2.827us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.71%       6.030us         0.87%       7.390us       2.463us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 894.710us
-Self CUDA time total: 18.720us
+Self CPU time total: 852.119us
+Self CUDA time total: 18.689us
 
 
 
@@ -4398,29 +4398,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     323.578us      1674.05%     323.578us     323.578us             1  
-                                            torch_eager        14.45%     120.436us        99.39%     828.559us     828.559us       0.000us         0.00%      21.217us      21.217us             1  
-                                               aten::to         0.75%       6.271us        67.77%     564.939us      94.156us       0.000us         0.00%      13.377us       2.230us             6  
-                                         aten::_to_copy         2.76%      22.992us        67.02%     558.668us      93.111us       0.000us         0.00%      13.377us       2.230us             6  
-                                            aten::copy_         5.96%      49.722us        60.74%     506.327us      84.388us      11.489us        59.44%      13.377us       2.230us             6  
-                                           aten::conv1d         0.75%       6.211us        13.83%     115.254us      38.418us       0.000us         0.00%       7.840us       2.613us             3  
-                                      aten::convolution         1.19%       9.930us        13.08%     109.043us      36.348us       0.000us         0.00%       7.840us       2.613us             3  
-                                     aten::_convolution         2.77%      23.131us        11.89%      99.113us      33.038us       0.000us         0.00%       7.840us       2.613us             3  
-                                aten::_conv_depthwise2d         2.53%      21.092us         7.21%      60.132us      20.044us       7.840us        40.56%       7.840us       2.613us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.840us        40.56%       7.840us       2.613us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.857us        30.30%       5.857us       1.952us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.632us        29.14%       5.632us       1.877us             3  
-                                Activity Buffer Request        27.26%     227.207us        27.26%     227.207us     227.207us       1.888us         9.77%       1.888us       1.888us             1  
-                                    aten::empty_strided         3.52%      29.349us         3.52%      29.349us       4.891us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        29.92%     249.418us        29.92%     249.418us      27.713us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.13%      17.749us         2.80%      23.370us       2.597us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.11%       9.261us         1.11%       9.261us       0.617us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.16%       9.660us         1.16%       9.660us       3.220us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.12%       9.360us         1.12%       9.360us       3.120us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.70%       5.810us         0.88%       7.370us       2.457us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     337.210us      1736.14%     337.210us     337.210us             1  
+                                            torch_eager        14.23%     122.135us        99.33%     852.339us     852.339us       0.000us         0.00%      21.279us      21.279us             1  
+                                               aten::to         0.69%       5.930us        67.81%     581.931us      96.989us       0.000us         0.00%      13.375us       2.229us             6  
+                                         aten::_to_copy         2.77%      23.810us        67.12%     576.001us      96.000us       0.000us         0.00%      13.375us       2.229us             6  
+                                            aten::copy_         5.94%      51.010us        60.80%     521.760us      86.960us      11.519us        59.31%      13.375us       2.229us             6  
+                                           aten::conv1d         0.66%       5.690us        14.16%     121.503us      40.501us       0.000us         0.00%       7.904us       2.635us             3  
+                                      aten::convolution         1.11%       9.501us        13.50%     115.813us      38.604us       0.000us         0.00%       7.904us       2.635us             3  
+                                     aten::_convolution         3.02%      25.902us        12.39%     106.312us      35.437us       0.000us         0.00%       7.904us       2.635us             3  
+                                aten::_conv_depthwise2d         2.72%      23.320us         7.45%      63.940us      21.313us       7.904us        40.69%       7.904us       2.635us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us        40.69%       7.904us       2.635us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       5.855us        30.14%       5.855us       1.952us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.664us        29.16%       5.664us       1.888us             3  
+                                Activity Buffer Request        28.95%     248.446us        28.95%     248.446us     248.446us       1.856us         9.56%       1.856us       1.856us             1  
+                                    aten::empty_strided         3.55%      30.431us         3.55%      30.431us       5.072us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        28.34%     243.224us        28.34%     243.224us      27.025us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         1.98%      16.978us         2.58%      22.140us       2.460us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.00%       8.573us         1.00%       8.573us       0.572us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.13%       9.660us         1.13%       9.660us       3.220us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.17%      10.040us         1.17%      10.040us       3.347us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.78%       6.699us         0.93%       7.990us       2.663us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 833.619us
-Self CUDA time total: 19.329us
+Self CPU time total: 858.129us
+Self CUDA time total: 19.423us
 
 
 
@@ -4430,29 +4430,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     326.394us      1677.60%     326.394us     326.394us             1  
-                                            torch_eager        14.78%     122.914us        99.34%     825.919us     825.919us       0.000us         0.00%      21.632us      21.632us             1  
-                                               aten::to         0.79%       6.552us        67.16%     558.381us      93.064us       0.000us         0.00%      14.368us       2.395us             6  
-                                         aten::_to_copy         2.94%      24.430us        66.37%     551.829us      91.971us       0.000us         0.00%      14.368us       2.395us             6  
-                                            aten::copy_         5.83%      48.462us        59.95%     498.427us      83.071us      12.192us        62.66%      14.368us       2.395us             6  
-                                           aten::conv1d         0.71%       5.939us        14.00%     116.404us      38.801us       0.000us         0.00%       7.264us       2.421us             3  
-                                      aten::convolution         1.18%       9.811us        13.29%     110.465us      36.822us       0.000us         0.00%       7.264us       2.421us             3  
-                                     aten::_convolution         2.85%      23.732us        12.11%     100.654us      33.551us       0.000us         0.00%       7.264us       2.421us             3  
-                                aten::_conv_depthwise2d         2.52%      20.910us         7.24%      60.232us      20.077us       7.264us        37.34%       7.264us       2.421us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.264us        37.34%       7.264us       2.421us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.304us        32.40%       6.304us       2.101us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.888us        30.26%       5.888us       1.963us             3  
-                                Activity Buffer Request        26.68%     221.788us        26.68%     221.788us     221.788us       2.176us        11.18%       2.176us       2.176us             1  
-                                    aten::empty_strided         3.48%      28.972us         3.48%      28.972us       4.829us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        30.05%     249.819us        30.05%     249.819us      27.758us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.04%      16.929us         2.67%      22.200us       2.467us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.07%       8.901us         1.07%       8.901us       0.593us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.15%       9.570us         1.15%       9.570us       3.190us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.98%       8.110us         0.98%       8.110us       2.703us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.86%       7.190us         1.02%       8.500us       2.833us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     340.632us      1762.47%     340.632us     340.632us             1  
+                                            torch_eager        14.22%     122.814us        99.35%     857.879us     857.879us       0.000us         0.00%      21.503us      21.503us             1  
+                                               aten::to         0.71%       6.160us        68.06%     587.732us      97.955us       0.000us         0.00%      14.304us       2.384us             6  
+                                         aten::_to_copy         2.69%      23.228us        67.35%     581.572us      96.929us       0.000us         0.00%      14.304us       2.384us             6  
+                                            aten::copy_         5.95%      51.401us        60.88%     525.681us      87.614us      12.128us        62.75%      14.304us       2.384us             6  
+                                           aten::conv1d         0.72%       6.190us        13.86%     119.652us      39.884us       0.000us         0.00%       7.199us       2.400us             3  
+                                      aten::convolution         1.11%       9.620us        13.14%     113.462us      37.821us       0.000us         0.00%       7.199us       2.400us             3  
+                                     aten::_convolution         2.71%      23.420us        12.03%     103.842us      34.614us       0.000us         0.00%       7.199us       2.400us             3  
+                                aten::_conv_depthwise2d         2.67%      23.041us         7.39%      63.831us      21.277us       7.199us        37.25%       7.199us       2.400us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.199us        37.25%       7.199us       2.400us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.272us        32.45%       6.272us       2.091us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.856us        30.30%       5.856us       1.952us             3  
+                                Activity Buffer Request        29.60%     255.626us        29.60%     255.626us     255.626us       2.176us        11.26%       2.176us       2.176us             1  
+                                    aten::empty_strided         3.78%      32.663us         3.78%      32.663us       5.444us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        27.93%     241.174us        27.93%     241.174us      26.797us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.07%      17.891us         2.69%      23.211us       2.579us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.04%       8.951us         1.04%       8.951us       0.597us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.14%       9.880us         1.14%       9.880us       3.293us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.97%       8.390us         0.97%       8.390us       2.797us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.77%       6.630us         0.93%       8.000us       2.667us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 831.399us
-Self CUDA time total: 19.456us
+Self CPU time total: 863.509us
+Self CUDA time total: 19.327us
 
 
 
@@ -4462,29 +4462,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     356.696us      1774.96%     356.696us     356.696us             1  
-                                            torch_eager        13.86%     123.804us        99.36%     887.440us     887.440us       0.000us         0.00%      22.272us      22.272us             1  
-                                               aten::to         0.71%       6.320us        66.62%     595.061us      99.177us       0.000us         0.00%      14.368us       2.395us             6  
-                                         aten::_to_copy         2.82%      25.151us        65.92%     588.741us      98.124us       0.000us         0.00%      14.368us       2.395us             6  
-                                            aten::copy_         5.73%      51.172us        59.67%     532.958us      88.826us      12.192us        60.67%      14.368us       2.395us             6  
-                                           aten::conv1d         0.70%       6.210us        15.70%     140.195us      46.732us       0.000us         0.00%       7.904us       2.635us             3  
-                                      aten::convolution         1.11%       9.881us        15.00%     133.985us      44.662us       0.000us         0.00%       7.904us       2.635us             3  
-                                     aten::_convolution         2.74%      24.510us        13.89%     124.104us      41.368us       0.000us         0.00%       7.904us       2.635us             3  
-                                aten::_conv_depthwise2d         2.70%      24.090us         9.26%      82.742us      27.581us       7.904us        39.33%       7.904us       2.635us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us        39.33%       7.904us       2.635us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.240us        31.05%       6.240us       2.080us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.952us        29.62%       5.952us       1.984us             3  
-                                Activity Buffer Request        28.94%     258.459us        28.94%     258.459us     258.459us       2.176us        10.83%       2.176us       2.176us             1  
-                                    aten::empty_strided         3.43%      30.632us         3.43%      30.632us       5.105us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        29.46%     263.129us        29.46%     263.129us      29.237us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         1.97%      17.620us         2.61%      23.310us       2.590us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.07%       9.580us         1.07%       9.580us       0.639us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.09%       9.720us         1.09%       9.720us       3.240us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.02%       9.130us         1.02%       9.130us       3.043us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.75%       6.702us         0.94%       8.422us       2.807us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     334.363us      1661.27%     334.363us     334.363us             1  
+                                            torch_eager        14.65%     121.213us        99.39%     822.628us     822.628us       0.000us         0.00%      22.271us      22.271us             1  
+                                               aten::to         0.73%       6.022us        66.87%     553.441us      92.240us       0.000us         0.00%      14.239us       2.373us             6  
+                                         aten::_to_copy         2.76%      22.839us        66.14%     547.419us      91.237us       0.000us         0.00%      14.239us       2.373us             6  
+                                            aten::copy_         6.10%      50.480us        59.81%     495.040us      82.507us      12.095us        60.09%      14.239us       2.373us             6  
+                                           aten::conv1d         0.71%       5.911us        14.57%     120.603us      40.201us       0.000us         0.00%       8.032us       2.677us             3  
+                                      aten::convolution         1.15%       9.530us        13.86%     114.692us      38.231us       0.000us         0.00%       8.032us       2.677us             3  
+                                     aten::_convolution         2.90%      23.998us        12.71%     105.162us      35.054us       0.000us         0.00%       8.032us       2.677us             3  
+                                aten::_conv_depthwise2d         2.69%      22.281us         7.85%      64.952us      21.651us       8.032us        39.91%       8.032us       2.677us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us       8.032us        39.91%       8.032us       2.677us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       6.207us        30.84%       6.207us       2.069us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       5.888us        29.25%       5.888us       1.963us             3  
+                                Activity Buffer Request        27.45%     227.155us        27.45%     227.155us     227.155us       2.144us        10.65%       2.144us       2.144us             1  
+                                    aten::empty_strided         3.57%      29.540us         3.57%      29.540us       4.923us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        29.06%     240.556us        29.06%     240.556us      26.728us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.13%      17.627us         2.77%      22.910us       2.546us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.09%       9.014us         1.09%       9.014us       0.601us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.17%       9.710us         1.17%       9.710us       3.237us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.19%       9.810us         1.19%       9.810us       3.270us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.72%       5.931us         0.90%       7.422us       2.474us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 893.171us
-Self CUDA time total: 20.096us
+Self CPU time total: 827.669us
+Self CUDA time total: 20.127us
 
 
 
@@ -4494,29 +4494,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     332.730us       926.72%     332.730us     332.730us             1  
-                                            torch_eager        14.27%     126.064us        99.42%     878.341us     878.341us       0.000us         0.00%      38.496us      38.496us             1  
-                                           aten::conv1d         0.64%       5.671us        13.39%     118.255us      39.418us       0.000us         0.00%      20.096us       6.699us             3  
-                                      aten::convolution         1.11%       9.840us        12.74%     112.584us      37.528us       0.000us         0.00%      20.096us       6.699us             3  
-                                     aten::_convolution         2.79%      24.681us        11.63%     102.744us      34.248us       0.000us         0.00%      20.096us       6.699us             3  
-                                aten::_conv_depthwise2d         2.42%      21.390us         7.02%      62.061us      20.687us      20.096us        55.97%      20.096us       6.699us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      20.096us        55.97%      20.096us       6.699us             3  
-                                               aten::to         0.72%       6.320us        68.61%     606.182us     101.030us       0.000us         0.00%      18.400us       3.067us             6  
-                                         aten::_to_copy         2.82%      24.900us        67.90%     599.862us      99.977us       0.000us         0.00%      18.400us       3.067us             6  
-                                            aten::copy_         5.62%      49.645us        61.77%     545.702us      90.950us      15.808us        44.03%      18.400us       3.067us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.448us        23.53%       8.448us       2.816us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.360us        20.50%       7.360us       2.453us             3  
-                                Activity Buffer Request        29.42%     259.919us        29.42%     259.919us     259.919us       2.592us         7.22%       2.592us       2.592us             1  
-                                    aten::empty_strided         3.31%      29.260us         3.31%      29.260us       4.877us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        29.15%     257.559us        29.15%     257.559us      28.618us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.02%      17.842us         2.68%      23.662us       2.629us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.05%       9.271us         1.05%       9.271us       0.618us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.19%      10.540us         1.19%      10.540us       3.513us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.99%       8.710us         0.99%       8.710us       2.903us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.65%       5.719us         0.80%       7.050us       2.350us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     335.875us       933.82%     335.875us     335.875us             1  
+                                            torch_eager        14.86%     122.213us        99.37%     816.978us     816.978us       0.000us         0.00%      38.560us      38.560us             1  
+                                           aten::conv1d         0.73%       6.020us        14.48%     119.072us      39.691us       0.000us         0.00%      20.064us       6.688us             3  
+                                      aten::convolution         1.17%       9.589us        13.75%     113.052us      37.684us       0.000us         0.00%      20.064us       6.688us             3  
+                                     aten::_convolution         2.85%      23.419us        12.58%     103.463us      34.488us       0.000us         0.00%      20.064us       6.688us             3  
+                                aten::_conv_depthwise2d         2.73%      22.441us         7.81%      64.191us      21.397us      20.064us        55.78%      20.064us       6.688us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      20.064us        55.78%      20.064us       6.688us             3  
+                                               aten::to         0.74%       6.089us        66.65%     547.971us      91.328us       0.000us         0.00%      18.496us       3.083us             6  
+                                         aten::_to_copy         2.81%      23.090us        65.91%     541.882us      90.314us       0.000us         0.00%      18.496us       3.083us             6  
+                                            aten::copy_         6.02%      49.484us        59.30%     487.602us      81.267us      15.904us        44.22%      18.496us       3.083us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.576us        23.84%       8.576us       2.859us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.328us        20.37%       7.328us       2.443us             3  
+                                Activity Buffer Request        27.35%     224.865us        27.35%     224.865us     224.865us       2.592us         7.21%       2.592us       2.592us             1  
+                                    aten::empty_strided         3.79%      31.190us         3.79%      31.190us       5.198us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        28.67%     235.693us        28.67%     235.693us      26.188us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.17%      17.820us         2.82%      23.212us       2.579us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.10%       9.012us         1.10%       9.012us       0.601us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.17%       9.620us         1.17%       9.620us       3.207us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.18%       9.690us         1.18%       9.690us       3.230us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.68%       5.572us         0.85%       6.952us       2.317us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 883.481us
-Self CUDA time total: 35.904us
+Self CPU time total: 822.198us
+Self CUDA time total: 35.968us
 
 
 
@@ -4526,29 +4526,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     337.888us       888.80%     337.888us     337.888us             1  
-                                            torch_eager         6.31%     128.615us        99.74%       2.033ms       2.033ms       0.000us         0.00%      40.576us      40.576us             1  
-                                           aten::conv1d         0.31%       6.349us         5.98%     121.885us      40.628us       0.000us         0.00%      22.304us       7.435us             3  
-                                      aten::convolution         0.53%      10.852us         5.67%     115.536us      38.512us       0.000us         0.00%      22.304us       7.435us             3  
-                                     aten::_convolution         1.24%      25.291us         5.14%     104.684us      34.895us       0.000us         0.00%      22.304us       7.435us             3  
-                                aten::_conv_depthwise2d         1.08%      22.031us         3.01%      61.431us      20.477us      22.304us        58.67%      22.304us       7.435us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.304us        58.67%      22.304us       7.435us             3  
-                                               aten::to         0.34%       6.829us        86.09%       1.755ms     292.477us       0.000us         0.00%      18.272us       3.045us             6  
-                                         aten::_to_copy         1.20%      24.424us        85.75%       1.748ms     291.339us       0.000us         0.00%      18.272us       3.045us             6  
-                                            aten::copy_         2.48%      50.501us        83.10%       1.694ms     282.331us      15.712us        41.33%      18.272us       3.045us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.320us        21.89%       8.320us       2.773us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.392us        19.44%       7.392us       2.464us             3  
-                                Activity Buffer Request        69.75%       1.422ms        69.75%       1.422ms       1.422ms       2.560us         6.73%       2.560us       2.560us             1  
-                                    aten::empty_strided         1.45%      29.621us         1.45%      29.621us       4.937us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        11.90%     242.506us        11.90%     242.506us      26.945us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.92%      18.701us         1.17%      23.851us       2.650us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.43%       8.710us         0.43%       8.710us       0.581us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.48%       9.800us         0.48%       9.800us       3.267us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.43%       8.710us         0.43%       8.710us       2.903us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.35%       7.191us         0.42%       8.621us       2.874us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     370.623us       978.21%     370.623us     370.623us             1  
+                                            torch_eager         6.18%     128.993us        99.74%       2.082ms       2.082ms       0.000us         0.00%      40.448us      40.448us             1  
+                                           aten::conv1d         0.30%       6.311us         5.92%     123.493us      41.164us       0.000us         0.00%      22.177us       7.392us             3  
+                                      aten::convolution         0.50%      10.340us         5.61%     117.182us      39.061us       0.000us         0.00%      22.177us       7.392us             3  
+                                     aten::_convolution         1.15%      24.110us         5.12%     106.842us      35.614us       0.000us         0.00%      22.177us       7.392us             3  
+                                aten::_conv_depthwise2d         1.14%      23.742us         3.15%      65.742us      21.914us      22.177us        58.53%      22.177us       7.392us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      22.177us        58.53%      22.177us       7.392us             3  
+                                               aten::to         1.13%      23.681us        86.31%       1.802ms     300.273us       0.000us         0.00%      18.271us       3.045us             6  
+                                         aten::_to_copy         1.20%      24.951us        85.17%       1.778ms     296.326us       0.000us         0.00%      18.271us       3.045us             6  
+                                            aten::copy_         2.41%      50.250us        82.40%       1.720ms     286.684us      15.711us        41.47%      18.271us       3.045us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us       8.321us        21.96%       8.321us       2.774us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us       7.390us        19.50%       7.390us       2.463us             3  
+                                Activity Buffer Request        69.33%       1.447ms        69.33%       1.447ms       1.447ms       2.560us         6.76%       2.560us       2.560us             1  
+                                    aten::empty_strided         1.58%      32.901us         1.58%      32.901us       5.484us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        11.73%     244.945us        11.73%     244.945us      27.216us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.87%      18.191us         1.14%      23.770us       2.641us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.44%       9.210us         0.44%       9.210us       0.614us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.48%       9.930us         0.48%       9.930us       3.310us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.46%       9.680us         0.46%       9.680us       3.227us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.32%       6.640us         0.38%       7.960us       2.653us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 2.038ms
-Self CUDA time total: 38.016us
+Self CPU time total: 2.088ms
+Self CUDA time total: 37.888us
 
 
 
@@ -4558,29 +4558,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     362.972us       567.16%     362.972us     362.972us             1  
-                                            torch_eager        14.84%     128.544us        99.34%     860.680us     860.680us       0.000us         0.00%      68.061us      68.061us             1  
-                                           aten::conv1d         0.70%       6.079us        16.52%     143.165us      47.722us       0.000us         0.00%      41.728us      13.909us             3  
-                                      aten::convolution         3.42%      29.613us        15.82%     137.086us      45.695us       0.000us         0.00%      41.728us      13.909us             3  
-                                     aten::_convolution         2.86%      24.759us        12.40%     107.473us      35.824us       0.000us         0.00%      41.728us      13.909us             3  
-                                aten::_conv_depthwise2d         2.59%      22.439us         7.67%      66.492us      22.164us      41.728us        65.20%      41.728us      13.909us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      41.728us        65.20%      41.728us      13.909us             3  
-                                               aten::to         0.77%       6.631us        64.71%     560.621us      93.437us       0.000us         0.00%      26.333us       4.389us             6  
-                                         aten::_to_copy         2.80%      24.253us        63.94%     553.990us      92.332us       0.000us         0.00%      26.333us       4.389us             6  
-                                            aten::copy_         5.80%      50.240us        57.50%     498.196us      83.033us      22.270us        34.80%      26.333us       4.389us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.903us        18.60%      11.903us       3.968us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.367us        16.20%      10.367us       3.456us             3  
-                                Activity Buffer Request        26.05%     225.728us        26.05%     225.728us     225.728us       4.063us         6.35%       4.063us       4.063us             1  
-                                    aten::empty_strided         3.64%      31.541us         3.64%      31.541us       5.257us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        28.31%     245.279us        28.31%     245.279us      27.253us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.11%      18.263us         2.74%      23.752us       2.639us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.06%       9.199us         1.06%       9.199us       0.613us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.26%      10.941us         1.26%      10.941us       3.647us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.16%      10.061us         1.16%      10.061us       3.354us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.66%       5.740us         0.85%       7.330us       2.443us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     339.324us       532.86%     339.324us     339.324us             1  
+                                            torch_eager        14.96%     124.364us        99.38%     826.288us     826.288us       0.000us         0.00%      67.776us      67.776us             1  
+                                           aten::conv1d         0.74%       6.121us        14.48%     120.383us      40.128us       0.000us         0.00%      41.409us      13.803us             3  
+                                      aten::convolution         1.31%      10.850us        13.74%     114.262us      38.087us       0.000us         0.00%      41.409us      13.803us             3  
+                                     aten::_convolution         2.80%      23.271us        12.44%     103.412us      34.471us       0.000us         0.00%      41.409us      13.803us             3  
+                                aten::_conv_depthwise2d         2.86%      23.801us         7.77%      64.610us      21.537us      41.409us        65.03%      41.409us      13.803us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      41.409us        65.03%      41.409us      13.803us             3  
+                                               aten::to         0.72%       5.961us        66.69%     554.441us      92.407us       0.000us         0.00%      26.367us       4.395us             6  
+                                         aten::_to_copy         2.84%      23.608us        65.97%     548.480us      91.413us       0.000us         0.00%      26.367us       4.395us             6  
+                                            aten::copy_         6.26%      52.010us        59.41%     493.909us      82.318us      22.271us        34.97%      26.367us       4.395us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.935us        18.74%      11.935us       3.978us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.336us        16.23%      10.336us       3.445us             3  
+                                Activity Buffer Request        27.47%     228.425us        27.47%     228.425us     228.425us       4.096us         6.43%       4.096us       4.096us             1  
+                                    aten::empty_strided         3.72%      30.963us         3.72%      30.963us       5.160us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        28.31%     235.354us        28.31%     235.354us      26.150us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.06%      17.130us         2.64%      21.981us       2.442us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.00%       8.352us         1.00%       8.352us       0.557us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.18%       9.829us         1.18%       9.829us       3.276us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.09%       9.100us         1.09%       9.100us       3.033us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.71%       5.910us         0.88%       7.280us       2.427us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 866.380us
-Self CUDA time total: 63.998us
+Self CPU time total: 831.408us
+Self CUDA time total: 63.680us
 
 
 
@@ -4590,29 +4590,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     357.311us       512.91%     357.311us     357.311us             1  
-                                            torch_eager        20.96%     191.619us        99.38%     908.662us     908.662us       0.000us         0.00%      73.696us      73.696us             1  
-                                           aten::conv1d         0.63%       5.760us        15.23%     139.294us      46.431us       0.000us         0.00%      47.296us      15.765us             3  
-                                      aten::convolution         2.87%      26.271us        14.60%     133.534us      44.511us       0.000us         0.00%      47.296us      15.765us             3  
-                                     aten::_convolution         2.77%      25.360us        11.73%     107.263us      35.754us       0.000us         0.00%      47.296us      15.765us             3  
-                                aten::_conv_depthwise2d         2.38%      21.722us         7.17%      65.523us      21.841us      47.296us        67.89%      47.296us      15.765us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      47.296us        67.89%      47.296us      15.765us             3  
-                                               aten::to         0.73%       6.650us        60.08%     549.318us      91.553us       0.000us         0.00%      26.400us       4.400us             6  
-                                         aten::_to_copy         2.63%      24.032us        59.35%     542.668us      90.445us       0.000us         0.00%      26.400us       4.400us             6  
-                                            aten::copy_         5.57%      50.922us        53.46%     488.786us      81.464us      22.368us        32.11%      26.400us       4.400us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.872us        17.04%      11.872us       3.957us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.496us        15.07%      10.496us       3.499us             3  
-                                Activity Buffer Request        23.91%     218.617us        23.91%     218.617us     218.617us       4.032us         5.79%       4.032us       4.032us             1  
-                                    aten::empty_strided         3.26%      29.850us         3.26%      29.850us       4.975us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        26.57%     242.937us        26.57%     242.937us      26.993us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.04%      18.652us         2.65%      24.251us       2.695us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.01%       9.230us         1.01%       9.230us       0.615us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.08%       9.870us         1.08%       9.870us       3.290us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.12%      10.241us         1.12%      10.241us       3.414us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.63%       5.780us         0.80%       7.270us       2.423us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     341.884us       492.57%     341.884us     341.884us             1  
+                                            torch_eager        14.66%     124.263us        99.38%     842.608us     842.608us       0.000us         0.00%      73.472us      73.472us             1  
+                                           aten::conv1d         0.69%       5.810us        14.06%     119.183us      39.728us       0.000us         0.00%      47.072us      15.691us             3  
+                                      aten::convolution         1.10%       9.331us        13.37%     113.373us      37.791us       0.000us         0.00%      47.072us      15.691us             3  
+                                     aten::_convolution         2.98%      25.231us        12.27%     104.042us      34.681us       0.000us         0.00%      47.072us      15.691us             3  
+                                aten::_conv_depthwise2d         2.57%      21.770us         7.47%      63.341us      21.114us      47.072us        67.82%      47.072us      15.691us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us      47.072us        67.82%      47.072us      15.691us             3  
+                                               aten::to         0.71%       6.042us        67.35%     571.062us      95.177us       0.000us         0.00%      26.400us       4.400us             6  
+                                         aten::_to_copy         2.91%      24.658us        66.64%     565.020us      94.170us       0.000us         0.00%      26.400us       4.400us             6  
+                                            aten::copy_         5.98%      50.742us        60.23%     510.651us      85.108us      22.336us        32.18%      26.400us       4.400us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      11.967us        17.24%      11.967us       3.989us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      10.369us        14.94%      10.369us       3.456us             3  
+                                Activity Buffer Request        28.69%     243.255us        28.69%     243.255us     243.255us       4.064us         5.86%       4.064us       4.064us             1  
+                                    aten::empty_strided         3.50%      29.711us         3.50%      29.711us       4.952us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        28.24%     239.475us        28.24%     239.475us      26.608us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.11%      17.861us         2.71%      22.969us       2.552us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.01%       8.598us         1.01%       8.598us       0.573us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.13%       9.580us         1.13%       9.580us       3.193us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.08%       9.170us         1.08%       9.170us       3.057us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.70%       5.911us         0.85%       7.210us       2.403us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 914.323us
-Self CUDA time total: 69.664us
+Self CPU time total: 847.859us
+Self CUDA time total: 69.408us
 
 
 
@@ -4622,29 +4622,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     348.092us       187.26%     348.092us     348.092us             1  
-                                            torch_eager        14.76%     124.374us        99.29%     836.558us     836.558us       0.000us         0.00%     195.870us     195.870us             1  
-                                           aten::conv1d         0.70%       5.900us        14.42%     121.504us      40.501us       0.000us         0.00%     133.406us      44.469us             3  
-                                      aten::convolution         1.14%       9.610us        13.72%     115.604us      38.535us       0.000us         0.00%     133.406us      44.469us             3  
-                                     aten::_convolution         2.88%      24.263us        12.58%     105.994us      35.331us       0.000us         0.00%     133.406us      44.469us             3  
-                                aten::_conv_depthwise2d         2.73%      23.010us         7.80%      65.750us      21.917us     133.406us        71.77%     133.406us      44.469us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     133.406us        71.77%     133.406us      44.469us             3  
-                                               aten::to         0.74%       6.220us        66.83%     563.060us      93.843us       0.000us         0.00%      62.464us      10.411us             6  
-                                         aten::_to_copy         2.83%      23.861us        66.09%     556.840us      92.807us       0.000us         0.00%      62.464us      10.411us             6  
-                                            aten::copy_         6.03%      50.810us        59.73%     503.287us      83.881us      52.480us        28.23%      62.464us      10.411us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.600us        15.92%      29.600us       9.867us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.880us        12.31%      22.880us       7.627us             3  
-                                Activity Buffer Request        25.69%     216.468us        25.69%     216.468us     216.468us       9.984us         5.37%       9.984us       9.984us             1  
-                                    aten::empty_strided         3.52%      29.692us         3.52%      29.692us       4.949us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        30.59%     257.739us        30.59%     257.739us      28.638us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.08%      17.540us         2.73%      23.000us       2.556us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.12%       9.412us         1.12%       9.412us       0.627us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.20%      10.110us         1.20%      10.110us       3.370us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.29%      10.900us         1.29%      10.900us       3.633us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.68%       5.719us         0.88%       7.451us       2.484us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     350.870us       189.71%     350.870us     350.870us             1  
+                                            torch_eager        14.64%     124.941us        99.33%     847.778us     847.778us       0.000us         0.00%     194.907us     194.907us             1  
+                                           aten::conv1d         0.69%       5.931us        14.37%     122.643us      40.881us       0.000us         0.00%     132.732us      44.244us             3  
+                                      aten::convolution         1.16%       9.910us        13.67%     116.712us      38.904us       0.000us         0.00%     132.732us      44.244us             3  
+                                     aten::_convolution         2.94%      25.098us        12.51%     106.802us      35.601us       0.000us         0.00%     132.732us      44.244us             3  
+                                aten::_conv_depthwise2d         2.63%      22.470us         7.66%      65.342us      21.781us     132.732us        71.76%     132.732us      44.244us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     132.732us        71.76%     132.732us      44.244us             3  
+                                               aten::to         0.71%       6.042us        67.13%     572.943us      95.490us       0.000us         0.00%      62.175us      10.362us             6  
+                                         aten::_to_copy         2.75%      23.470us        66.42%     566.901us      94.484us       0.000us         0.00%      62.175us      10.362us             6  
+                                            aten::copy_         6.00%      51.182us        60.05%     512.571us      85.428us      52.223us        28.24%      62.175us      10.362us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      29.343us        15.86%      29.343us       9.781us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.880us        12.37%      22.880us       7.627us             3  
+                                Activity Buffer Request        29.33%     250.295us        29.33%     250.295us     250.295us       9.952us         5.38%       9.952us       9.952us             1  
+                                    aten::empty_strided         3.62%      30.860us         3.62%      30.860us       5.143us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        27.39%     233.736us        27.39%     233.736us      25.971us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.08%      17.752us         2.70%      23.071us       2.563us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.06%       9.008us         1.06%       9.008us       0.601us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.27%      10.820us         1.27%      10.820us       3.607us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.10%       9.410us         1.10%       9.410us       3.137us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.72%       6.112us         0.88%       7.511us       2.504us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 842.539us
-Self CUDA time total: 185.886us
+Self CPU time total: 853.509us
+Self CUDA time total: 184.955us
 
 
 
@@ -4654,29 +4654,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     348.403us       166.18%     348.403us     348.403us             1  
-                                            torch_eager        14.60%     122.924us        99.33%     836.209us     836.209us       0.000us         0.00%     223.383us     223.383us             1  
-                                           aten::conv1d         0.69%       5.779us        14.01%     117.955us      39.318us       0.000us         0.00%     153.883us      51.294us             3  
-                                      aten::convolution         1.25%      10.491us        13.32%     112.176us      37.392us       0.000us         0.00%     153.883us      51.294us             3  
-                                     aten::_convolution         2.91%      24.484us        12.08%     101.685us      33.895us       0.000us         0.00%     153.883us      51.294us             3  
-                                aten::_conv_depthwise2d         2.49%      20.928us         7.14%      60.070us      20.023us     153.883us        73.40%     153.883us      51.294us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     153.883us        73.40%     153.883us      51.294us             3  
-                                               aten::to         0.73%       6.179us        67.37%     567.200us      94.533us       0.000us         0.00%      69.500us      11.583us             6  
-                                         aten::_to_copy         2.75%      23.132us        66.64%     561.021us      93.504us       0.000us         0.00%      69.500us      11.583us             6  
-                                            aten::copy_         5.91%      49.740us        60.39%     508.377us      84.729us      55.773us        26.60%      69.500us      11.583us             6  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      32.927us        15.71%      32.927us      10.976us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.846us        10.90%      22.846us       7.615us             3  
-                                Activity Buffer Request        29.09%     244.869us        29.09%     244.869us     244.869us      13.727us         6.55%      13.727us      13.727us             1  
-                                    aten::empty_strided         3.51%      29.512us         3.51%      29.512us       4.919us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        27.84%     234.420us        27.84%     234.420us      26.047us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         2.13%      17.973us         2.77%      23.320us       2.591us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         1.09%       9.167us         1.09%       9.167us       0.611us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         1.12%       9.440us         1.12%       9.440us       3.147us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         1.07%       9.050us         1.07%       9.050us       3.017us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.85%       7.121us         1.02%       8.601us       2.867us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     355.227us       169.45%     355.227us     355.227us             1  
+                                            torch_eager        15.18%     123.692us        99.29%     808.918us     808.918us       0.000us         0.00%     223.518us     223.518us             1  
+                                           aten::conv1d         0.72%       5.860us        14.71%     119.853us      39.951us       0.000us         0.00%     153.470us      51.157us             3  
+                                      aten::convolution         1.17%       9.541us        13.99%     113.993us      37.998us       0.000us         0.00%     153.470us      51.157us             3  
+                                     aten::_convolution         3.03%      24.710us        12.82%     104.452us      34.817us       0.000us         0.00%     153.470us      51.157us             3  
+                                aten::_conv_depthwise2d         2.76%      22.461us         7.85%      63.951us      21.317us     153.470us        73.21%     153.470us      51.157us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     153.470us        73.21%     153.470us      51.157us             3  
+                                               aten::to         0.75%       6.140us        65.95%     537.281us      89.547us       0.000us         0.00%      70.048us      11.675us             6  
+                                         aten::_to_copy         2.84%      23.150us        65.19%     531.141us      88.524us       0.000us         0.00%      70.048us      11.675us             6  
+                                            aten::copy_         6.47%      52.731us        58.48%     476.471us      79.412us      56.160us        26.79%      70.048us      11.675us             6  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us      33.184us        15.83%      33.184us      11.061us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.976us        10.96%      22.976us       7.659us             3  
+                                Activity Buffer Request        26.55%     216.325us        26.55%     216.325us     216.325us      13.888us         6.63%      13.888us      13.888us             1  
+                                    aten::empty_strided         3.87%      31.520us         3.87%      31.520us       5.253us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        28.13%     229.215us        28.13%     229.215us      25.468us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         2.20%      17.931us         2.85%      23.181us       2.576us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         1.08%       8.800us         1.08%       8.800us       0.587us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         1.20%       9.790us         1.20%       9.790us       3.263us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         1.22%       9.900us         1.22%       9.900us       3.300us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.73%       5.980us         0.90%       7.360us       2.453us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 841.880us
-Self CUDA time total: 209.656us
+Self CPU time total: 814.738us
+Self CUDA time total: 209.630us
 
 
 
@@ -4686,29 +4686,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         7.22%     135.785us        57.39%       1.079ms       1.079ms       0.000us         0.00%       1.518ms       1.518ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.419ms       100.41%       1.419ms       1.419ms             1  
-                                               aten::to         0.37%       6.901us        40.86%     768.526us     128.088us       0.000us         0.00%     823.221us     137.204us             6  
-                                         aten::_to_copy         1.63%      30.742us        40.49%     761.625us     126.938us       0.000us         0.00%     823.221us     137.204us             6  
-                                            aten::copy_         2.94%      55.302us        27.81%     523.157us      87.193us     717.942us        50.81%     823.221us     137.204us             6  
-                                           aten::conv1d         0.33%       6.280us         6.71%     126.144us      42.048us       0.000us         0.00%     695.094us     231.698us             3  
-                                      aten::convolution         0.57%      10.750us         6.37%     119.864us      39.955us       0.000us         0.00%     695.094us     231.698us             3  
-                                     aten::_convolution         1.35%      25.400us         5.80%     109.114us      36.371us       0.000us         0.00%     695.094us     231.698us             3  
-                                aten::_conv_depthwise2d         1.19%      22.332us         3.55%      66.763us      22.254us     695.094us        49.19%     695.094us     231.698us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     695.094us        49.19%     695.094us     231.698us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     411.706us        29.14%     411.706us     137.235us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     306.236us        21.67%     306.236us     102.079us             3  
-                                Activity Buffer Request        12.99%     244.238us        12.99%     244.238us     244.238us     105.279us         7.45%     105.279us     105.279us             1  
-                                    aten::empty_strided         2.17%      40.811us        11.04%     207.726us      34.621us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel        13.13%     246.997us        13.13%     246.997us      27.444us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         1.97%      37.133us         2.36%      44.413us       4.935us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.58%      10.889us         0.58%      10.889us       0.726us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.53%      10.051us         0.53%      10.051us       3.350us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.58%      11.000us         0.58%      11.000us       3.667us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.34%       6.350us         0.41%       7.700us       2.567us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         6.92%     128.362us        54.16%       1.005ms       1.005ms       0.000us         0.00%       1.522ms       1.522ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.421ms       100.42%       1.421ms       1.421ms             1  
+                                               aten::to         0.36%       6.621us        38.84%     720.727us     120.121us       0.000us         0.00%     826.557us     137.760us             6  
+                                         aten::_to_copy         1.58%      29.231us        38.49%     714.106us     119.018us       0.000us         0.00%     826.557us     137.760us             6  
+                                            aten::copy_         2.91%      54.020us        26.66%     494.611us      82.435us     719.869us        50.86%     826.557us     137.760us             6  
+                                           aten::conv1d         0.33%       6.200us         6.83%     126.803us      42.268us       0.000us         0.00%     695.450us     231.817us             3  
+                                      aten::convolution         0.54%      10.000us         6.50%     120.603us      40.201us       0.000us         0.00%     695.450us     231.817us             3  
+                                     aten::_convolution         1.37%      25.370us         5.96%     110.603us      36.868us       0.000us         0.00%     695.450us     231.817us             3  
+                                aten::_conv_depthwise2d         1.27%      23.500us         3.66%      67.942us      22.647us     695.450us        49.14%     695.450us     231.817us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     695.450us        49.14%     695.450us     231.817us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     408.829us        28.89%     408.829us     136.276us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     311.040us        21.98%     311.040us     103.680us             3  
+                                Activity Buffer Request        12.21%     226.485us        12.21%     226.485us     226.485us     106.688us         7.54%     106.688us     106.688us             1  
+                                    aten::empty_strided         2.00%      37.161us        10.25%     190.264us      31.711us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel        12.87%     238.737us        12.87%     238.737us      26.526us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.97%      18.050us         1.30%      24.121us       2.680us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.54%       9.951us         0.54%       9.951us       0.663us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.54%      10.110us         0.54%      10.110us       3.370us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.52%       9.701us         0.52%       9.701us       3.234us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.37%       6.860us         0.45%       8.350us       2.783us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 1.881ms
-Self CUDA time total: 1.413ms
+Self CPU time total: 1.855ms
+Self CUDA time total: 1.415ms
 
 
 
@@ -4718,61 +4718,61 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-                                            torch_eager         4.25%     132.984us        66.63%       2.083ms       2.083ms       0.000us         0.00%       1.503ms       1.503ms             1  
-                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.434ms       100.41%       1.434ms       1.434ms             1  
-                                               aten::to         0.21%       6.470us        57.53%       1.798ms     299.656us       0.000us         0.00%     765.147us     127.524us             6  
-                                         aten::_to_copy         0.80%      25.009us        57.32%       1.791ms     298.577us       0.000us         0.00%     765.147us     127.524us             6  
-                                            aten::copy_         1.51%      47.155us        55.55%       1.736ms     289.360us     690.492us        48.35%     765.147us     127.524us             6  
-                                           aten::conv1d         0.20%       6.231us         3.91%     122.325us      40.775us       0.000us         0.00%     737.724us     245.908us             3  
-                                      aten::convolution         0.32%       9.920us         3.71%     116.094us      38.698us       0.000us         0.00%     737.724us     245.908us             3  
-                                     aten::_convolution         0.82%      25.623us         3.40%     106.174us      35.391us       0.000us         0.00%     737.724us     245.908us             3  
-                                aten::_conv_depthwise2d         0.70%      21.899us         1.98%      62.011us      20.670us     737.724us        51.65%     737.724us     245.908us             3  
-void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     737.724us        51.65%     737.724us     245.908us             3  
-void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     398.046us        27.87%     398.046us     132.682us             3  
-void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     292.446us        20.48%     292.446us      97.482us             3  
-                                Activity Buffer Request        47.19%       1.475ms        47.19%       1.475ms       1.475ms      74.655us         5.23%      74.655us      74.655us             1  
-                                    aten::empty_strided         0.97%      30.293us         0.97%      30.293us       5.049us       0.000us         0.00%       0.000us       0.000us             6  
-                                       cudaLaunchKernel         7.52%     235.026us         7.52%     235.026us      26.114us       0.000us         0.00%       0.000us       0.000us             9  
-                                        aten::unsqueeze         0.60%      18.740us         0.79%      24.820us       2.758us       0.000us         0.00%       0.000us       0.000us             9  
-                                       aten::as_strided         0.32%      10.019us         0.32%      10.019us       0.668us       0.000us         0.00%       0.000us       0.000us            15  
-                                            aten::empty         0.32%       9.882us         0.32%       9.882us       3.294us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::resize_         0.29%       9.220us         0.29%       9.220us       3.073us       0.000us         0.00%       0.000us       0.000us             3  
-                                          aten::squeeze         0.24%       7.471us         0.29%       9.160us       3.053us       0.000us         0.00%       0.000us       0.000us             3  
+                                            torch_eager         4.16%     128.483us        66.51%       2.056ms       2.056ms       0.000us         0.00%       1.499ms       1.499ms             1  
+                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.430ms       100.42%       1.430ms       1.430ms             1  
+                                               aten::to         0.21%       6.492us        57.40%       1.775ms     295.822us       0.000us         0.00%     760.863us     126.811us             6  
+                                         aten::_to_copy         0.82%      25.449us        57.19%       1.768ms     294.739us       0.000us         0.00%     760.863us     126.811us             6  
+                                            aten::copy_         1.66%      51.471us        55.36%       1.712ms     285.278us     686.079us        48.17%     760.863us     126.811us             6  
+                                           aten::conv1d         0.22%       6.820us         4.02%     124.423us      41.474us       0.000us         0.00%     738.336us     246.112us             3  
+                                      aten::convolution         0.33%      10.111us         3.80%     117.603us      39.201us       0.000us         0.00%     738.336us     246.112us             3  
+                                     aten::_convolution         0.82%      25.320us         3.48%     107.492us      35.831us       0.000us         0.00%     738.336us     246.112us             3  
+                                aten::_conv_depthwise2d         0.75%      23.320us         2.10%      65.022us      21.674us     738.336us        51.83%     738.336us     246.112us             3  
+void at::native::(anonymous namespace)::conv_depthwi...         0.00%       0.000us         0.00%       0.000us       0.000us     738.336us        51.83%     738.336us     246.112us             3  
+void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     395.071us        27.74%     395.071us     131.690us             3  
+void at::native::unrolled_elementwise_kernel<at::nat...         0.00%       0.000us         0.00%       0.000us       0.000us     291.008us        20.43%     291.008us      97.003us             3  
+                                Activity Buffer Request        46.92%       1.451ms        46.92%       1.451ms       1.451ms      74.784us         5.25%      74.784us      74.784us             1  
+                                    aten::empty_strided         1.01%      31.321us         1.01%      31.321us       5.220us       0.000us         0.00%       0.000us       0.000us             6  
+                                       cudaLaunchKernel         7.49%     231.634us         7.49%     231.634us      25.737us       0.000us         0.00%       0.000us       0.000us             9  
+                                        aten::unsqueeze         0.61%      18.861us         0.79%      24.350us       2.706us       0.000us         0.00%       0.000us       0.000us             9  
+                                       aten::as_strided         0.29%       9.099us         0.29%       9.099us       0.607us       0.000us         0.00%       0.000us       0.000us            15  
+                                            aten::empty         0.32%       9.981us         0.32%       9.981us       3.327us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::resize_         0.31%       9.461us         0.31%       9.461us       3.154us       0.000us         0.00%       0.000us       0.000us             3  
+                                          aten::squeeze         0.20%       6.260us         0.25%       7.650us       2.550us       0.000us         0.00%       0.000us       0.000us             3  
 -------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
-Self CPU time total: 3.125ms
-Self CUDA time total: 1.428ms
+Self CPU time total: 3.092ms
+Self CUDA time total: 1.424ms
 
 
 impl                     wl                  p50(ms)  ok
-torch_eager              cuda_B2_D2048_S128_W2     0.08  True
+torch_eager              cuda_B2_D2048_S128_W2     0.09  True
 torch_eager              cuda_B2_D2048_S128_W4     0.09  True
-torch_eager              cuda_B2_D2048_S2048_W2     0.15  True
+torch_eager              cuda_B2_D2048_S2048_W2     0.14  True
 torch_eager              cuda_B2_D2048_S2048_W4     0.16  True
-torch_eager              cuda_B2_D2048_S512_W2     0.08  True
-torch_eager              cuda_B2_D2048_S512_W4     0.08  True
+torch_eager              cuda_B2_D2048_S512_W2     0.09  True
+torch_eager              cuda_B2_D2048_S512_W4     0.09  True
 torch_eager              cuda_B2_D64_S128_W2     0.07  True
 torch_eager              cuda_B2_D64_S128_W4     0.09  True
 torch_eager              cuda_B2_D64_S2048_W2     0.09  True
-torch_eager              cuda_B2_D64_S2048_W4     0.08  True
+torch_eager              cuda_B2_D64_S2048_W4     0.09  True
 torch_eager              cuda_B2_D64_S512_W2     0.09  True
 torch_eager              cuda_B2_D64_S512_W4     0.09  True
 torch_eager              cuda_B4_D2048_S128_W2     0.09  True
-torch_eager              cuda_B4_D2048_S128_W4     0.08  True
+torch_eager              cuda_B4_D2048_S128_W4     0.09  True
 torch_eager              cuda_B4_D2048_S2048_W2     0.49  True
 torch_eager              cuda_B4_D2048_S2048_W4     0.50  True
-torch_eager              cuda_B4_D2048_S512_W2     0.09  True
+torch_eager              cuda_B4_D2048_S512_W2     0.10  True
 torch_eager              cuda_B4_D2048_S512_W4     0.10  True
-torch_eager              cuda_B4_D64_S128_W2     0.08  True
-torch_eager              cuda_B4_D64_S128_W4     0.08  True
-torch_eager              cuda_B4_D64_S2048_W2     0.08  True
+torch_eager              cuda_B4_D64_S128_W2     0.09  True
+torch_eager              cuda_B4_D64_S128_W4     0.09  True
+torch_eager              cuda_B4_D64_S2048_W2     0.09  True
 torch_eager              cuda_B4_D64_S2048_W4     0.09  True
-torch_eager              cuda_B4_D64_S512_W2     0.08  True
-torch_eager              cuda_B4_D64_S512_W4     0.08  True
+torch_eager              cuda_B4_D64_S512_W2     0.09  True
+torch_eager              cuda_B4_D64_S512_W4     0.09  True
 
▶ UV Install Logs