diff --git "a/causal_conv1d/impls/torch_causal_conv1d.html" "b/causal_conv1d/impls/torch_causal_conv1d.html" --- "a/causal_conv1d/impls/torch_causal_conv1d.html" +++ "b/causal_conv1d/impls/torch_causal_conv1d.html" @@ -3871,7 +3871,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: nv | 0.23s +Cell: nv | 0.25s | Raw @@ -3887,7 +3887,7 @@ Cell: nv | 0.23s
Wed Oct 29 00:36:08 2025 +Wed Oct 29 04:14:16 2025 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 | |-----------------------------------------+------------------------+----------------------+ @@ -3896,7 +3896,7 @@ Cell: nv | 0.23s | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 | -| N/A 30C P0 87W / 350W | 0MiB / 46068MiB | 18% Default | +| N/A 35C P0 121W / 350W | 0MiB / 46068MiB | 100% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ @@ -3920,7 +3920,7 @@ Cell: nv | 0.23s ▼ output ▶ uv-logs | -Cell: benchmark | 7.30s +Cell: benchmark | 7.31s | Raw @@ -3982,29 +3982,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 460.509us 2386.43% 460.509us 460.509us 1 - torch_eager 10.46% 229.787us 99.65% 2.189ms 2.189ms 0.000us 0.00% 21.633us 21.633us 1 - aten::to 0.59% 12.913us 79.38% 1.743ms 290.578us 0.000us 0.00% 14.272us 2.379us 6 - aten::_to_copy 1.99% 43.750us 78.79% 1.731ms 288.426us 0.000us 0.00% 14.272us 2.379us 6 - aten::copy_ 2.89% 63.562us 74.16% 1.629ms 271.469us 11.936us 61.85% 14.272us 2.379us 6 - aten::conv1d 0.44% 9.671us 7.66% 168.306us 56.102us 0.000us 0.00% 7.361us 2.454us 3 - aten::convolution 0.72% 15.890us 7.22% 158.635us 52.878us 0.000us 0.00% 7.361us 2.454us 3 - aten::_convolution 1.69% 37.102us 6.50% 142.745us 47.582us 0.000us 0.00% 7.361us 2.454us 3 - aten::_conv_depthwise2d 1.60% 35.230us 3.77% 82.773us 27.591us 7.361us 38.15% 7.361us 2.454us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.361us 38.15% 7.361us 2.454us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.272us 32.50% 6.272us 2.091us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664us 29.35% 5.664us 1.888us 3 - Activity Buffer Request 68.26% 1.499ms 68.26% 1.499ms 1.499ms 2.336us 12.11% 2.336us 2.336us 1 - aten::empty_strided 2.64% 57.992us 2.64% 57.992us 9.665us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 4.12% 90.443us 4.12% 90.443us 10.049us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 1.47% 32.392us 1.88% 41.212us 4.579us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.64% 14.011us 0.64% 14.011us 0.934us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.55% 12.120us 0.55% 12.120us 4.040us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.50% 10.961us 0.50% 10.961us 3.654us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.43% 9.410us 0.51% 11.220us 3.740us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 467.679us 2403.78% 467.679us 467.679us 1 + torch_eager 10.81% 233.756us 99.62% 2.155ms 2.155ms 0.000us 0.00% 21.792us 21.792us 1 + aten::to 0.55% 11.919us 78.64% 1.701ms 283.539us 0.000us 0.00% 14.304us 2.384us 6 + aten::_to_copy 2.04% 44.223us 78.09% 1.689ms 281.553us 0.000us 0.00% 14.304us 2.384us 6 + aten::copy_ 3.07% 66.360us 73.27% 1.585ms 264.169us 11.968us 61.51% 14.304us 2.384us 6 + aten::conv1d 0.40% 8.600us 7.96% 172.134us 57.378us 0.000us 0.00% 7.488us 2.496us 3 + aten::convolution 0.76% 16.533us 7.56% 163.534us 54.511us 0.000us 0.00% 7.488us 2.496us 3 + aten::_convolution 1.65% 35.660us 6.80% 147.001us 49.000us 0.000us 0.00% 7.488us 2.496us 3 + aten::_conv_depthwise2d 1.78% 38.520us 4.15% 89.871us 29.957us 7.488us 38.49% 7.488us 2.496us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.488us 38.49% 7.488us 2.496us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.272us 32.24% 6.272us 2.091us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.696us 29.28% 5.696us 1.899us 3 + Activity Buffer Request 67.06% 1.451ms 67.06% 1.451ms 1.451ms 2.336us 12.01% 2.336us 2.336us 1 + aten::empty_strided 2.78% 60.080us 2.78% 60.080us 10.013us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 4.39% 95.004us 4.39% 95.004us 10.556us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 1.49% 32.209us 1.86% 40.319us 4.480us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.61% 13.180us 0.61% 13.180us 0.879us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.57% 12.310us 0.57% 12.310us 4.103us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.56% 12.130us 0.56% 12.130us 4.043us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.40% 8.601us 0.48% 10.281us 3.427us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.196ms -Self CUDA time total: 19.297us +Self CPU time total: 2.163ms +Self CUDA time total: 19.456us @@ -4014,29 +4014,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 350.557us 1795.89% 350.557us 350.557us 1 - torch_eager 6.82% 130.236us 99.71% 1.905ms 1.905ms 0.000us 0.00% 21.632us 21.632us 1 - aten::to 0.35% 6.597us 84.97% 1.623ms 270.580us 0.000us 0.00% 13.728us 2.288us 6 - aten::_to_copy 1.27% 24.323us 84.63% 1.617ms 269.481us 0.000us 0.00% 13.728us 2.288us 6 - aten::copy_ 2.68% 51.130us 81.67% 1.560ms 260.072us 11.616us 59.51% 13.728us 2.288us 6 - aten::conv1d 0.33% 6.400us 6.43% 122.914us 40.971us 0.000us 0.00% 7.904us 2.635us 3 - aten::convolution 0.52% 9.901us 6.10% 116.514us 38.838us 0.000us 0.00% 7.904us 2.635us 3 - aten::_convolution 1.28% 24.410us 5.58% 106.613us 35.538us 0.000us 0.00% 7.904us 2.635us 3 - aten::_conv_depthwise2d 1.25% 23.932us 3.35% 63.983us 21.328us 7.904us 40.49% 7.904us 2.635us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 40.49% 7.904us 2.635us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 345.183us 1768.26% 345.183us 345.183us 1 + torch_eager 6.79% 129.712us 99.69% 1.904ms 1.904ms 0.000us 0.00% 21.633us 21.633us 1 + aten::to 0.35% 6.752us 84.92% 1.622ms 270.359us 0.000us 0.00% 13.697us 2.283us 6 + aten::_to_copy 1.29% 24.629us 84.57% 1.615ms 269.234us 0.000us 0.00% 13.697us 2.283us 6 + aten::copy_ 2.57% 49.181us 81.04% 1.548ms 258.016us 11.585us 59.35% 13.697us 2.283us 6 + aten::conv1d 0.34% 6.520us 6.51% 124.283us 41.428us 0.000us 0.00% 7.936us 2.645us 3 + aten::convolution 0.52% 9.860us 6.16% 117.763us 39.254us 0.000us 0.00% 7.936us 2.645us 3 + aten::_convolution 1.28% 24.503us 5.65% 107.903us 35.968us 0.000us 0.00% 7.936us 2.645us 3 + aten::_conv_depthwise2d 1.17% 22.379us 3.49% 66.751us 22.250us 7.936us 40.65% 7.936us 2.645us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 40.65% 7.936us 2.645us 3 void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.080us 31.15% 6.080us 2.027us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.536us 28.36% 5.536us 1.845us 3 - Activity Buffer Request 76.19% 1.456ms 76.19% 1.456ms 1.456ms 2.112us 10.82% 2.112us 2.112us 1 - aten::empty_strided 1.68% 32.131us 1.68% 32.131us 5.355us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 3.93% 75.003us 3.93% 75.003us 8.334us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.97% 18.540us 1.29% 24.620us 2.736us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.51% 9.711us 0.51% 9.711us 0.647us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.51% 9.650us 0.51% 9.650us 3.217us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.47% 9.000us 0.47% 9.000us 3.000us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.37% 7.100us 0.45% 8.560us 2.853us 0.000us 0.00% 0.000us 0.000us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.505us 28.20% 5.505us 1.835us 3 + Activity Buffer Request 75.86% 1.449ms 75.86% 1.449ms 1.449ms 2.112us 10.82% 2.112us 2.112us 1 + aten::empty_strided 2.23% 42.682us 2.23% 42.682us 7.114us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 3.91% 74.643us 3.91% 74.643us 8.294us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.96% 18.249us 1.26% 24.119us 2.680us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.50% 9.600us 0.50% 9.600us 0.640us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.51% 9.750us 0.51% 9.750us 3.250us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.51% 9.801us 0.51% 9.801us 3.267us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.31% 5.930us 0.39% 7.450us 2.483us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.911ms -Self CUDA time total: 19.520us +Self CPU time total: 1.910ms +Self CUDA time total: 19.521us @@ -4046,29 +4046,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 379.390us 2047.55% 379.390us 379.390us 1 - torch_eager 8.20% 159.835us 99.65% 1.942ms 1.942ms 0.000us 0.00% 20.449us 20.449us 1 - aten::to 0.37% 7.179us 83.32% 1.624ms 270.686us 0.000us 0.00% 13.536us 2.256us 6 - aten::_to_copy 1.40% 27.213us 82.96% 1.617ms 269.489us 0.000us 0.00% 13.536us 2.256us 6 - aten::copy_ 2.62% 51.160us 79.92% 1.558ms 259.635us 11.616us 62.69% 13.536us 2.256us 6 - aten::conv1d 0.34% 6.560us 6.49% 126.453us 42.151us 0.000us 0.00% 6.913us 2.304us 3 - aten::convolution 0.57% 11.119us 6.15% 119.893us 39.964us 0.000us 0.00% 6.913us 2.304us 3 - aten::_convolution 1.29% 25.191us 5.58% 108.774us 36.258us 0.000us 0.00% 6.913us 2.304us 3 - aten::_conv_depthwise2d 1.16% 22.580us 3.36% 65.502us 21.834us 6.913us 37.31% 6.913us 2.304us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 6.913us 37.31% 6.913us 2.304us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 31.95% 5.920us 1.973us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.696us 30.74% 5.696us 1.899us 3 - Activity Buffer Request 74.82% 1.458ms 74.82% 1.458ms 1.458ms 1.920us 10.36% 1.920us 1.920us 1 - aten::empty_strided 1.64% 31.911us 1.64% 31.911us 5.319us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 3.59% 70.043us 3.59% 70.043us 7.783us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 1.01% 19.612us 1.35% 26.392us 2.932us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.55% 10.750us 0.55% 10.750us 0.717us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.62% 12.182us 0.62% 12.182us 4.061us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.46% 8.910us 0.46% 8.910us 2.970us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.35% 6.890us 0.42% 8.260us 2.753us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 350.236us 1887.05% 350.236us 350.236us 1 + torch_eager 6.95% 131.684us 99.72% 1.889ms 1.889ms 0.000us 0.00% 20.481us 20.481us 1 + aten::to 0.32% 5.979us 84.80% 1.606ms 267.646us 0.000us 0.00% 13.570us 2.262us 6 + aten::_to_copy 1.26% 23.830us 84.48% 1.600ms 266.649us 0.000us 0.00% 13.570us 2.262us 6 + aten::copy_ 3.12% 59.102us 81.66% 1.546ms 257.734us 11.649us 62.76% 13.570us 2.262us 6 + aten::conv1d 0.33% 6.189us 6.49% 122.822us 40.941us 0.000us 0.00% 6.911us 2.304us 3 + aten::convolution 0.53% 10.011us 6.16% 116.633us 38.878us 0.000us 0.00% 6.911us 2.304us 3 + aten::_convolution 1.28% 24.209us 5.63% 106.622us 35.541us 0.000us 0.00% 6.911us 2.304us 3 + aten::_conv_depthwise2d 1.23% 23.239us 3.44% 65.172us 21.724us 6.911us 37.24% 6.911us 2.304us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 6.911us 37.24% 6.911us 2.304us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.921us 31.90% 5.921us 1.974us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.728us 30.86% 5.728us 1.909us 3 + Activity Buffer Request 75.86% 1.437ms 75.86% 1.437ms 1.437ms 1.921us 10.35% 1.921us 1.921us 1 + aten::empty_strided 1.57% 29.661us 1.57% 29.661us 4.944us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 3.93% 74.492us 3.93% 74.492us 8.277us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.98% 18.470us 1.28% 24.221us 2.691us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.50% 9.492us 0.50% 9.492us 0.633us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.52% 9.761us 0.52% 9.761us 3.254us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.44% 8.371us 0.44% 8.371us 2.790us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.31% 5.870us 0.39% 7.390us 2.463us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.949ms -Self CUDA time total: 18.529us +Self CPU time total: 1.894ms +Self CUDA time total: 18.560us @@ -4078,29 +4078,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 340.058us 1736.41% 340.058us 340.058us 1 - torch_eager 6.15% 129.375us 99.74% 2.097ms 2.097ms 0.000us 0.00% 21.760us 21.760us 1 - aten::to 0.32% 6.700us 86.45% 1.818ms 303.002us 0.000us 0.00% 14.112us 2.352us 6 - aten::_to_copy 1.17% 24.651us 86.13% 1.811ms 301.886us 0.000us 0.00% 14.112us 2.352us 6 - aten::copy_ 2.42% 50.883us 83.54% 1.757ms 292.785us 11.936us 60.95% 14.112us 2.352us 6 - aten::conv1d 0.30% 6.290us 5.74% 120.803us 40.268us 0.000us 0.00% 7.648us 2.549us 3 - aten::convolution 0.48% 10.020us 5.45% 114.513us 38.171us 0.000us 0.00% 7.648us 2.549us 3 - aten::_convolution 1.15% 24.209us 4.97% 104.493us 34.831us 0.000us 0.00% 7.648us 2.549us 3 - aten::_conv_depthwise2d 1.00% 21.080us 2.93% 61.691us 20.564us 7.648us 39.05% 7.648us 2.549us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.648us 39.05% 7.648us 2.549us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.208us 31.70% 6.208us 2.069us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.728us 29.25% 5.728us 1.909us 3 - Activity Buffer Request 71.15% 1.496ms 71.15% 1.496ms 1.496ms 2.176us 11.11% 2.176us 2.176us 1 - aten::empty_strided 1.42% 29.951us 1.42% 29.951us 4.992us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 10.98% 230.807us 10.98% 230.807us 25.645us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.94% 19.863us 1.21% 25.543us 2.838us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.46% 9.630us 0.46% 9.630us 0.642us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.50% 10.541us 0.50% 10.541us 3.514us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.42% 8.810us 0.42% 8.810us 2.937us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.35% 7.411us 0.44% 9.201us 3.067us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 349.473us 1781.39% 349.473us 349.473us 1 + torch_eager 6.11% 131.525us 99.75% 2.146ms 2.146ms 0.000us 0.00% 21.795us 21.795us 1 + aten::to 0.31% 6.681us 86.66% 1.864ms 310.738us 0.000us 0.00% 14.148us 2.358us 6 + aten::_to_copy 1.14% 24.510us 86.35% 1.858ms 309.625us 0.000us 0.00% 14.148us 2.358us 6 + aten::copy_ 2.35% 50.532us 83.71% 1.801ms 300.153us 11.971us 61.02% 14.148us 2.358us 6 + aten::conv1d 0.29% 6.159us 5.69% 122.482us 40.827us 0.000us 0.00% 7.647us 2.549us 3 + aten::convolution 0.45% 9.650us 5.41% 116.323us 38.774us 0.000us 0.00% 7.647us 2.549us 3 + aten::_convolution 1.16% 25.049us 4.96% 106.673us 35.558us 0.000us 0.00% 7.647us 2.549us 3 + aten::_conv_depthwise2d 1.06% 22.843us 3.03% 65.182us 21.727us 7.647us 38.98% 7.647us 2.549us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.647us 38.98% 7.647us 2.549us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.211us 31.66% 6.211us 2.070us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.760us 29.36% 5.760us 1.920us 3 + Activity Buffer Request 68.59% 1.476ms 68.59% 1.476ms 1.476ms 2.177us 11.10% 2.177us 2.177us 1 + aten::empty_strided 1.50% 32.320us 1.50% 32.320us 5.387us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 13.84% 297.685us 13.84% 297.685us 33.076us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.81% 17.433us 1.07% 22.952us 2.550us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.42% 9.029us 0.42% 9.029us 0.602us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.47% 10.100us 0.47% 10.100us 3.367us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.44% 9.389us 0.44% 9.389us 3.130us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.30% 6.350us 0.36% 7.690us 2.563us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.103ms -Self CUDA time total: 19.584us +Self CPU time total: 2.151ms +Self CUDA time total: 19.618us @@ -4110,29 +4110,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 339.070us 1381.53% 339.070us 339.070us 1 - torch_eager 6.44% 132.135us 99.72% 2.045ms 2.045ms 0.000us 0.00% 26.814us 26.814us 1 - aten::to 0.33% 6.722us 86.08% 1.765ms 294.155us 0.000us 0.00% 15.262us 2.544us 6 - aten::_to_copy 1.20% 24.702us 85.75% 1.758ms 293.035us 0.000us 0.00% 15.262us 2.544us 6 - aten::copy_ 2.39% 49.030us 83.04% 1.702ms 283.750us 12.991us 52.93% 15.262us 2.544us 6 - aten::conv1d 0.29% 5.850us 5.78% 118.603us 39.534us 0.000us 0.00% 11.552us 3.851us 3 - aten::convolution 0.55% 11.220us 5.50% 112.753us 37.584us 0.000us 0.00% 11.552us 3.851us 3 - aten::_convolution 1.18% 24.170us 4.95% 101.533us 33.844us 0.000us 0.00% 11.552us 3.851us 3 - aten::_conv_depthwise2d 1.08% 22.212us 2.99% 61.273us 20.424us 11.552us 47.07% 11.552us 3.851us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 11.552us 47.07% 11.552us 3.851us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.655us 27.12% 6.655us 2.218us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.336us 25.82% 6.336us 2.112us 3 - Activity Buffer Request 71.25% 1.461ms 71.25% 1.461ms 1.461ms 2.271us 9.25% 2.271us 2.271us 1 - aten::empty_strided 1.51% 31.010us 1.51% 31.010us 5.168us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 10.41% 213.527us 10.41% 213.527us 23.725us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.89% 18.350us 1.15% 23.660us 2.629us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.45% 9.131us 0.45% 9.131us 0.609us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.46% 9.481us 0.46% 9.481us 3.160us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.43% 8.760us 0.43% 8.760us 2.920us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.27% 5.520us 0.33% 6.850us 2.283us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 348.958us 1429.22% 348.958us 348.958us 1 + torch_eager 6.64% 139.433us 99.75% 2.094ms 2.094ms 0.000us 0.00% 26.656us 26.656us 1 + aten::to 0.30% 6.349us 85.85% 1.803ms 300.420us 0.000us 0.00% 15.136us 2.523us 6 + aten::_to_copy 1.17% 24.664us 85.55% 1.796ms 299.362us 0.000us 0.00% 15.136us 2.523us 6 + aten::copy_ 2.46% 51.670us 82.81% 1.739ms 289.779us 12.896us 52.82% 15.136us 2.523us 6 + aten::conv1d 0.30% 6.230us 5.89% 123.663us 41.221us 0.000us 0.00% 11.520us 3.840us 3 + aten::convolution 0.49% 10.211us 5.59% 117.433us 39.144us 0.000us 0.00% 11.520us 3.840us 3 + aten::_convolution 1.21% 25.350us 5.11% 107.222us 35.741us 0.000us 0.00% 11.520us 3.840us 3 + aten::_conv_depthwise2d 1.07% 22.551us 3.09% 64.932us 21.644us 11.520us 47.18% 11.520us 3.840us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 11.520us 47.18% 11.520us 3.840us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.592us 27.00% 6.592us 2.197us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.304us 25.82% 6.304us 2.101us 3 + Activity Buffer Request 67.91% 1.426ms 67.91% 1.426ms 1.426ms 2.240us 9.17% 2.240us 2.240us 1 + aten::empty_strided 1.56% 32.829us 1.56% 32.829us 5.472us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 13.56% 284.686us 13.56% 284.686us 31.632us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.89% 18.631us 1.15% 24.041us 2.671us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.44% 9.180us 0.44% 9.180us 0.612us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.47% 9.871us 0.47% 9.871us 3.290us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.43% 8.930us 0.43% 8.930us 2.977us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.31% 6.480us 0.38% 7.900us 2.633us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.050ms -Self CUDA time total: 24.543us +Self CPU time total: 2.100ms +Self CUDA time total: 24.416us @@ -4142,29 +4142,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D64_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 339.129us 1305.15% 339.129us 339.129us 1 - torch_eager 6.29% 128.886us 99.74% 2.043ms 2.043ms 0.000us 0.00% 28.224us 28.224us 1 - aten::to 0.34% 6.902us 86.10% 1.763ms 293.882us 0.000us 0.00% 15.168us 2.528us 6 - aten::_to_copy 1.23% 25.190us 85.76% 1.756ms 292.731us 0.000us 0.00% 15.168us 2.528us 6 - aten::copy_ 2.41% 49.270us 83.08% 1.701ms 283.571us 12.928us 49.75% 15.168us 2.528us 6 - aten::conv1d 0.31% 6.370us 5.92% 121.333us 40.444us 0.000us 0.00% 13.056us 4.352us 3 - aten::convolution 0.49% 10.120us 5.61% 114.963us 38.321us 0.000us 0.00% 13.056us 4.352us 3 - aten::_convolution 1.25% 25.500us 5.12% 104.843us 34.948us 0.000us 0.00% 13.056us 4.352us 3 - aten::_conv_depthwise2d 1.08% 22.212us 3.04% 62.243us 20.748us 13.056us 50.25% 13.056us 4.352us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 13.056us 50.25% 13.056us 4.352us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.592us 25.37% 6.592us 2.197us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.336us 24.38% 6.336us 2.112us 3 - Activity Buffer Request 71.41% 1.463ms 71.41% 1.463ms 1.463ms 2.240us 8.62% 2.240us 2.240us 1 - aten::empty_strided 1.45% 29.770us 1.45% 29.770us 4.962us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 10.25% 209.968us 10.25% 209.968us 23.330us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.92% 18.870us 1.21% 24.780us 2.753us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.47% 9.601us 0.47% 9.601us 0.640us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.51% 10.510us 0.51% 10.510us 3.503us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.45% 9.181us 0.45% 9.181us 3.060us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.32% 6.640us 0.40% 8.140us 2.713us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 350.300us 1353.19% 350.300us 350.300us 1 + torch_eager 6.16% 129.673us 99.74% 2.100ms 2.100ms 0.000us 0.00% 28.127us 28.127us 1 + aten::to 0.30% 6.400us 86.28% 1.817ms 302.813us 0.000us 0.00% 15.135us 2.522us 6 + aten::_to_copy 1.17% 24.572us 85.97% 1.810ms 301.746us 0.000us 0.00% 15.135us 2.522us 6 + aten::copy_ 2.32% 48.831us 83.30% 1.754ms 292.358us 12.895us 49.81% 15.135us 2.522us 6 + aten::conv1d 0.30% 6.370us 5.91% 124.553us 41.518us 0.000us 0.00% 12.992us 4.331us 3 + aten::convolution 0.48% 10.021us 5.61% 118.183us 39.394us 0.000us 0.00% 12.992us 4.331us 3 + aten::_convolution 1.13% 23.790us 5.14% 108.162us 36.054us 0.000us 0.00% 12.992us 4.331us 3 + aten::_conv_depthwise2d 1.15% 24.221us 3.16% 66.582us 22.194us 12.992us 50.19% 12.992us 4.331us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 12.992us 50.19% 12.992us 4.331us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.592us 25.46% 6.592us 2.197us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.303us 24.35% 6.303us 2.101us 3 + Activity Buffer Request 68.95% 1.452ms 68.95% 1.452ms 1.452ms 2.240us 8.65% 2.240us 2.240us 1 + aten::empty_strided 1.51% 31.759us 1.51% 31.759us 5.293us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 13.13% 276.435us 13.13% 276.435us 30.715us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.91% 19.219us 1.21% 25.491us 2.832us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.48% 10.094us 0.48% 10.094us 0.673us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.47% 9.932us 0.47% 9.932us 3.311us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.44% 9.289us 0.44% 9.289us 3.096us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.33% 6.860us 0.40% 8.350us 2.783us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.048ms -Self CUDA time total: 25.984us +Self CPU time total: 2.106ms +Self CUDA time total: 25.887us @@ -4174,29 +4174,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 362.270us 942.63% 362.270us 362.270us 1 - torch_eager 7.50% 163.876us 99.75% 2.180ms 2.180ms 0.000us 0.00% 40.993us 40.993us 1 - aten::conv1d 0.34% 7.388us 5.94% 129.794us 43.265us 0.000us 0.00% 22.464us 7.488us 3 - aten::convolution 0.56% 12.301us 5.60% 122.406us 40.802us 0.000us 0.00% 22.464us 7.488us 3 - aten::_convolution 1.18% 25.829us 5.04% 110.105us 36.702us 0.000us 0.00% 22.464us 7.488us 3 - aten::_conv_depthwise2d 1.07% 23.371us 2.94% 64.311us 21.437us 22.464us 58.45% 22.464us 7.488us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.464us 58.45% 22.464us 7.488us 3 - aten::to 0.36% 7.830us 84.95% 1.856ms 309.406us 0.000us 0.00% 18.529us 3.088us 6 - aten::_to_copy 1.44% 31.560us 84.59% 1.849ms 308.101us 0.000us 0.00% 18.529us 3.088us 6 - aten::copy_ 2.41% 52.633us 81.64% 1.784ms 297.326us 15.968us 41.55% 18.529us 3.088us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.609us 22.40% 8.609us 2.870us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.359us 19.15% 7.359us 2.453us 3 - Activity Buffer Request 65.39% 1.429ms 65.39% 1.429ms 1.429ms 2.561us 6.66% 2.561us 2.561us 1 - aten::empty_strided 1.51% 33.091us 1.51% 33.091us 5.515us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 14.87% 325.052us 14.87% 325.052us 36.117us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 1.00% 21.833us 1.21% 26.523us 2.947us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.39% 8.492us 0.39% 8.492us 0.566us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.44% 9.570us 0.44% 9.570us 3.190us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.40% 8.750us 0.40% 8.750us 2.917us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.37% 7.980us 0.45% 9.772us 3.257us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 344.670us 904.34% 344.670us 344.670us 1 + torch_eager 6.27% 130.413us 99.73% 2.076ms 2.076ms 0.000us 0.00% 40.673us 40.673us 1 + aten::conv1d 0.29% 6.011us 5.81% 120.902us 40.301us 0.000us 0.00% 22.369us 7.456us 3 + aten::convolution 0.46% 9.579us 5.52% 114.891us 38.297us 0.000us 0.00% 22.369us 7.456us 3 + aten::_convolution 1.17% 24.271us 5.06% 105.312us 35.104us 0.000us 0.00% 22.369us 7.456us 3 + aten::_conv_depthwise2d 1.07% 22.281us 3.10% 64.540us 21.513us 22.369us 58.69% 22.369us 7.456us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.369us 58.69% 22.369us 7.456us 3 + aten::to 0.30% 6.240us 86.29% 1.796ms 299.368us 0.000us 0.00% 18.304us 3.051us 6 + aten::_to_copy 1.19% 24.702us 86.00% 1.790ms 298.328us 0.000us 0.00% 18.304us 3.051us 6 + aten::copy_ 2.32% 48.271us 83.37% 1.735ms 289.226us 15.744us 41.31% 18.304us 3.051us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.352us 21.91% 8.352us 2.784us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.392us 19.39% 7.392us 2.464us 3 + Activity Buffer Request 69.09% 1.438ms 69.09% 1.438ms 1.438ms 2.560us 6.72% 2.560us 2.560us 1 + aten::empty_strided 1.44% 29.909us 1.44% 29.909us 4.985us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 13.10% 272.705us 13.10% 272.705us 30.301us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.90% 18.821us 1.17% 24.281us 2.698us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.43% 8.910us 0.43% 8.910us 0.594us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.47% 9.769us 0.47% 9.769us 3.256us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.42% 8.830us 0.42% 8.830us 2.943us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.30% 6.180us 0.36% 7.570us 2.523us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.185ms -Self CUDA time total: 38.432us +Self CPU time total: 2.081ms +Self CUDA time total: 38.113us @@ -4206,29 +4206,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 339.836us 827.74% 339.836us 339.836us 1 - torch_eager 6.54% 141.434us 99.74% 2.158ms 2.158ms 0.000us 0.00% 43.648us 43.648us 1 - aten::conv1d 0.28% 6.090us 5.53% 119.574us 39.858us 0.000us 0.00% 25.407us 8.469us 3 - aten::convolution 0.46% 9.939us 5.25% 113.484us 37.828us 0.000us 0.00% 25.407us 8.469us 3 - aten::_convolution 1.12% 24.214us 4.79% 103.545us 34.515us 0.000us 0.00% 25.407us 8.469us 3 - aten::_conv_depthwise2d 1.05% 22.612us 2.94% 63.593us 21.198us 25.407us 61.88% 25.407us 8.469us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 25.407us 61.88% 25.407us 8.469us 3 - aten::to 0.29% 6.201us 86.38% 1.869ms 311.424us 0.000us 0.00% 18.241us 3.040us 6 - aten::_to_copy 1.18% 25.424us 86.09% 1.862ms 310.391us 0.000us 0.00% 18.241us 3.040us 6 - aten::copy_ 2.40% 51.862us 83.52% 1.807ms 301.107us 15.649us 38.12% 18.241us 3.040us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.320us 20.27% 8.320us 2.773us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.329us 17.85% 7.329us 2.443us 3 - Activity Buffer Request 68.07% 1.472ms 68.07% 1.472ms 1.472ms 2.592us 6.31% 2.592us 2.592us 1 - aten::empty_strided 1.40% 30.280us 1.40% 30.280us 5.047us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 14.06% 304.169us 14.06% 304.169us 33.797us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.84% 18.230us 1.08% 23.418us 2.602us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.40% 8.619us 0.40% 8.619us 0.575us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.48% 10.370us 0.48% 10.370us 3.457us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.41% 8.770us 0.41% 8.770us 2.923us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.26% 5.659us 0.32% 6.990us 2.330us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 339.324us 829.06% 339.324us 339.324us 1 + torch_eager 6.37% 130.712us 99.71% 2.046ms 2.046ms 0.000us 0.00% 43.521us 43.521us 1 + aten::conv1d 0.29% 5.880us 5.94% 121.953us 40.651us 0.000us 0.00% 25.216us 8.405us 3 + aten::convolution 0.47% 9.711us 5.66% 116.073us 38.691us 0.000us 0.00% 25.216us 8.405us 3 + aten::_convolution 1.26% 25.911us 5.18% 106.362us 35.454us 0.000us 0.00% 25.216us 8.405us 3 + aten::_conv_depthwise2d 1.09% 22.379us 3.16% 64.832us 21.611us 25.216us 61.61% 25.216us 8.405us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 25.216us 61.61% 25.216us 8.405us 3 + aten::to 0.29% 5.930us 86.07% 1.766ms 294.313us 0.000us 0.00% 18.305us 3.051us 6 + aten::_to_copy 1.14% 23.292us 85.79% 1.760ms 293.325us 0.000us 0.00% 18.305us 3.051us 6 + aten::copy_ 2.44% 50.149us 83.18% 1.707ms 284.430us 15.713us 38.39% 18.305us 3.051us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.320us 20.33% 8.320us 2.773us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.393us 18.06% 7.393us 2.464us 3 + Activity Buffer Request 69.04% 1.416ms 69.04% 1.416ms 1.416ms 2.592us 6.33% 2.592us 2.592us 1 + aten::empty_strided 1.47% 30.081us 1.47% 30.081us 5.013us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 12.82% 263.078us 12.82% 263.078us 29.231us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.84% 17.161us 1.08% 22.249us 2.472us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.43% 8.738us 0.43% 8.738us 0.583us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.52% 10.621us 0.52% 10.621us 3.540us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.43% 8.801us 0.43% 8.801us 2.934us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.28% 5.670us 0.35% 7.160us 2.387us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.163ms -Self CUDA time total: 41.056us +Self CPU time total: 2.052ms +Self CUDA time total: 40.929us @@ -4238,29 +4238,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 338.560us 329.80% 338.560us 338.560us 1 - torch_eager 6.25% 131.427us 99.74% 2.098ms 2.098ms 0.000us 0.00% 108.608us 108.608us 1 - aten::conv1d 0.29% 6.110us 5.71% 120.083us 40.028us 0.000us 0.00% 70.496us 23.499us 3 - aten::convolution 0.47% 9.940us 5.42% 113.973us 37.991us 0.000us 0.00% 70.496us 23.499us 3 - aten::_convolution 1.11% 23.441us 4.94% 104.033us 34.678us 0.000us 0.00% 70.496us 23.499us 3 - aten::_conv_depthwise2d 1.04% 21.830us 2.93% 61.652us 20.551us 70.496us 68.67% 70.496us 23.499us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 70.496us 68.67% 70.496us 23.499us 3 - aten::to 0.30% 6.292us 86.43% 1.818ms 303.059us 0.000us 0.00% 38.112us 6.352us 6 - aten::_to_copy 1.17% 24.539us 86.13% 1.812ms 302.010us 0.000us 0.00% 38.112us 6.352us 6 - aten::copy_ 2.47% 51.869us 83.58% 1.758ms 293.072us 32.160us 31.33% 38.112us 6.352us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.568us 17.11% 17.568us 5.856us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.592us 14.21% 14.592us 4.864us 3 - Activity Buffer Request 67.63% 1.423ms 67.63% 1.423ms 1.423ms 5.952us 5.80% 5.952us 5.952us 1 - aten::empty_strided 1.38% 29.091us 1.38% 29.091us 4.849us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 14.47% 304.542us 14.47% 304.542us 33.838us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.91% 19.049us 1.17% 24.579us 2.731us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.43% 9.070us 0.43% 9.070us 0.605us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.49% 10.351us 0.49% 10.351us 3.450us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.41% 8.621us 0.41% 8.621us 2.874us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.38% 8.050us 0.45% 9.470us 3.157us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 346.587us 339.00% 346.587us 346.587us 1 + torch_eager 6.07% 126.382us 99.75% 2.075ms 2.075ms 0.000us 0.00% 108.223us 108.223us 1 + aten::conv1d 0.27% 5.689us 5.84% 121.563us 40.521us 0.000us 0.00% 70.111us 23.370us 3 + aten::convolution 0.45% 9.432us 5.57% 115.874us 38.625us 0.000us 0.00% 70.111us 23.370us 3 + aten::_convolution 1.15% 23.992us 5.12% 106.442us 35.481us 0.000us 0.00% 70.111us 23.370us 3 + aten::_conv_depthwise2d 1.13% 23.510us 3.19% 66.451us 22.150us 70.111us 68.58% 70.111us 23.370us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 70.111us 68.58% 70.111us 23.370us 3 + aten::to 0.33% 6.762us 86.49% 1.799ms 299.916us 0.000us 0.00% 38.112us 6.352us 6 + aten::_to_copy 1.17% 24.419us 86.17% 1.793ms 298.789us 0.000us 0.00% 38.112us 6.352us 6 + aten::copy_ 2.24% 46.671us 83.54% 1.738ms 289.665us 32.128us 31.42% 38.112us 6.352us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.568us 17.18% 17.568us 5.856us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.560us 14.24% 14.560us 4.853us 3 + Activity Buffer Request 69.85% 1.453ms 69.85% 1.453ms 1.453ms 5.984us 5.85% 5.984us 5.984us 1 + aten::empty_strided 1.46% 30.330us 1.46% 30.330us 5.055us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 12.58% 261.816us 12.58% 261.816us 29.091us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.87% 18.161us 1.14% 23.661us 2.629us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.44% 9.119us 0.44% 9.119us 0.608us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.48% 9.940us 0.48% 9.940us 3.313us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.45% 9.280us 0.45% 9.280us 3.093us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.27% 5.680us 0.34% 7.119us 2.373us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.104ms -Self CUDA time total: 102.656us +Self CPU time total: 2.081ms +Self CUDA time total: 102.239us @@ -4270,29 +4270,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 340.578us 301.93% 340.578us 340.578us 1 - torch_eager 6.29% 133.214us 99.74% 2.113ms 2.113ms 0.000us 0.00% 118.752us 118.752us 1 - aten::conv1d 0.31% 6.499us 5.66% 119.974us 39.991us 0.000us 0.00% 80.576us 26.859us 3 - aten::convolution 0.47% 9.880us 5.36% 113.475us 37.825us 0.000us 0.00% 80.576us 26.859us 3 - aten::_convolution 1.21% 25.730us 4.89% 103.595us 34.532us 0.000us 0.00% 80.576us 26.859us 3 - aten::_conv_depthwise2d 1.01% 21.361us 2.87% 60.832us 20.277us 80.576us 71.43% 80.576us 26.859us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 80.576us 71.43% 80.576us 26.859us 3 - aten::to 0.33% 7.060us 86.42% 1.831ms 305.149us 0.000us 0.00% 38.176us 6.363us 6 - aten::_to_copy 1.15% 24.352us 86.09% 1.824ms 303.972us 0.000us 0.00% 38.176us 6.363us 6 - aten::copy_ 2.34% 49.642us 83.57% 1.770ms 295.075us 32.224us 28.57% 38.176us 6.363us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.664us 15.66% 17.664us 5.888us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.560us 12.91% 14.560us 4.853us 3 - Activity Buffer Request 68.62% 1.454ms 68.62% 1.454ms 1.454ms 5.952us 5.28% 5.952us 5.952us 1 - aten::empty_strided 1.37% 29.031us 1.37% 29.031us 4.838us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 13.59% 287.970us 13.59% 287.970us 31.997us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.89% 18.772us 1.17% 24.871us 2.763us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.45% 9.520us 0.45% 9.520us 0.635us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.46% 9.850us 0.46% 9.850us 3.283us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.41% 8.670us 0.41% 8.670us 2.890us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.32% 6.821us 0.38% 8.112us 2.704us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 340.895us 303.69% 340.895us 340.895us 1 + torch_eager 5.96% 121.922us 99.72% 2.040ms 2.040ms 0.000us 0.00% 118.204us 118.204us 1 + aten::conv1d 0.29% 5.851us 5.96% 121.923us 40.641us 0.000us 0.00% 80.190us 26.730us 3 + aten::convolution 0.47% 9.659us 5.67% 116.072us 38.691us 0.000us 0.00% 80.190us 26.730us 3 + aten::_convolution 1.15% 23.552us 5.20% 106.413us 35.471us 0.000us 0.00% 80.190us 26.730us 3 + aten::_conv_depthwise2d 1.14% 23.240us 3.13% 64.041us 21.347us 80.190us 71.44% 80.190us 26.730us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 80.190us 71.44% 80.190us 26.730us 3 + aten::to 0.30% 6.190us 86.45% 1.769ms 294.821us 0.000us 0.00% 38.014us 6.336us 6 + aten::_to_copy 1.15% 23.531us 86.15% 1.763ms 293.790us 0.000us 0.00% 38.014us 6.336us 6 + aten::copy_ 2.44% 49.841us 83.49% 1.708ms 284.726us 32.062us 28.56% 38.014us 6.336us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 17.503us 15.59% 17.503us 5.834us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 14.559us 12.97% 14.559us 4.853us 3 + Activity Buffer Request 69.71% 1.426ms 69.71% 1.426ms 1.426ms 5.952us 5.30% 5.952us 5.952us 1 + aten::empty_strided 1.51% 30.850us 1.51% 30.850us 5.142us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 12.43% 254.276us 12.43% 254.276us 28.253us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.92% 18.780us 1.20% 24.600us 2.733us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.47% 9.541us 0.47% 9.541us 0.636us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.49% 10.090us 0.49% 10.090us 3.363us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.42% 8.680us 0.42% 8.680us 2.893us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.34% 6.870us 0.41% 8.300us 2.767us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.119ms -Self CUDA time total: 112.800us +Self CPU time total: 2.046ms +Self CUDA time total: 112.252us @@ -4302,29 +4302,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 6.32% 133.665us 99.60% 2.106ms 2.106ms 0.000us 0.00% 433.181us 433.181us 1 - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 423.869us 107.93% 423.869us 423.869us 1 - aten::conv1d 0.30% 6.441us 5.98% 126.475us 42.158us 0.000us 0.00% 252.190us 84.063us 3 - aten::convolution 0.49% 10.391us 5.68% 120.034us 40.011us 0.000us 0.00% 252.190us 84.063us 3 - aten::_convolution 1.19% 25.110us 5.19% 109.643us 36.548us 0.000us 0.00% 252.190us 84.063us 3 - aten::_conv_depthwise2d 1.07% 22.550us 3.14% 66.363us 22.121us 252.190us 64.21% 252.190us 84.063us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 252.190us 64.21% 252.190us 84.063us 3 - aten::to 0.33% 6.989us 85.86% 1.815ms 302.520us 0.000us 0.00% 180.991us 30.165us 6 - aten::_to_copy 1.18% 24.921us 85.53% 1.808ms 301.355us 0.000us 0.00% 180.991us 30.165us 6 - aten::copy_ 2.39% 50.532us 82.93% 1.753ms 292.204us 140.543us 35.79% 180.991us 30.165us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 100.768us 25.66% 100.768us 33.589us 3 - Activity Buffer Request 67.47% 1.426ms 67.47% 1.426ms 1.426ms 40.448us 10.30% 40.448us 40.448us 1 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 39.775us 10.13% 39.775us 13.258us 3 - aten::empty_strided 1.42% 29.990us 1.42% 29.990us 4.998us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 14.15% 299.142us 14.15% 299.142us 33.238us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.92% 19.400us 1.21% 25.500us 2.833us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.49% 10.430us 0.49% 10.430us 0.695us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.55% 11.580us 0.55% 11.580us 3.860us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.44% 9.361us 0.44% 9.361us 3.120us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.34% 7.110us 0.42% 8.900us 2.967us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 6.19% 128.263us 98.22% 2.035ms 2.035ms 0.000us 0.00% 432.800us 432.800us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 422.880us 107.61% 422.880us 422.880us 1 + aten::conv1d 0.31% 6.390us 5.89% 122.123us 40.708us 0.000us 0.00% 250.976us 83.659us 3 + aten::convolution 0.46% 9.601us 5.59% 115.733us 38.578us 0.000us 0.00% 250.976us 83.659us 3 + aten::_convolution 1.21% 25.079us 5.12% 106.132us 35.377us 0.000us 0.00% 250.976us 83.659us 3 + aten::_conv_depthwise2d 1.14% 23.570us 3.11% 64.391us 21.464us 250.976us 63.87% 250.976us 83.659us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 250.976us 63.87% 250.976us 83.659us 3 + aten::to 0.32% 6.560us 84.78% 1.757ms 292.786us 0.000us 0.00% 181.824us 30.304us 6 + aten::_to_copy 1.18% 24.419us 84.47% 1.750ms 291.693us 0.000us 0.00% 181.824us 30.304us 6 + aten::copy_ 2.44% 50.653us 81.84% 1.696ms 282.633us 141.984us 36.13% 181.824us 30.304us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 102.336us 26.04% 102.336us 34.112us 3 + Activity Buffer Request 68.54% 1.420ms 68.54% 1.420ms 1.420ms 39.840us 10.14% 39.840us 39.840us 1 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 39.648us 10.09% 39.648us 13.216us 3 + aten::empty_strided 1.44% 29.940us 1.44% 29.940us 4.990us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 11.90% 246.595us 11.90% 246.595us 27.399us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.85% 17.582us 1.11% 22.961us 2.551us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.44% 9.098us 0.44% 9.098us 0.607us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.47% 9.750us 0.47% 9.750us 3.250us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.45% 9.420us 0.45% 9.420us 3.140us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.32% 6.712us 0.39% 8.162us 2.721us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.114ms -Self CUDA time total: 392.733us +Self CPU time total: 2.072ms +Self CUDA time total: 392.960us @@ -4334,29 +4334,29 @@ PROFILE TRACE: torch_eager | cuda_B2_D2048_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 6.65% 143.166us 97.03% 2.090ms 2.090ms 0.000us 0.00% 486.301us 486.301us 1 - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 477.853us 106.88% 477.853us 477.853us 1 - aten::conv1d 0.33% 7.110us 5.88% 126.575us 42.192us 0.000us 0.00% 298.557us 99.519us 3 - aten::convolution 0.51% 11.062us 5.55% 119.465us 39.822us 0.000us 0.00% 298.557us 99.519us 3 - aten::_convolution 1.16% 25.071us 5.03% 108.403us 36.134us 0.000us 0.00% 298.557us 99.519us 3 - aten::_conv_depthwise2d 1.05% 22.671us 3.05% 65.592us 21.864us 298.557us 66.78% 298.557us 99.519us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 298.557us 66.78% 298.557us 99.519us 3 - aten::to 0.33% 7.030us 83.12% 1.790ms 298.407us 0.000us 0.00% 187.744us 31.291us 6 - aten::_to_copy 1.22% 26.183us 82.80% 1.783ms 297.235us 0.000us 0.00% 187.744us 31.291us 6 - aten::copy_ 2.41% 51.979us 80.11% 1.726ms 287.603us 148.544us 33.22% 187.744us 31.291us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 108.768us 24.33% 108.768us 36.256us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 39.776us 8.90% 39.776us 13.259us 3 - Activity Buffer Request 66.10% 1.424ms 66.10% 1.424ms 1.424ms 39.200us 8.77% 39.200us 39.200us 1 - aten::empty_strided 1.47% 31.611us 1.47% 31.611us 5.268us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 12.61% 271.569us 12.61% 271.569us 30.174us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.93% 19.971us 1.21% 26.011us 2.890us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.45% 9.711us 0.45% 9.711us 0.647us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.47% 10.061us 0.47% 10.061us 3.354us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.51% 11.040us 0.51% 11.040us 3.680us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.28% 5.950us 0.34% 7.400us 2.467us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 5.97% 128.995us 95.86% 2.073ms 2.073ms 0.000us 0.00% 487.835us 487.835us 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 481.723us 107.54% 481.723us 481.723us 1 + aten::conv1d 0.29% 6.320us 5.75% 124.323us 41.441us 0.000us 0.00% 300.092us 100.031us 3 + aten::convolution 0.47% 10.180us 5.46% 118.003us 39.334us 0.000us 0.00% 300.092us 100.031us 3 + aten::_convolution 1.09% 23.583us 4.99% 107.823us 35.941us 0.000us 0.00% 300.092us 100.031us 3 + aten::_conv_depthwise2d 1.05% 22.771us 3.07% 66.451us 22.150us 300.092us 67.00% 300.092us 100.031us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 300.092us 67.00% 300.092us 100.031us 3 + aten::to 0.32% 6.900us 82.82% 1.791ms 298.496us 0.000us 0.00% 187.743us 31.290us 6 + aten::_to_copy 1.13% 24.450us 82.50% 1.784ms 297.346us 0.000us 0.00% 187.743us 31.290us 6 + aten::copy_ 2.37% 51.149us 79.94% 1.729ms 288.123us 147.839us 33.00% 187.743us 31.290us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 107.872us 24.08% 107.872us 35.957us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 39.967us 8.92% 39.967us 13.322us 3 + Activity Buffer Request 67.34% 1.456ms 67.34% 1.456ms 1.456ms 39.904us 8.91% 39.904us 39.904us 1 + aten::empty_strided 1.43% 30.891us 1.43% 30.891us 5.149us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 11.33% 245.015us 11.33% 245.015us 27.224us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.88% 18.930us 1.15% 24.910us 2.768us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.45% 9.739us 0.45% 9.739us 0.649us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.47% 10.080us 0.47% 10.080us 3.360us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.46% 9.870us 0.46% 9.870us 3.290us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.30% 6.460us 0.36% 7.889us 2.630us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.154ms -Self CUDA time total: 447.101us +Self CPU time total: 2.162ms +Self CUDA time total: 447.931us @@ -4366,29 +4366,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 355.165us 1897.25% 355.165us 355.165us 1 - torch_eager 15.24% 136.376us 99.32% 888.600us 888.600us 0.000us 0.00% 20.608us 20.608us 1 - aten::to 0.80% 7.121us 66.93% 598.831us 99.805us 0.000us 0.00% 13.376us 2.229us 6 - aten::_to_copy 2.95% 26.380us 66.13% 591.710us 98.618us 0.000us 0.00% 13.376us 2.229us 6 - aten::copy_ 5.90% 52.793us 59.34% 530.948us 88.491us 11.488us 61.37% 13.376us 2.229us 6 - aten::conv1d 0.68% 6.050us 13.88% 124.163us 41.388us 0.000us 0.00% 7.232us 2.411us 3 - aten::convolution 1.23% 10.987us 13.20% 118.113us 39.371us 0.000us 0.00% 7.232us 2.411us 3 - aten::_convolution 2.78% 24.854us 11.97% 107.126us 35.709us 0.000us 0.00% 7.232us 2.411us 3 - aten::_conv_depthwise2d 2.73% 24.470us 7.32% 65.481us 21.827us 7.232us 38.63% 7.232us 2.411us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.232us 38.63% 7.232us 2.411us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.920us 31.62% 5.920us 1.973us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.568us 29.74% 5.568us 1.856us 3 - Activity Buffer Request 26.68% 238.708us 26.68% 238.708us 238.708us 1.888us 10.09% 1.888us 1.888us 1 - aten::empty_strided 3.84% 34.382us 3.84% 34.382us 5.730us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 29.10% 260.398us 29.10% 260.398us 28.933us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.02% 18.071us 2.57% 22.961us 2.551us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.97% 8.709us 0.97% 8.709us 0.581us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.22% 10.910us 1.22% 10.910us 3.637us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.02% 9.150us 1.02% 9.150us 3.050us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.75% 6.751us 0.92% 8.220us 2.740us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 336.383us 1799.90% 336.383us 336.383us 1 + torch_eager 14.70% 125.222us 99.35% 846.539us 846.539us 0.000us 0.00% 20.577us 20.577us 1 + aten::to 0.73% 6.218us 67.41% 574.433us 95.739us 0.000us 0.00% 13.344us 2.224us 6 + aten::_to_copy 2.79% 23.792us 66.68% 568.215us 94.702us 0.000us 0.00% 13.344us 2.224us 6 + aten::copy_ 6.26% 53.310us 60.38% 514.471us 85.745us 11.456us 61.30% 13.344us 2.224us 6 + aten::conv1d 0.70% 5.960us 14.03% 119.583us 39.861us 0.000us 0.00% 7.233us 2.411us 3 + aten::convolution 1.15% 9.760us 13.33% 113.623us 37.874us 0.000us 0.00% 7.233us 2.411us 3 + aten::_convolution 2.80% 23.881us 12.19% 103.863us 34.621us 0.000us 0.00% 7.233us 2.411us 3 + aten::_conv_depthwise2d 2.66% 22.671us 7.54% 64.252us 21.417us 7.233us 38.70% 7.233us 2.411us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.233us 38.70% 7.233us 2.411us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.856us 31.33% 5.856us 1.952us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.600us 29.96% 5.600us 1.867us 3 + Activity Buffer Request 28.19% 240.205us 28.19% 240.205us 240.205us 1.888us 10.10% 1.888us 1.888us 1 + aten::empty_strided 3.52% 29.952us 3.52% 29.952us 4.992us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 28.71% 244.666us 28.71% 244.666us 27.185us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.06% 17.560us 2.66% 22.660us 2.518us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.01% 8.590us 1.01% 8.590us 0.573us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.10% 9.390us 1.10% 9.390us 3.130us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.00% 8.481us 1.00% 8.481us 2.827us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.71% 6.030us 0.87% 7.390us 2.463us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 894.710us -Self CUDA time total: 18.720us +Self CPU time total: 852.119us +Self CUDA time total: 18.689us @@ -4398,29 +4398,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 323.578us 1674.05% 323.578us 323.578us 1 - torch_eager 14.45% 120.436us 99.39% 828.559us 828.559us 0.000us 0.00% 21.217us 21.217us 1 - aten::to 0.75% 6.271us 67.77% 564.939us 94.156us 0.000us 0.00% 13.377us 2.230us 6 - aten::_to_copy 2.76% 22.992us 67.02% 558.668us 93.111us 0.000us 0.00% 13.377us 2.230us 6 - aten::copy_ 5.96% 49.722us 60.74% 506.327us 84.388us 11.489us 59.44% 13.377us 2.230us 6 - aten::conv1d 0.75% 6.211us 13.83% 115.254us 38.418us 0.000us 0.00% 7.840us 2.613us 3 - aten::convolution 1.19% 9.930us 13.08% 109.043us 36.348us 0.000us 0.00% 7.840us 2.613us 3 - aten::_convolution 2.77% 23.131us 11.89% 99.113us 33.038us 0.000us 0.00% 7.840us 2.613us 3 - aten::_conv_depthwise2d 2.53% 21.092us 7.21% 60.132us 20.044us 7.840us 40.56% 7.840us 2.613us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.840us 40.56% 7.840us 2.613us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.857us 30.30% 5.857us 1.952us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.632us 29.14% 5.632us 1.877us 3 - Activity Buffer Request 27.26% 227.207us 27.26% 227.207us 227.207us 1.888us 9.77% 1.888us 1.888us 1 - aten::empty_strided 3.52% 29.349us 3.52% 29.349us 4.891us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 29.92% 249.418us 29.92% 249.418us 27.713us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.13% 17.749us 2.80% 23.370us 2.597us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.11% 9.261us 1.11% 9.261us 0.617us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.16% 9.660us 1.16% 9.660us 3.220us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.12% 9.360us 1.12% 9.360us 3.120us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.70% 5.810us 0.88% 7.370us 2.457us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 337.210us 1736.14% 337.210us 337.210us 1 + torch_eager 14.23% 122.135us 99.33% 852.339us 852.339us 0.000us 0.00% 21.279us 21.279us 1 + aten::to 0.69% 5.930us 67.81% 581.931us 96.989us 0.000us 0.00% 13.375us 2.229us 6 + aten::_to_copy 2.77% 23.810us 67.12% 576.001us 96.000us 0.000us 0.00% 13.375us 2.229us 6 + aten::copy_ 5.94% 51.010us 60.80% 521.760us 86.960us 11.519us 59.31% 13.375us 2.229us 6 + aten::conv1d 0.66% 5.690us 14.16% 121.503us 40.501us 0.000us 0.00% 7.904us 2.635us 3 + aten::convolution 1.11% 9.501us 13.50% 115.813us 38.604us 0.000us 0.00% 7.904us 2.635us 3 + aten::_convolution 3.02% 25.902us 12.39% 106.312us 35.437us 0.000us 0.00% 7.904us 2.635us 3 + aten::_conv_depthwise2d 2.72% 23.320us 7.45% 63.940us 21.313us 7.904us 40.69% 7.904us 2.635us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 40.69% 7.904us 2.635us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 5.855us 30.14% 5.855us 1.952us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664us 29.16% 5.664us 1.888us 3 + Activity Buffer Request 28.95% 248.446us 28.95% 248.446us 248.446us 1.856us 9.56% 1.856us 1.856us 1 + aten::empty_strided 3.55% 30.431us 3.55% 30.431us 5.072us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 28.34% 243.224us 28.34% 243.224us 27.025us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 1.98% 16.978us 2.58% 22.140us 2.460us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.00% 8.573us 1.00% 8.573us 0.572us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.13% 9.660us 1.13% 9.660us 3.220us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.17% 10.040us 1.17% 10.040us 3.347us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.78% 6.699us 0.93% 7.990us 2.663us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 833.619us -Self CUDA time total: 19.329us +Self CPU time total: 858.129us +Self CUDA time total: 19.423us @@ -4430,29 +4430,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 326.394us 1677.60% 326.394us 326.394us 1 - torch_eager 14.78% 122.914us 99.34% 825.919us 825.919us 0.000us 0.00% 21.632us 21.632us 1 - aten::to 0.79% 6.552us 67.16% 558.381us 93.064us 0.000us 0.00% 14.368us 2.395us 6 - aten::_to_copy 2.94% 24.430us 66.37% 551.829us 91.971us 0.000us 0.00% 14.368us 2.395us 6 - aten::copy_ 5.83% 48.462us 59.95% 498.427us 83.071us 12.192us 62.66% 14.368us 2.395us 6 - aten::conv1d 0.71% 5.939us 14.00% 116.404us 38.801us 0.000us 0.00% 7.264us 2.421us 3 - aten::convolution 1.18% 9.811us 13.29% 110.465us 36.822us 0.000us 0.00% 7.264us 2.421us 3 - aten::_convolution 2.85% 23.732us 12.11% 100.654us 33.551us 0.000us 0.00% 7.264us 2.421us 3 - aten::_conv_depthwise2d 2.52% 20.910us 7.24% 60.232us 20.077us 7.264us 37.34% 7.264us 2.421us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.264us 37.34% 7.264us 2.421us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.304us 32.40% 6.304us 2.101us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.888us 30.26% 5.888us 1.963us 3 - Activity Buffer Request 26.68% 221.788us 26.68% 221.788us 221.788us 2.176us 11.18% 2.176us 2.176us 1 - aten::empty_strided 3.48% 28.972us 3.48% 28.972us 4.829us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 30.05% 249.819us 30.05% 249.819us 27.758us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.04% 16.929us 2.67% 22.200us 2.467us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.07% 8.901us 1.07% 8.901us 0.593us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.15% 9.570us 1.15% 9.570us 3.190us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.98% 8.110us 0.98% 8.110us 2.703us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.86% 7.190us 1.02% 8.500us 2.833us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 340.632us 1762.47% 340.632us 340.632us 1 + torch_eager 14.22% 122.814us 99.35% 857.879us 857.879us 0.000us 0.00% 21.503us 21.503us 1 + aten::to 0.71% 6.160us 68.06% 587.732us 97.955us 0.000us 0.00% 14.304us 2.384us 6 + aten::_to_copy 2.69% 23.228us 67.35% 581.572us 96.929us 0.000us 0.00% 14.304us 2.384us 6 + aten::copy_ 5.95% 51.401us 60.88% 525.681us 87.614us 12.128us 62.75% 14.304us 2.384us 6 + aten::conv1d 0.72% 6.190us 13.86% 119.652us 39.884us 0.000us 0.00% 7.199us 2.400us 3 + aten::convolution 1.11% 9.620us 13.14% 113.462us 37.821us 0.000us 0.00% 7.199us 2.400us 3 + aten::_convolution 2.71% 23.420us 12.03% 103.842us 34.614us 0.000us 0.00% 7.199us 2.400us 3 + aten::_conv_depthwise2d 2.67% 23.041us 7.39% 63.831us 21.277us 7.199us 37.25% 7.199us 2.400us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.199us 37.25% 7.199us 2.400us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.272us 32.45% 6.272us 2.091us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.856us 30.30% 5.856us 1.952us 3 + Activity Buffer Request 29.60% 255.626us 29.60% 255.626us 255.626us 2.176us 11.26% 2.176us 2.176us 1 + aten::empty_strided 3.78% 32.663us 3.78% 32.663us 5.444us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 27.93% 241.174us 27.93% 241.174us 26.797us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.07% 17.891us 2.69% 23.211us 2.579us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.04% 8.951us 1.04% 8.951us 0.597us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.14% 9.880us 1.14% 9.880us 3.293us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.97% 8.390us 0.97% 8.390us 2.797us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.77% 6.630us 0.93% 8.000us 2.667us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 831.399us -Self CUDA time total: 19.456us +Self CPU time total: 863.509us +Self CUDA time total: 19.327us @@ -4462,29 +4462,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 356.696us 1774.96% 356.696us 356.696us 1 - torch_eager 13.86% 123.804us 99.36% 887.440us 887.440us 0.000us 0.00% 22.272us 22.272us 1 - aten::to 0.71% 6.320us 66.62% 595.061us 99.177us 0.000us 0.00% 14.368us 2.395us 6 - aten::_to_copy 2.82% 25.151us 65.92% 588.741us 98.124us 0.000us 0.00% 14.368us 2.395us 6 - aten::copy_ 5.73% 51.172us 59.67% 532.958us 88.826us 12.192us 60.67% 14.368us 2.395us 6 - aten::conv1d 0.70% 6.210us 15.70% 140.195us 46.732us 0.000us 0.00% 7.904us 2.635us 3 - aten::convolution 1.11% 9.881us 15.00% 133.985us 44.662us 0.000us 0.00% 7.904us 2.635us 3 - aten::_convolution 2.74% 24.510us 13.89% 124.104us 41.368us 0.000us 0.00% 7.904us 2.635us 3 - aten::_conv_depthwise2d 2.70% 24.090us 9.26% 82.742us 27.581us 7.904us 39.33% 7.904us 2.635us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 7.904us 39.33% 7.904us 2.635us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.240us 31.05% 6.240us 2.080us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.952us 29.62% 5.952us 1.984us 3 - Activity Buffer Request 28.94% 258.459us 28.94% 258.459us 258.459us 2.176us 10.83% 2.176us 2.176us 1 - aten::empty_strided 3.43% 30.632us 3.43% 30.632us 5.105us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 29.46% 263.129us 29.46% 263.129us 29.237us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 1.97% 17.620us 2.61% 23.310us 2.590us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.07% 9.580us 1.07% 9.580us 0.639us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.09% 9.720us 1.09% 9.720us 3.240us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.02% 9.130us 1.02% 9.130us 3.043us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.75% 6.702us 0.94% 8.422us 2.807us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 334.363us 1661.27% 334.363us 334.363us 1 + torch_eager 14.65% 121.213us 99.39% 822.628us 822.628us 0.000us 0.00% 22.271us 22.271us 1 + aten::to 0.73% 6.022us 66.87% 553.441us 92.240us 0.000us 0.00% 14.239us 2.373us 6 + aten::_to_copy 2.76% 22.839us 66.14% 547.419us 91.237us 0.000us 0.00% 14.239us 2.373us 6 + aten::copy_ 6.10% 50.480us 59.81% 495.040us 82.507us 12.095us 60.09% 14.239us 2.373us 6 + aten::conv1d 0.71% 5.911us 14.57% 120.603us 40.201us 0.000us 0.00% 8.032us 2.677us 3 + aten::convolution 1.15% 9.530us 13.86% 114.692us 38.231us 0.000us 0.00% 8.032us 2.677us 3 + aten::_convolution 2.90% 23.998us 12.71% 105.162us 35.054us 0.000us 0.00% 8.032us 2.677us 3 + aten::_conv_depthwise2d 2.69% 22.281us 7.85% 64.952us 21.651us 8.032us 39.91% 8.032us 2.677us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 8.032us 39.91% 8.032us 2.677us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.207us 30.84% 6.207us 2.069us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.888us 29.25% 5.888us 1.963us 3 + Activity Buffer Request 27.45% 227.155us 27.45% 227.155us 227.155us 2.144us 10.65% 2.144us 2.144us 1 + aten::empty_strided 3.57% 29.540us 3.57% 29.540us 4.923us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 29.06% 240.556us 29.06% 240.556us 26.728us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.13% 17.627us 2.77% 22.910us 2.546us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.09% 9.014us 1.09% 9.014us 0.601us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.17% 9.710us 1.17% 9.710us 3.237us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.19% 9.810us 1.19% 9.810us 3.270us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.72% 5.931us 0.90% 7.422us 2.474us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 893.171us -Self CUDA time total: 20.096us +Self CPU time total: 827.669us +Self CUDA time total: 20.127us @@ -4494,29 +4494,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 332.730us 926.72% 332.730us 332.730us 1 - torch_eager 14.27% 126.064us 99.42% 878.341us 878.341us 0.000us 0.00% 38.496us 38.496us 1 - aten::conv1d 0.64% 5.671us 13.39% 118.255us 39.418us 0.000us 0.00% 20.096us 6.699us 3 - aten::convolution 1.11% 9.840us 12.74% 112.584us 37.528us 0.000us 0.00% 20.096us 6.699us 3 - aten::_convolution 2.79% 24.681us 11.63% 102.744us 34.248us 0.000us 0.00% 20.096us 6.699us 3 - aten::_conv_depthwise2d 2.42% 21.390us 7.02% 62.061us 20.687us 20.096us 55.97% 20.096us 6.699us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 20.096us 55.97% 20.096us 6.699us 3 - aten::to 0.72% 6.320us 68.61% 606.182us 101.030us 0.000us 0.00% 18.400us 3.067us 6 - aten::_to_copy 2.82% 24.900us 67.90% 599.862us 99.977us 0.000us 0.00% 18.400us 3.067us 6 - aten::copy_ 5.62% 49.645us 61.77% 545.702us 90.950us 15.808us 44.03% 18.400us 3.067us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.448us 23.53% 8.448us 2.816us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.360us 20.50% 7.360us 2.453us 3 - Activity Buffer Request 29.42% 259.919us 29.42% 259.919us 259.919us 2.592us 7.22% 2.592us 2.592us 1 - aten::empty_strided 3.31% 29.260us 3.31% 29.260us 4.877us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 29.15% 257.559us 29.15% 257.559us 28.618us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.02% 17.842us 2.68% 23.662us 2.629us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.05% 9.271us 1.05% 9.271us 0.618us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.19% 10.540us 1.19% 10.540us 3.513us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.99% 8.710us 0.99% 8.710us 2.903us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.65% 5.719us 0.80% 7.050us 2.350us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 335.875us 933.82% 335.875us 335.875us 1 + torch_eager 14.86% 122.213us 99.37% 816.978us 816.978us 0.000us 0.00% 38.560us 38.560us 1 + aten::conv1d 0.73% 6.020us 14.48% 119.072us 39.691us 0.000us 0.00% 20.064us 6.688us 3 + aten::convolution 1.17% 9.589us 13.75% 113.052us 37.684us 0.000us 0.00% 20.064us 6.688us 3 + aten::_convolution 2.85% 23.419us 12.58% 103.463us 34.488us 0.000us 0.00% 20.064us 6.688us 3 + aten::_conv_depthwise2d 2.73% 22.441us 7.81% 64.191us 21.397us 20.064us 55.78% 20.064us 6.688us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 20.064us 55.78% 20.064us 6.688us 3 + aten::to 0.74% 6.089us 66.65% 547.971us 91.328us 0.000us 0.00% 18.496us 3.083us 6 + aten::_to_copy 2.81% 23.090us 65.91% 541.882us 90.314us 0.000us 0.00% 18.496us 3.083us 6 + aten::copy_ 6.02% 49.484us 59.30% 487.602us 81.267us 15.904us 44.22% 18.496us 3.083us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.576us 23.84% 8.576us 2.859us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 20.37% 7.328us 2.443us 3 + Activity Buffer Request 27.35% 224.865us 27.35% 224.865us 224.865us 2.592us 7.21% 2.592us 2.592us 1 + aten::empty_strided 3.79% 31.190us 3.79% 31.190us 5.198us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 28.67% 235.693us 28.67% 235.693us 26.188us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.17% 17.820us 2.82% 23.212us 2.579us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.10% 9.012us 1.10% 9.012us 0.601us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.17% 9.620us 1.17% 9.620us 3.207us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.18% 9.690us 1.18% 9.690us 3.230us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.68% 5.572us 0.85% 6.952us 2.317us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 883.481us -Self CUDA time total: 35.904us +Self CPU time total: 822.198us +Self CUDA time total: 35.968us @@ -4526,29 +4526,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D64_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 337.888us 888.80% 337.888us 337.888us 1 - torch_eager 6.31% 128.615us 99.74% 2.033ms 2.033ms 0.000us 0.00% 40.576us 40.576us 1 - aten::conv1d 0.31% 6.349us 5.98% 121.885us 40.628us 0.000us 0.00% 22.304us 7.435us 3 - aten::convolution 0.53% 10.852us 5.67% 115.536us 38.512us 0.000us 0.00% 22.304us 7.435us 3 - aten::_convolution 1.24% 25.291us 5.14% 104.684us 34.895us 0.000us 0.00% 22.304us 7.435us 3 - aten::_conv_depthwise2d 1.08% 22.031us 3.01% 61.431us 20.477us 22.304us 58.67% 22.304us 7.435us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.304us 58.67% 22.304us 7.435us 3 - aten::to 0.34% 6.829us 86.09% 1.755ms 292.477us 0.000us 0.00% 18.272us 3.045us 6 - aten::_to_copy 1.20% 24.424us 85.75% 1.748ms 291.339us 0.000us 0.00% 18.272us 3.045us 6 - aten::copy_ 2.48% 50.501us 83.10% 1.694ms 282.331us 15.712us 41.33% 18.272us 3.045us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.320us 21.89% 8.320us 2.773us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.392us 19.44% 7.392us 2.464us 3 - Activity Buffer Request 69.75% 1.422ms 69.75% 1.422ms 1.422ms 2.560us 6.73% 2.560us 2.560us 1 - aten::empty_strided 1.45% 29.621us 1.45% 29.621us 4.937us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 11.90% 242.506us 11.90% 242.506us 26.945us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.92% 18.701us 1.17% 23.851us 2.650us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.43% 8.710us 0.43% 8.710us 0.581us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.48% 9.800us 0.48% 9.800us 3.267us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.43% 8.710us 0.43% 8.710us 2.903us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.35% 7.191us 0.42% 8.621us 2.874us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 370.623us 978.21% 370.623us 370.623us 1 + torch_eager 6.18% 128.993us 99.74% 2.082ms 2.082ms 0.000us 0.00% 40.448us 40.448us 1 + aten::conv1d 0.30% 6.311us 5.92% 123.493us 41.164us 0.000us 0.00% 22.177us 7.392us 3 + aten::convolution 0.50% 10.340us 5.61% 117.182us 39.061us 0.000us 0.00% 22.177us 7.392us 3 + aten::_convolution 1.15% 24.110us 5.12% 106.842us 35.614us 0.000us 0.00% 22.177us 7.392us 3 + aten::_conv_depthwise2d 1.14% 23.742us 3.15% 65.742us 21.914us 22.177us 58.53% 22.177us 7.392us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 22.177us 58.53% 22.177us 7.392us 3 + aten::to 1.13% 23.681us 86.31% 1.802ms 300.273us 0.000us 0.00% 18.271us 3.045us 6 + aten::_to_copy 1.20% 24.951us 85.17% 1.778ms 296.326us 0.000us 0.00% 18.271us 3.045us 6 + aten::copy_ 2.41% 50.250us 82.40% 1.720ms 286.684us 15.711us 41.47% 18.271us 3.045us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 8.321us 21.96% 8.321us 2.774us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.390us 19.50% 7.390us 2.463us 3 + Activity Buffer Request 69.33% 1.447ms 69.33% 1.447ms 1.447ms 2.560us 6.76% 2.560us 2.560us 1 + aten::empty_strided 1.58% 32.901us 1.58% 32.901us 5.484us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 11.73% 244.945us 11.73% 244.945us 27.216us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.87% 18.191us 1.14% 23.770us 2.641us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.44% 9.210us 0.44% 9.210us 0.614us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.48% 9.930us 0.48% 9.930us 3.310us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.46% 9.680us 0.46% 9.680us 3.227us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.32% 6.640us 0.38% 7.960us 2.653us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 2.038ms -Self CUDA time total: 38.016us +Self CPU time total: 2.088ms +Self CUDA time total: 37.888us @@ -4558,29 +4558,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 362.972us 567.16% 362.972us 362.972us 1 - torch_eager 14.84% 128.544us 99.34% 860.680us 860.680us 0.000us 0.00% 68.061us 68.061us 1 - aten::conv1d 0.70% 6.079us 16.52% 143.165us 47.722us 0.000us 0.00% 41.728us 13.909us 3 - aten::convolution 3.42% 29.613us 15.82% 137.086us 45.695us 0.000us 0.00% 41.728us 13.909us 3 - aten::_convolution 2.86% 24.759us 12.40% 107.473us 35.824us 0.000us 0.00% 41.728us 13.909us 3 - aten::_conv_depthwise2d 2.59% 22.439us 7.67% 66.492us 22.164us 41.728us 65.20% 41.728us 13.909us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 41.728us 65.20% 41.728us 13.909us 3 - aten::to 0.77% 6.631us 64.71% 560.621us 93.437us 0.000us 0.00% 26.333us 4.389us 6 - aten::_to_copy 2.80% 24.253us 63.94% 553.990us 92.332us 0.000us 0.00% 26.333us 4.389us 6 - aten::copy_ 5.80% 50.240us 57.50% 498.196us 83.033us 22.270us 34.80% 26.333us 4.389us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.903us 18.60% 11.903us 3.968us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.367us 16.20% 10.367us 3.456us 3 - Activity Buffer Request 26.05% 225.728us 26.05% 225.728us 225.728us 4.063us 6.35% 4.063us 4.063us 1 - aten::empty_strided 3.64% 31.541us 3.64% 31.541us 5.257us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 28.31% 245.279us 28.31% 245.279us 27.253us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.11% 18.263us 2.74% 23.752us 2.639us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.06% 9.199us 1.06% 9.199us 0.613us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.26% 10.941us 1.26% 10.941us 3.647us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.16% 10.061us 1.16% 10.061us 3.354us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.66% 5.740us 0.85% 7.330us 2.443us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 339.324us 532.86% 339.324us 339.324us 1 + torch_eager 14.96% 124.364us 99.38% 826.288us 826.288us 0.000us 0.00% 67.776us 67.776us 1 + aten::conv1d 0.74% 6.121us 14.48% 120.383us 40.128us 0.000us 0.00% 41.409us 13.803us 3 + aten::convolution 1.31% 10.850us 13.74% 114.262us 38.087us 0.000us 0.00% 41.409us 13.803us 3 + aten::_convolution 2.80% 23.271us 12.44% 103.412us 34.471us 0.000us 0.00% 41.409us 13.803us 3 + aten::_conv_depthwise2d 2.86% 23.801us 7.77% 64.610us 21.537us 41.409us 65.03% 41.409us 13.803us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 41.409us 65.03% 41.409us 13.803us 3 + aten::to 0.72% 5.961us 66.69% 554.441us 92.407us 0.000us 0.00% 26.367us 4.395us 6 + aten::_to_copy 2.84% 23.608us 65.97% 548.480us 91.413us 0.000us 0.00% 26.367us 4.395us 6 + aten::copy_ 6.26% 52.010us 59.41% 493.909us 82.318us 22.271us 34.97% 26.367us 4.395us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.935us 18.74% 11.935us 3.978us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.336us 16.23% 10.336us 3.445us 3 + Activity Buffer Request 27.47% 228.425us 27.47% 228.425us 228.425us 4.096us 6.43% 4.096us 4.096us 1 + aten::empty_strided 3.72% 30.963us 3.72% 30.963us 5.160us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 28.31% 235.354us 28.31% 235.354us 26.150us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.06% 17.130us 2.64% 21.981us 2.442us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.00% 8.352us 1.00% 8.352us 0.557us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.18% 9.829us 1.18% 9.829us 3.276us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.09% 9.100us 1.09% 9.100us 3.033us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.71% 5.910us 0.88% 7.280us 2.427us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 866.380us -Self CUDA time total: 63.998us +Self CPU time total: 831.408us +Self CUDA time total: 63.680us @@ -4590,29 +4590,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S128_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 357.311us 512.91% 357.311us 357.311us 1 - torch_eager 20.96% 191.619us 99.38% 908.662us 908.662us 0.000us 0.00% 73.696us 73.696us 1 - aten::conv1d 0.63% 5.760us 15.23% 139.294us 46.431us 0.000us 0.00% 47.296us 15.765us 3 - aten::convolution 2.87% 26.271us 14.60% 133.534us 44.511us 0.000us 0.00% 47.296us 15.765us 3 - aten::_convolution 2.77% 25.360us 11.73% 107.263us 35.754us 0.000us 0.00% 47.296us 15.765us 3 - aten::_conv_depthwise2d 2.38% 21.722us 7.17% 65.523us 21.841us 47.296us 67.89% 47.296us 15.765us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 47.296us 67.89% 47.296us 15.765us 3 - aten::to 0.73% 6.650us 60.08% 549.318us 91.553us 0.000us 0.00% 26.400us 4.400us 6 - aten::_to_copy 2.63% 24.032us 59.35% 542.668us 90.445us 0.000us 0.00% 26.400us 4.400us 6 - aten::copy_ 5.57% 50.922us 53.46% 488.786us 81.464us 22.368us 32.11% 26.400us 4.400us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.872us 17.04% 11.872us 3.957us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.496us 15.07% 10.496us 3.499us 3 - Activity Buffer Request 23.91% 218.617us 23.91% 218.617us 218.617us 4.032us 5.79% 4.032us 4.032us 1 - aten::empty_strided 3.26% 29.850us 3.26% 29.850us 4.975us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 26.57% 242.937us 26.57% 242.937us 26.993us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.04% 18.652us 2.65% 24.251us 2.695us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.01% 9.230us 1.01% 9.230us 0.615us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.08% 9.870us 1.08% 9.870us 3.290us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.12% 10.241us 1.12% 10.241us 3.414us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.63% 5.780us 0.80% 7.270us 2.423us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 341.884us 492.57% 341.884us 341.884us 1 + torch_eager 14.66% 124.263us 99.38% 842.608us 842.608us 0.000us 0.00% 73.472us 73.472us 1 + aten::conv1d 0.69% 5.810us 14.06% 119.183us 39.728us 0.000us 0.00% 47.072us 15.691us 3 + aten::convolution 1.10% 9.331us 13.37% 113.373us 37.791us 0.000us 0.00% 47.072us 15.691us 3 + aten::_convolution 2.98% 25.231us 12.27% 104.042us 34.681us 0.000us 0.00% 47.072us 15.691us 3 + aten::_conv_depthwise2d 2.57% 21.770us 7.47% 63.341us 21.114us 47.072us 67.82% 47.072us 15.691us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 47.072us 67.82% 47.072us 15.691us 3 + aten::to 0.71% 6.042us 67.35% 571.062us 95.177us 0.000us 0.00% 26.400us 4.400us 6 + aten::_to_copy 2.91% 24.658us 66.64% 565.020us 94.170us 0.000us 0.00% 26.400us 4.400us 6 + aten::copy_ 5.98% 50.742us 60.23% 510.651us 85.108us 22.336us 32.18% 26.400us 4.400us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.967us 17.24% 11.967us 3.989us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.369us 14.94% 10.369us 3.456us 3 + Activity Buffer Request 28.69% 243.255us 28.69% 243.255us 243.255us 4.064us 5.86% 4.064us 4.064us 1 + aten::empty_strided 3.50% 29.711us 3.50% 29.711us 4.952us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 28.24% 239.475us 28.24% 239.475us 26.608us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.11% 17.861us 2.71% 22.969us 2.552us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.01% 8.598us 1.01% 8.598us 0.573us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.13% 9.580us 1.13% 9.580us 3.193us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.08% 9.170us 1.08% 9.170us 3.057us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.70% 5.911us 0.85% 7.210us 2.403us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 914.323us -Self CUDA time total: 69.664us +Self CPU time total: 847.859us +Self CUDA time total: 69.408us @@ -4622,29 +4622,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 348.092us 187.26% 348.092us 348.092us 1 - torch_eager 14.76% 124.374us 99.29% 836.558us 836.558us 0.000us 0.00% 195.870us 195.870us 1 - aten::conv1d 0.70% 5.900us 14.42% 121.504us 40.501us 0.000us 0.00% 133.406us 44.469us 3 - aten::convolution 1.14% 9.610us 13.72% 115.604us 38.535us 0.000us 0.00% 133.406us 44.469us 3 - aten::_convolution 2.88% 24.263us 12.58% 105.994us 35.331us 0.000us 0.00% 133.406us 44.469us 3 - aten::_conv_depthwise2d 2.73% 23.010us 7.80% 65.750us 21.917us 133.406us 71.77% 133.406us 44.469us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 133.406us 71.77% 133.406us 44.469us 3 - aten::to 0.74% 6.220us 66.83% 563.060us 93.843us 0.000us 0.00% 62.464us 10.411us 6 - aten::_to_copy 2.83% 23.861us 66.09% 556.840us 92.807us 0.000us 0.00% 62.464us 10.411us 6 - aten::copy_ 6.03% 50.810us 59.73% 503.287us 83.881us 52.480us 28.23% 62.464us 10.411us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.600us 15.92% 29.600us 9.867us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.880us 12.31% 22.880us 7.627us 3 - Activity Buffer Request 25.69% 216.468us 25.69% 216.468us 216.468us 9.984us 5.37% 9.984us 9.984us 1 - aten::empty_strided 3.52% 29.692us 3.52% 29.692us 4.949us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 30.59% 257.739us 30.59% 257.739us 28.638us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.08% 17.540us 2.73% 23.000us 2.556us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.12% 9.412us 1.12% 9.412us 0.627us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.20% 10.110us 1.20% 10.110us 3.370us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.29% 10.900us 1.29% 10.900us 3.633us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.68% 5.719us 0.88% 7.451us 2.484us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 350.870us 189.71% 350.870us 350.870us 1 + torch_eager 14.64% 124.941us 99.33% 847.778us 847.778us 0.000us 0.00% 194.907us 194.907us 1 + aten::conv1d 0.69% 5.931us 14.37% 122.643us 40.881us 0.000us 0.00% 132.732us 44.244us 3 + aten::convolution 1.16% 9.910us 13.67% 116.712us 38.904us 0.000us 0.00% 132.732us 44.244us 3 + aten::_convolution 2.94% 25.098us 12.51% 106.802us 35.601us 0.000us 0.00% 132.732us 44.244us 3 + aten::_conv_depthwise2d 2.63% 22.470us 7.66% 65.342us 21.781us 132.732us 71.76% 132.732us 44.244us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 132.732us 71.76% 132.732us 44.244us 3 + aten::to 0.71% 6.042us 67.13% 572.943us 95.490us 0.000us 0.00% 62.175us 10.362us 6 + aten::_to_copy 2.75% 23.470us 66.42% 566.901us 94.484us 0.000us 0.00% 62.175us 10.362us 6 + aten::copy_ 6.00% 51.182us 60.05% 512.571us 85.428us 52.223us 28.24% 62.175us 10.362us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.343us 15.86% 29.343us 9.781us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.880us 12.37% 22.880us 7.627us 3 + Activity Buffer Request 29.33% 250.295us 29.33% 250.295us 250.295us 9.952us 5.38% 9.952us 9.952us 1 + aten::empty_strided 3.62% 30.860us 3.62% 30.860us 5.143us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 27.39% 233.736us 27.39% 233.736us 25.971us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.08% 17.752us 2.70% 23.071us 2.563us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.06% 9.008us 1.06% 9.008us 0.601us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.27% 10.820us 1.27% 10.820us 3.607us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.10% 9.410us 1.10% 9.410us 3.137us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.72% 6.112us 0.88% 7.511us 2.504us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 842.539us -Self CUDA time total: 185.886us +Self CPU time total: 853.509us +Self CUDA time total: 184.955us @@ -4654,29 +4654,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S512_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 348.403us 166.18% 348.403us 348.403us 1 - torch_eager 14.60% 122.924us 99.33% 836.209us 836.209us 0.000us 0.00% 223.383us 223.383us 1 - aten::conv1d 0.69% 5.779us 14.01% 117.955us 39.318us 0.000us 0.00% 153.883us 51.294us 3 - aten::convolution 1.25% 10.491us 13.32% 112.176us 37.392us 0.000us 0.00% 153.883us 51.294us 3 - aten::_convolution 2.91% 24.484us 12.08% 101.685us 33.895us 0.000us 0.00% 153.883us 51.294us 3 - aten::_conv_depthwise2d 2.49% 20.928us 7.14% 60.070us 20.023us 153.883us 73.40% 153.883us 51.294us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 153.883us 73.40% 153.883us 51.294us 3 - aten::to 0.73% 6.179us 67.37% 567.200us 94.533us 0.000us 0.00% 69.500us 11.583us 6 - aten::_to_copy 2.75% 23.132us 66.64% 561.021us 93.504us 0.000us 0.00% 69.500us 11.583us 6 - aten::copy_ 5.91% 49.740us 60.39% 508.377us 84.729us 55.773us 26.60% 69.500us 11.583us 6 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 32.927us 15.71% 32.927us 10.976us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.846us 10.90% 22.846us 7.615us 3 - Activity Buffer Request 29.09% 244.869us 29.09% 244.869us 244.869us 13.727us 6.55% 13.727us 13.727us 1 - aten::empty_strided 3.51% 29.512us 3.51% 29.512us 4.919us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 27.84% 234.420us 27.84% 234.420us 26.047us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 2.13% 17.973us 2.77% 23.320us 2.591us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 1.09% 9.167us 1.09% 9.167us 0.611us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 1.12% 9.440us 1.12% 9.440us 3.147us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 1.07% 9.050us 1.07% 9.050us 3.017us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.85% 7.121us 1.02% 8.601us 2.867us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 355.227us 169.45% 355.227us 355.227us 1 + torch_eager 15.18% 123.692us 99.29% 808.918us 808.918us 0.000us 0.00% 223.518us 223.518us 1 + aten::conv1d 0.72% 5.860us 14.71% 119.853us 39.951us 0.000us 0.00% 153.470us 51.157us 3 + aten::convolution 1.17% 9.541us 13.99% 113.993us 37.998us 0.000us 0.00% 153.470us 51.157us 3 + aten::_convolution 3.03% 24.710us 12.82% 104.452us 34.817us 0.000us 0.00% 153.470us 51.157us 3 + aten::_conv_depthwise2d 2.76% 22.461us 7.85% 63.951us 21.317us 153.470us 73.21% 153.470us 51.157us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 153.470us 73.21% 153.470us 51.157us 3 + aten::to 0.75% 6.140us 65.95% 537.281us 89.547us 0.000us 0.00% 70.048us 11.675us 6 + aten::_to_copy 2.84% 23.150us 65.19% 531.141us 88.524us 0.000us 0.00% 70.048us 11.675us 6 + aten::copy_ 6.47% 52.731us 58.48% 476.471us 79.412us 56.160us 26.79% 70.048us 11.675us 6 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 33.184us 15.83% 33.184us 11.061us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.976us 10.96% 22.976us 7.659us 3 + Activity Buffer Request 26.55% 216.325us 26.55% 216.325us 216.325us 13.888us 6.63% 13.888us 13.888us 1 + aten::empty_strided 3.87% 31.520us 3.87% 31.520us 5.253us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 28.13% 229.215us 28.13% 229.215us 25.468us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 2.20% 17.931us 2.85% 23.181us 2.576us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 1.08% 8.800us 1.08% 8.800us 0.587us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 1.20% 9.790us 1.20% 9.790us 3.263us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 1.22% 9.900us 1.22% 9.900us 3.300us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.73% 5.980us 0.90% 7.360us 2.453us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 841.880us -Self CUDA time total: 209.656us +Self CPU time total: 814.738us +Self CUDA time total: 209.630us @@ -4686,29 +4686,29 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W2 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 7.22% 135.785us 57.39% 1.079ms 1.079ms 0.000us 0.00% 1.518ms 1.518ms 1 - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.419ms 100.41% 1.419ms 1.419ms 1 - aten::to 0.37% 6.901us 40.86% 768.526us 128.088us 0.000us 0.00% 823.221us 137.204us 6 - aten::_to_copy 1.63% 30.742us 40.49% 761.625us 126.938us 0.000us 0.00% 823.221us 137.204us 6 - aten::copy_ 2.94% 55.302us 27.81% 523.157us 87.193us 717.942us 50.81% 823.221us 137.204us 6 - aten::conv1d 0.33% 6.280us 6.71% 126.144us 42.048us 0.000us 0.00% 695.094us 231.698us 3 - aten::convolution 0.57% 10.750us 6.37% 119.864us 39.955us 0.000us 0.00% 695.094us 231.698us 3 - aten::_convolution 1.35% 25.400us 5.80% 109.114us 36.371us 0.000us 0.00% 695.094us 231.698us 3 - aten::_conv_depthwise2d 1.19% 22.332us 3.55% 66.763us 22.254us 695.094us 49.19% 695.094us 231.698us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 695.094us 49.19% 695.094us 231.698us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 411.706us 29.14% 411.706us 137.235us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 306.236us 21.67% 306.236us 102.079us 3 - Activity Buffer Request 12.99% 244.238us 12.99% 244.238us 244.238us 105.279us 7.45% 105.279us 105.279us 1 - aten::empty_strided 2.17% 40.811us 11.04% 207.726us 34.621us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 13.13% 246.997us 13.13% 246.997us 27.444us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 1.97% 37.133us 2.36% 44.413us 4.935us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.58% 10.889us 0.58% 10.889us 0.726us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.53% 10.051us 0.53% 10.051us 3.350us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.58% 11.000us 0.58% 11.000us 3.667us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.34% 6.350us 0.41% 7.700us 2.567us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 6.92% 128.362us 54.16% 1.005ms 1.005ms 0.000us 0.00% 1.522ms 1.522ms 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.421ms 100.42% 1.421ms 1.421ms 1 + aten::to 0.36% 6.621us 38.84% 720.727us 120.121us 0.000us 0.00% 826.557us 137.760us 6 + aten::_to_copy 1.58% 29.231us 38.49% 714.106us 119.018us 0.000us 0.00% 826.557us 137.760us 6 + aten::copy_ 2.91% 54.020us 26.66% 494.611us 82.435us 719.869us 50.86% 826.557us 137.760us 6 + aten::conv1d 0.33% 6.200us 6.83% 126.803us 42.268us 0.000us 0.00% 695.450us 231.817us 3 + aten::convolution 0.54% 10.000us 6.50% 120.603us 40.201us 0.000us 0.00% 695.450us 231.817us 3 + aten::_convolution 1.37% 25.370us 5.96% 110.603us 36.868us 0.000us 0.00% 695.450us 231.817us 3 + aten::_conv_depthwise2d 1.27% 23.500us 3.66% 67.942us 22.647us 695.450us 49.14% 695.450us 231.817us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 695.450us 49.14% 695.450us 231.817us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 408.829us 28.89% 408.829us 136.276us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 311.040us 21.98% 311.040us 103.680us 3 + Activity Buffer Request 12.21% 226.485us 12.21% 226.485us 226.485us 106.688us 7.54% 106.688us 106.688us 1 + aten::empty_strided 2.00% 37.161us 10.25% 190.264us 31.711us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 12.87% 238.737us 12.87% 238.737us 26.526us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.97% 18.050us 1.30% 24.121us 2.680us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.54% 9.951us 0.54% 9.951us 0.663us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.54% 10.110us 0.54% 10.110us 3.370us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.52% 9.701us 0.52% 9.701us 3.234us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.37% 6.860us 0.45% 8.350us 2.783us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 1.881ms -Self CUDA time total: 1.413ms +Self CPU time total: 1.855ms +Self CUDA time total: 1.415ms @@ -4718,61 +4718,61 @@ PROFILE TRACE: torch_eager | cuda_B4_D2048_S2048_W4 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ - torch_eager 4.25% 132.984us 66.63% 2.083ms 2.083ms 0.000us 0.00% 1.503ms 1.503ms 1 - torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.434ms 100.41% 1.434ms 1.434ms 1 - aten::to 0.21% 6.470us 57.53% 1.798ms 299.656us 0.000us 0.00% 765.147us 127.524us 6 - aten::_to_copy 0.80% 25.009us 57.32% 1.791ms 298.577us 0.000us 0.00% 765.147us 127.524us 6 - aten::copy_ 1.51% 47.155us 55.55% 1.736ms 289.360us 690.492us 48.35% 765.147us 127.524us 6 - aten::conv1d 0.20% 6.231us 3.91% 122.325us 40.775us 0.000us 0.00% 737.724us 245.908us 3 - aten::convolution 0.32% 9.920us 3.71% 116.094us 38.698us 0.000us 0.00% 737.724us 245.908us 3 - aten::_convolution 0.82% 25.623us 3.40% 106.174us 35.391us 0.000us 0.00% 737.724us 245.908us 3 - aten::_conv_depthwise2d 0.70% 21.899us 1.98% 62.011us 20.670us 737.724us 51.65% 737.724us 245.908us 3 -void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 737.724us 51.65% 737.724us 245.908us 3 -void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 398.046us 27.87% 398.046us 132.682us 3 -void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 292.446us 20.48% 292.446us 97.482us 3 - Activity Buffer Request 47.19% 1.475ms 47.19% 1.475ms 1.475ms 74.655us 5.23% 74.655us 74.655us 1 - aten::empty_strided 0.97% 30.293us 0.97% 30.293us 5.049us 0.000us 0.00% 0.000us 0.000us 6 - cudaLaunchKernel 7.52% 235.026us 7.52% 235.026us 26.114us 0.000us 0.00% 0.000us 0.000us 9 - aten::unsqueeze 0.60% 18.740us 0.79% 24.820us 2.758us 0.000us 0.00% 0.000us 0.000us 9 - aten::as_strided 0.32% 10.019us 0.32% 10.019us 0.668us 0.000us 0.00% 0.000us 0.000us 15 - aten::empty 0.32% 9.882us 0.32% 9.882us 3.294us 0.000us 0.00% 0.000us 0.000us 3 - aten::resize_ 0.29% 9.220us 0.29% 9.220us 3.073us 0.000us 0.00% 0.000us 0.000us 3 - aten::squeeze 0.24% 7.471us 0.29% 9.160us 3.053us 0.000us 0.00% 0.000us 0.000us 3 + torch_eager 4.16% 128.483us 66.51% 2.056ms 2.056ms 0.000us 0.00% 1.499ms 1.499ms 1 + torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 1.430ms 100.42% 1.430ms 1.430ms 1 + aten::to 0.21% 6.492us 57.40% 1.775ms 295.822us 0.000us 0.00% 760.863us 126.811us 6 + aten::_to_copy 0.82% 25.449us 57.19% 1.768ms 294.739us 0.000us 0.00% 760.863us 126.811us 6 + aten::copy_ 1.66% 51.471us 55.36% 1.712ms 285.278us 686.079us 48.17% 760.863us 126.811us 6 + aten::conv1d 0.22% 6.820us 4.02% 124.423us 41.474us 0.000us 0.00% 738.336us 246.112us 3 + aten::convolution 0.33% 10.111us 3.80% 117.603us 39.201us 0.000us 0.00% 738.336us 246.112us 3 + aten::_convolution 0.82% 25.320us 3.48% 107.492us 35.831us 0.000us 0.00% 738.336us 246.112us 3 + aten::_conv_depthwise2d 0.75% 23.320us 2.10% 65.022us 21.674us 738.336us 51.83% 738.336us 246.112us 3 +void at::native::(anonymous namespace)::conv_depthwi... 0.00% 0.000us 0.00% 0.000us 0.000us 738.336us 51.83% 738.336us 246.112us 3 +void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 395.071us 27.74% 395.071us 131.690us 3 +void at::native::unrolled_elementwise_kernel<at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 291.008us 20.43% 291.008us 97.003us 3 + Activity Buffer Request 46.92% 1.451ms 46.92% 1.451ms 1.451ms 74.784us 5.25% 74.784us 74.784us 1 + aten::empty_strided 1.01% 31.321us 1.01% 31.321us 5.220us 0.000us 0.00% 0.000us 0.000us 6 + cudaLaunchKernel 7.49% 231.634us 7.49% 231.634us 25.737us 0.000us 0.00% 0.000us 0.000us 9 + aten::unsqueeze 0.61% 18.861us 0.79% 24.350us 2.706us 0.000us 0.00% 0.000us 0.000us 9 + aten::as_strided 0.29% 9.099us 0.29% 9.099us 0.607us 0.000us 0.00% 0.000us 0.000us 15 + aten::empty 0.32% 9.981us 0.32% 9.981us 3.327us 0.000us 0.00% 0.000us 0.000us 3 + aten::resize_ 0.31% 9.461us 0.31% 9.461us 3.154us 0.000us 0.00% 0.000us 0.000us 3 + aten::squeeze 0.20% 6.260us 0.25% 7.650us 2.550us 0.000us 0.00% 0.000us 0.000us 3 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -Self CPU time total: 3.125ms -Self CUDA time total: 1.428ms +Self CPU time total: 3.092ms +Self CUDA time total: 1.424ms impl wl p50(ms) ok -torch_eager cuda_B2_D2048_S128_W2 0.08 True +torch_eager cuda_B2_D2048_S128_W2 0.09 True torch_eager cuda_B2_D2048_S128_W4 0.09 True -torch_eager cuda_B2_D2048_S2048_W2 0.15 True +torch_eager cuda_B2_D2048_S2048_W2 0.14 True torch_eager cuda_B2_D2048_S2048_W4 0.16 True -torch_eager cuda_B2_D2048_S512_W2 0.08 True -torch_eager cuda_B2_D2048_S512_W4 0.08 True +torch_eager cuda_B2_D2048_S512_W2 0.09 True +torch_eager cuda_B2_D2048_S512_W4 0.09 True torch_eager cuda_B2_D64_S128_W2 0.07 True torch_eager cuda_B2_D64_S128_W4 0.09 True torch_eager cuda_B2_D64_S2048_W2 0.09 True -torch_eager cuda_B2_D64_S2048_W4 0.08 True +torch_eager cuda_B2_D64_S2048_W4 0.09 True torch_eager cuda_B2_D64_S512_W2 0.09 True torch_eager cuda_B2_D64_S512_W4 0.09 True torch_eager cuda_B4_D2048_S128_W2 0.09 True -torch_eager cuda_B4_D2048_S128_W4 0.08 True +torch_eager cuda_B4_D2048_S128_W4 0.09 True torch_eager cuda_B4_D2048_S2048_W2 0.49 True torch_eager cuda_B4_D2048_S2048_W4 0.50 True -torch_eager cuda_B4_D2048_S512_W2 0.09 True +torch_eager cuda_B4_D2048_S512_W2 0.10 True torch_eager cuda_B4_D2048_S512_W4 0.10 True -torch_eager cuda_B4_D64_S128_W2 0.08 True -torch_eager cuda_B4_D64_S128_W4 0.08 True -torch_eager cuda_B4_D64_S2048_W2 0.08 True +torch_eager cuda_B4_D64_S128_W2 0.09 True +torch_eager cuda_B4_D64_S128_W4 0.09 True +torch_eager cuda_B4_D64_S2048_W2 0.09 True torch_eager cuda_B4_D64_S2048_W4 0.09 True -torch_eager cuda_B4_D64_S512_W2 0.08 True -torch_eager cuda_B4_D64_S512_W4 0.08 True +torch_eager cuda_B4_D64_S512_W2 0.09 True +torch_eager cuda_B4_D64_S512_W4 0.09 True▶ UV Install Logs