Running activation benchmark on cuda with 9 workloads.
======================================================================
PROFILE TRACE: torch_eager | cuda_T128_D768
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 208.672us 1646.72% 208.672us 208.672us 1
torch_eager 11.52% 217.973us 99.62% 1.885ms 1.885ms 0.000us 0.00% 14.976us 14.976us 1
aten::silu 3.07% 58.081us 81.78% 1.547ms 515.694us 6.464us 51.01% 8.768us 2.923us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 51.01% 6.464us 2.155us 3
aten::mul 1.91% 36.092us 3.28% 62.082us 20.694us 6.208us 48.99% 6.208us 2.069us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.208us 48.99% 6.208us 2.069us 3
Activity Buffer Request 76.33% 1.444ms 76.33% 1.444ms 1.444ms 2.304us 18.18% 2.304us 2.304us 1
aten::slice 2.46% 46.622us 3.04% 57.552us 9.592us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.58% 10.930us 0.58% 10.930us 1.822us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 3.75% 71.021us 3.75% 71.021us 11.837us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.38% 7.160us 0.38% 7.160us 7.160us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.892ms
Self CUDA time total: 12.672us
======================================================================
PROFILE TRACE: torch_eager | cuda_T128_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 189.724us 1532.13% 189.724us 189.724us 1
torch_eager 7.75% 136.545us 99.70% 1.756ms 1.756ms 0.000us 0.00% 14.559us 14.559us 1
aten::silu 2.47% 43.560us 85.85% 1.512ms 503.984us 6.399us 51.68% 8.575us 2.858us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.399us 51.68% 6.399us 2.133us 3
aten::mul 2.87% 50.460us 4.18% 73.560us 24.520us 5.984us 48.32% 5.984us 1.995us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.984us 48.32% 5.984us 1.995us 3
Activity Buffer Request 81.76% 1.440ms 81.76% 1.440ms 1.440ms 2.176us 17.57% 2.176us 2.176us 1
aten::slice 1.56% 27.471us 1.92% 33.791us 5.632us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.36% 6.320us 0.36% 6.320us 1.053us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 2.93% 51.591us 2.93% 51.591us 8.598us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.30% 5.280us 0.30% 5.280us 5.280us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.761ms
Self CUDA time total: 12.383us
======================================================================
PROFILE TRACE: torch_eager | cuda_T128_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 159.261us 1202.24% 159.261us 159.261us 1
torch_eager 7.59% 133.144us 99.70% 1.749ms 1.749ms 0.000us 0.00% 15.487us 15.487us 1
aten::silu 2.45% 42.980us 87.40% 1.533ms 511.158us 6.783us 51.20% 9.023us 3.008us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.783us 51.20% 6.783us 2.261us 3
aten::mul 1.60% 28.151us 2.82% 49.551us 16.517us 6.464us 48.80% 6.464us 2.155us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.464us 48.80% 6.464us 2.155us 3
Activity Buffer Request 83.38% 1.463ms 83.38% 1.463ms 1.463ms 2.240us 16.91% 2.240us 2.240us 1
aten::slice 1.54% 26.990us 1.89% 33.190us 5.532us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.35% 6.200us 0.35% 6.200us 1.033us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 2.79% 48.992us 2.79% 48.992us 8.165us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.30% 5.190us 0.30% 5.190us 5.190us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.755ms
Self CUDA time total: 13.247us
======================================================================
PROFILE TRACE: torch_eager | cuda_T256_D768
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 158.719us 1246.22% 158.719us 158.719us 1
torch_eager 6.58% 125.161us 99.76% 1.897ms 1.897ms 0.000us 0.00% 14.944us 14.944us 1
aten::silu 2.27% 43.111us 89.01% 1.692ms 564.032us 6.560us 51.51% 8.768us 2.923us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.51% 6.560us 2.187us 3
aten::mul 1.36% 25.870us 2.47% 46.950us 15.650us 6.176us 48.49% 6.176us 2.059us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.49% 6.176us 2.059us 3
Activity Buffer Request 75.60% 1.437ms 75.60% 1.437ms 1.437ms 2.208us 17.34% 2.208us 2.208us 1
aten::slice 1.39% 26.382us 1.70% 32.293us 5.382us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.31% 5.911us 0.31% 5.911us 0.985us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 12.25% 232.925us 12.25% 232.925us 38.821us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.24% 4.510us 0.24% 4.510us 4.510us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.901ms
Self CUDA time total: 12.736us
======================================================================
PROFILE TRACE: torch_eager | cuda_T256_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 153.887us 1158.61% 153.887us 153.887us 1
torch_eager 6.96% 128.034us 99.73% 1.834ms 1.834ms 0.000us 0.00% 15.586us 15.586us 1
aten::silu 2.31% 42.562us 88.63% 1.630ms 543.305us 6.849us 51.57% 9.153us 3.051us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.849us 51.57% 6.849us 2.283us 3
aten::mul 1.46% 26.931us 2.44% 44.851us 14.950us 6.433us 48.43% 6.433us 2.144us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.433us 48.43% 6.433us 2.144us 3
Activity Buffer Request 77.32% 1.422ms 77.32% 1.422ms 1.422ms 2.304us 17.35% 2.304us 2.304us 1
aten::slice 1.36% 24.939us 1.70% 31.240us 5.207us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.34% 6.301us 0.34% 6.301us 1.050us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.97% 183.363us 9.97% 183.363us 30.561us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.27% 4.900us 0.27% 4.900us 4.900us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.839ms
Self CUDA time total: 13.282us
======================================================================
PROFILE TRACE: torch_eager | cuda_T256_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 157.278us 1009.23% 157.278us 157.278us 1
torch_eager 8.12% 150.915us 99.71% 1.854ms 1.854ms 0.000us 0.00% 18.272us 18.272us 1
aten::silu 2.38% 44.260us 87.35% 1.624ms 541.305us 8.000us 51.33% 10.688us 3.563us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 8.000us 51.33% 8.000us 2.667us 3
aten::mul 1.41% 26.151us 2.51% 46.701us 15.567us 7.584us 48.67% 7.584us 2.528us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.67% 7.584us 2.528us 3
Activity Buffer Request 76.39% 1.420ms 76.39% 1.420ms 1.420ms 2.688us 17.25% 2.688us 2.688us 1
aten::slice 1.39% 25.840us 1.73% 32.160us 5.360us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.34% 6.320us 0.34% 6.320us 1.053us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.68% 179.994us 9.68% 179.994us 29.999us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.29% 5.351us 0.29% 5.351us 5.351us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.859ms
Self CUDA time total: 15.584us
======================================================================
PROFILE TRACE: torch_eager | cuda_T512_D768
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 161.728us 1130.65% 161.728us 161.728us 1
torch_eager 7.31% 130.302us 99.73% 1.777ms 1.777ms 0.000us 0.00% 16.768us 16.768us 1
aten::silu 2.39% 42.651us 87.87% 1.566ms 521.901us 7.328us 51.23% 9.792us 3.264us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 51.23% 7.328us 2.443us 3
aten::mul 1.55% 27.651us 2.68% 47.751us 15.917us 6.976us 48.77% 6.976us 2.325us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.976us 48.77% 6.976us 2.325us 3
Activity Buffer Request 76.61% 1.365ms 76.61% 1.365ms 1.365ms 2.464us 17.23% 2.464us 2.464us 1
aten::slice 1.50% 26.642us 1.87% 33.262us 5.544us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.37% 6.620us 0.37% 6.620us 1.103us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.99% 177.974us 9.99% 177.974us 29.662us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.27% 4.870us 0.27% 4.870us 4.870us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.782ms
Self CUDA time total: 14.304us
======================================================================
PROFILE TRACE: torch_eager | cuda_T512_D1024
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 156.416us 1005.63% 156.416us 156.416us 1
torch_eager 7.17% 130.703us 99.74% 1.819ms 1.819ms 0.000us 0.00% 18.243us 18.243us 1
aten::silu 2.30% 42.032us 88.31% 1.611ms 536.959us 7.970us 51.24% 10.659us 3.553us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.970us 51.24% 7.970us 2.657us 3
aten::mul 1.41% 25.800us 2.54% 46.410us 15.470us 7.584us 48.76% 7.584us 2.528us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.76% 7.584us 2.528us 3
Activity Buffer Request 77.37% 1.411ms 77.37% 1.411ms 1.411ms 2.689us 17.29% 2.689us 2.689us 1
aten::slice 1.41% 25.640us 1.72% 31.370us 5.228us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.31% 5.730us 0.31% 5.730us 0.955us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.77% 178.145us 9.77% 178.145us 29.691us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.26% 4.790us 0.26% 4.790us 4.790us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.824ms
Self CUDA time total: 15.554us
======================================================================
PROFILE TRACE: torch_eager | cuda_T512_D2048
======================================================================
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 159.390us 709.54% 159.390us 159.390us 1
torch_eager 6.97% 127.342us 99.74% 1.823ms 1.823ms 0.000us 0.00% 26.336us 26.336us 1
aten::silu 2.35% 42.870us 88.50% 1.617ms 539.138us 11.520us 51.28% 15.392us 5.131us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.520us 51.28% 11.520us 3.840us 3
aten::mul 1.55% 28.251us 2.57% 47.051us 15.684us 10.944us 48.72% 10.944us 3.648us 3
void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.944us 48.72% 10.944us 3.648us 3
Activity Buffer Request 77.70% 1.420ms 77.70% 1.420ms 1.420ms 3.872us 17.24% 3.872us 3.872us 1
aten::slice 1.38% 25.151us 1.70% 31.112us 5.185us 0.000us 0.00% 0.000us 0.000us 6
aten::as_strided 0.33% 5.961us 0.33% 5.961us 0.993us 0.000us 0.00% 0.000us 0.000us 6
cudaLaunchKernel 9.48% 173.263us 9.48% 173.263us 28.877us 0.000us 0.00% 0.000us 0.000us 6
cudaDeviceSynchronize 0.26% 4.721us 0.26% 4.721us 4.721us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.828ms
Self CUDA time total: 22.464us
impl wl p50(ms) ok
torch_eager cuda_T128_D1024 0.05 True
torch_eager cuda_T128_D2048 0.05 True
torch_eager cuda_T128_D768 0.04 True
torch_eager cuda_T256_D1024 0.05 True
torch_eager cuda_T256_D2048 0.05 True
torch_eager cuda_T256_D768 0.05 True
torch_eager cuda_T512_D1024 0.05 True
torch_eager cuda_T512_D2048 0.05 True
torch_eager cuda_T512_D768 0.05 True