Upload folder using huggingface_hub
Browse files- flash_attn/benchmark.html +79 -79
- moe_benchmarks/megablocks/cells/forward_and_backward_no_kernel.py +196 -0
- moe_benchmarks/megablocks/megablocks_only.html +315 -126
- moe_benchmarks/megablocks_yamoe/artifacts/binned_run/binned_results.json +9 -9
- moe_benchmarks/megablocks_yamoe/artifacts/gptoss_run/gptoss_results.json +9 -9
- moe_benchmarks/megablocks_yamoe/artifacts/gptoss_training_run/gptoss_training_results.json +9 -9
- moe_benchmarks/megablocks_yamoe/artifacts/megablocks_run/megablocks_results.json +9 -9
- moe_benchmarks/megablocks_yamoe/artifacts/visualization/moe_performance_comparison.png +2 -2
- moe_benchmarks/megablocks_yamoe/artifacts/yamoe_run/yamoe_results.json +9 -9
- moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc +0 -0
- moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc +0 -0
- moe_benchmarks/megablocks_yamoe/cells/setup2.py +115 -0
- moe_benchmarks/megablocks_yamoe/megablocks_yamoe.html +130 -131
- moe_benchmarks/megablocks_yamoe/torch_profile.html +206 -206
flash_attn/benchmark.html
CHANGED
|
@@ -3722,7 +3722,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3722 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3723 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3724 |
</span> |
|
| 3725 |
-
Cell: benchmark |
|
| 3726 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3727 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3728 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4429,105 +4429,105 @@ xFormers not found.
|
|
| 4429 |
|
| 4430 |
|
| 4431 |
===== Testing shape: (1, 4224, 24, 128) =====
|
| 4432 |
-
torch_cudnn : absmax=0.
|
| 4433 |
-
torch_cudnn_compile_d : absmax=0.
|
| 4434 |
-
torch_cudnn_compile_ma : absmax=0.
|
| 4435 |
-
torch_flash : absmax=0.
|
| 4436 |
-
torch_flash_compile_d : absmax=0.
|
| 4437 |
-
torch_flash_compile_ma : absmax=0.
|
| 4438 |
-
hf_flash_attn : absmax=0.
|
| 4439 |
-
hf_flash_attn3 : absmax=0.
|
| 4440 |
|
| 4441 |
|
| 4442 |
===== Testing shape: (1, 4352, 24, 128) =====
|
| 4443 |
-
torch_cudnn : absmax=0.
|
| 4444 |
-
torch_cudnn_compile_d : absmax=0.
|
| 4445 |
-
torch_cudnn_compile_ma : absmax=0.
|
| 4446 |
-
torch_flash : absmax=0.
|
| 4447 |
-
torch_flash_compile_d : absmax=0.
|
| 4448 |
-
torch_flash_compile_ma : absmax=0.
|
| 4449 |
-
hf_flash_attn : absmax=0.
|
| 4450 |
-
hf_flash_attn3 : absmax=0.
|
| 4451 |
|
| 4452 |
|
| 4453 |
===== Testing shape: (1, 4416, 24, 128) =====
|
| 4454 |
-
torch_cudnn : absmax=0.
|
| 4455 |
-
torch_cudnn_compile_d : absmax=0.
|
| 4456 |
-
torch_cudnn_compile_ma : absmax=0.
|
| 4457 |
-
torch_flash : absmax=0.
|
| 4458 |
-
torch_flash_compile_d : absmax=0.
|
| 4459 |
-
torch_flash_compile_ma : absmax=0.
|
| 4460 |
-
hf_flash_attn : absmax=0.
|
| 4461 |
-
hf_flash_attn3 : absmax=0.
|
| 4462 |
|
| 4463 |
|
| 4464 |
===== Testing shape: (1, 4480, 24, 128) =====
|
| 4465 |
-
torch_cudnn : absmax=0.
|
| 4466 |
-
torch_cudnn_compile_d : absmax=0.
|
| 4467 |
-
torch_cudnn_compile_ma : absmax=0.
|
| 4468 |
-
torch_flash : absmax=0.
|
| 4469 |
-
torch_flash_compile_d : absmax=0.
|
| 4470 |
-
torch_flash_compile_ma : absmax=0.
|
| 4471 |
-
hf_flash_attn : absmax=0.
|
| 4472 |
-
hf_flash_attn3 : absmax=0.
|
| 4473 |
|
| 4474 |
|
| 4475 |
===== Testing shape: (1, 4544, 24, 128) =====
|
| 4476 |
-
torch_cudnn : absmax=0.
|
| 4477 |
-
torch_cudnn_compile_d : absmax=0.
|
| 4478 |
-
torch_cudnn_compile_ma : absmax=0.
|
| 4479 |
-
torch_flash : absmax=0.
|
| 4480 |
-
torch_flash_compile_d : absmax=0.
|
| 4481 |
-
torch_flash_compile_ma : absmax=0.
|
| 4482 |
-
hf_flash_attn : absmax=0.
|
| 4483 |
-
hf_flash_attn3 : absmax=0.
|
| 4484 |
|
| 4485 |
|
| 4486 |
===== Testing shape: (1, 4608, 24, 128) =====
|
| 4487 |
-
torch_cudnn : absmax=0.
|
| 4488 |
-
torch_cudnn_compile_d : absmax=0.
|
| 4489 |
-
torch_cudnn_compile_ma : absmax=0.
|
| 4490 |
-
torch_flash : absmax=0.
|
| 4491 |
-
torch_flash_compile_d : absmax=0.
|
| 4492 |
-
torch_flash_compile_ma : absmax=0.
|
| 4493 |
-
hf_flash_attn : absmax=0.
|
| 4494 |
-
hf_flash_attn3 : absmax=0.
|
| 4495 |
Attention Benchmark:
|
| 4496 |
seq_len torch_cudnn torch_cudnn_compile_d torch_cudnn_compile_ma torch_flash torch_flash_compile_d torch_flash_compile_ma hf_flash_attn hf_flash_attn3
|
| 4497 |
-
0 4224.0 3.
|
| 4498 |
-
1 4352.0 4.
|
| 4499 |
-
2 4416.0 4.
|
| 4500 |
-
3 4480.0 4.
|
| 4501 |
-
4 4544.0 4.
|
| 4502 |
-
5 4608.0 4.
|
| 4503 |
</div>
|
| 4504 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4505 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4506 |
<div class="uv-logs-content" style="display: none;">
|
| 4507 |
-
Downloading kiwisolver (1.4MiB)
|
| 4508 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4509 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4510 |
-
Downloading pandas (11.8MiB)
|
| 4511 |
Downloading hf-xet (3.0MiB)
|
| 4512 |
-
Downloading nvidia-
|
| 4513 |
-
Downloading
|
| 4514 |
-
Downloading
|
| 4515 |
Downloading numpy (16.2MiB)
|
| 4516 |
-
Downloading torch (846.9MiB)
|
| 4517 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4518 |
-
Downloading
|
| 4519 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4520 |
-
Downloading
|
| 4521 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4522 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4523 |
-
Downloading sympy (6.0MiB)
|
| 4524 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4525 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4526 |
-
Downloading
|
|
|
|
|
|
|
| 4527 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4528 |
Downloading pillow (6.3MiB)
|
| 4529 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4530 |
Downloading triton (148.3MiB)
|
|
|
|
|
|
|
| 4531 |
Downloading nvidia-cufile-cu12
|
| 4532 |
Downloading kiwisolver
|
| 4533 |
Downloading hf-xet
|
|
@@ -4541,8 +4541,8 @@ Downloading triton (148.3MiB)
|
|
| 4541 |
Downloading numpy
|
| 4542 |
Downloading nvidia-nvjitlink-cu12
|
| 4543 |
Downloading nvidia-curand-cu12
|
| 4544 |
-
Downloading nvidia-cuda-nvrtc-cu12
|
| 4545 |
Downloading pandas
|
|
|
|
| 4546 |
Downloading triton
|
| 4547 |
Downloading nvidia-cufft-cu12
|
| 4548 |
Downloading nvidia-cusolver-cu12
|
|
@@ -4552,18 +4552,18 @@ Downloading triton (148.3MiB)
|
|
| 4552 |
Downloading nvidia-cublas-cu12
|
| 4553 |
Downloading nvidia-cudnn-cu12
|
| 4554 |
Downloading torch
|
| 4555 |
-
Installed 49 packages in
|
| 4556 |
</div>
|
| 4557 |
</div>
|
| 4558 |
<div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
|
| 4559 |
-
Fetching 20 files: 5%|▌ | 1/20 [00:00<00:04, 4.
|
| 4560 |
-
Fetching 20 files: 10%|█ | 2/20 [00:
|
| 4561 |
-
Fetching 20 files: 100%|██████████| 20/20 [00:
|
| 4562 |
|
| 4563 |
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 4564 |
-
Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:
|
| 4565 |
-
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.
|
| 4566 |
-
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.
|
| 4567 |
<div class="cell-artifacts">
|
| 4568 |
<h4>Artifacts:</h4>
|
| 4569 |
<a href="artifacts/benchmark/dump_attention_benchmark/Attention Benchmark.png" class="artifact" target="_blank">dump_attention_benchmark/Attention Benchmark.png</a>
|
|
|
|
| 3722 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3723 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3724 |
</span> |
|
| 3725 |
+
Cell: benchmark | 80.35s
|
| 3726 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3727 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3728 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4429 |
|
| 4430 |
|
| 4431 |
===== Testing shape: (1, 4224, 24, 128) =====
|
| 4432 |
+
torch_cudnn : absmax=0.001554, mae=0.000075, mse=0.000000
|
| 4433 |
+
torch_cudnn_compile_d : absmax=0.001554, mae=0.000075, mse=0.000000
|
| 4434 |
+
torch_cudnn_compile_ma : absmax=0.001554, mae=0.000075, mse=0.000000
|
| 4435 |
+
torch_flash : absmax=0.001554, mae=0.000075, mse=0.000000
|
| 4436 |
+
torch_flash_compile_d : absmax=0.001554, mae=0.000075, mse=0.000000
|
| 4437 |
+
torch_flash_compile_ma : absmax=0.001554, mae=0.000075, mse=0.000000
|
| 4438 |
+
hf_flash_attn : absmax=0.001554, mae=0.000075, mse=0.000000
|
| 4439 |
+
hf_flash_attn3 : absmax=0.001554, mae=0.000075, mse=0.000000
|
| 4440 |
|
| 4441 |
|
| 4442 |
===== Testing shape: (1, 4352, 24, 128) =====
|
| 4443 |
+
torch_cudnn : absmax=0.001499, mae=0.000074, mse=0.000000
|
| 4444 |
+
torch_cudnn_compile_d : absmax=0.001499, mae=0.000074, mse=0.000000
|
| 4445 |
+
torch_cudnn_compile_ma : absmax=0.001499, mae=0.000074, mse=0.000000
|
| 4446 |
+
torch_flash : absmax=0.001499, mae=0.000074, mse=0.000000
|
| 4447 |
+
torch_flash_compile_d : absmax=0.001499, mae=0.000074, mse=0.000000
|
| 4448 |
+
torch_flash_compile_ma : absmax=0.001499, mae=0.000074, mse=0.000000
|
| 4449 |
+
hf_flash_attn : absmax=0.001499, mae=0.000074, mse=0.000000
|
| 4450 |
+
hf_flash_attn3 : absmax=0.001499, mae=0.000074, mse=0.000000
|
| 4451 |
|
| 4452 |
|
| 4453 |
===== Testing shape: (1, 4416, 24, 128) =====
|
| 4454 |
+
torch_cudnn : absmax=0.001278, mae=0.000073, mse=0.000000
|
| 4455 |
+
torch_cudnn_compile_d : absmax=0.001278, mae=0.000073, mse=0.000000
|
| 4456 |
+
torch_cudnn_compile_ma : absmax=0.001278, mae=0.000073, mse=0.000000
|
| 4457 |
+
torch_flash : absmax=0.001278, mae=0.000073, mse=0.000000
|
| 4458 |
+
torch_flash_compile_d : absmax=0.001278, mae=0.000073, mse=0.000000
|
| 4459 |
+
torch_flash_compile_ma : absmax=0.001278, mae=0.000073, mse=0.000000
|
| 4460 |
+
hf_flash_attn : absmax=0.001278, mae=0.000073, mse=0.000000
|
| 4461 |
+
hf_flash_attn3 : absmax=0.001278, mae=0.000073, mse=0.000000
|
| 4462 |
|
| 4463 |
|
| 4464 |
===== Testing shape: (1, 4480, 24, 128) =====
|
| 4465 |
+
torch_cudnn : absmax=0.001270, mae=0.000073, mse=0.000000
|
| 4466 |
+
torch_cudnn_compile_d : absmax=0.001270, mae=0.000073, mse=0.000000
|
| 4467 |
+
torch_cudnn_compile_ma : absmax=0.001270, mae=0.000073, mse=0.000000
|
| 4468 |
+
torch_flash : absmax=0.001270, mae=0.000073, mse=0.000000
|
| 4469 |
+
torch_flash_compile_d : absmax=0.001270, mae=0.000073, mse=0.000000
|
| 4470 |
+
torch_flash_compile_ma : absmax=0.001270, mae=0.000073, mse=0.000000
|
| 4471 |
+
hf_flash_attn : absmax=0.001270, mae=0.000073, mse=0.000000
|
| 4472 |
+
hf_flash_attn3 : absmax=0.001270, mae=0.000073, mse=0.000000
|
| 4473 |
|
| 4474 |
|
| 4475 |
===== Testing shape: (1, 4544, 24, 128) =====
|
| 4476 |
+
torch_cudnn : absmax=0.001696, mae=0.000072, mse=0.000000
|
| 4477 |
+
torch_cudnn_compile_d : absmax=0.001696, mae=0.000072, mse=0.000000
|
| 4478 |
+
torch_cudnn_compile_ma : absmax=0.001696, mae=0.000072, mse=0.000000
|
| 4479 |
+
torch_flash : absmax=0.001696, mae=0.000072, mse=0.000000
|
| 4480 |
+
torch_flash_compile_d : absmax=0.001696, mae=0.000072, mse=0.000000
|
| 4481 |
+
torch_flash_compile_ma : absmax=0.001696, mae=0.000072, mse=0.000000
|
| 4482 |
+
hf_flash_attn : absmax=0.001696, mae=0.000072, mse=0.000000
|
| 4483 |
+
hf_flash_attn3 : absmax=0.001696, mae=0.000072, mse=0.000000
|
| 4484 |
|
| 4485 |
|
| 4486 |
===== Testing shape: (1, 4608, 24, 128) =====
|
| 4487 |
+
torch_cudnn : absmax=0.001111, mae=0.000071, mse=0.000000
|
| 4488 |
+
torch_cudnn_compile_d : absmax=0.001111, mae=0.000071, mse=0.000000
|
| 4489 |
+
torch_cudnn_compile_ma : absmax=0.001111, mae=0.000071, mse=0.000000
|
| 4490 |
+
torch_flash : absmax=0.001111, mae=0.000071, mse=0.000000
|
| 4491 |
+
torch_flash_compile_d : absmax=0.001111, mae=0.000071, mse=0.000000
|
| 4492 |
+
torch_flash_compile_ma : absmax=0.001111, mae=0.000071, mse=0.000000
|
| 4493 |
+
hf_flash_attn : absmax=0.001111, mae=0.000071, mse=0.000000
|
| 4494 |
+
hf_flash_attn3 : absmax=0.001111, mae=0.000071, mse=0.000000
|
| 4495 |
Attention Benchmark:
|
| 4496 |
seq_len torch_cudnn torch_cudnn_compile_d torch_cudnn_compile_ma torch_flash torch_flash_compile_d torch_flash_compile_ma hf_flash_attn hf_flash_attn3
|
| 4497 |
+
0 4224.0 3.801536 3.793184 4.180320 3.967200 3.980720 4.318736 3.397664 3.333776
|
| 4498 |
+
1 4352.0 4.081888 4.075536 4.421904 4.400096 4.393040 4.733888 3.836736 3.760320
|
| 4499 |
+
2 4416.0 4.144176 4.139104 4.485120 4.453280 4.447408 4.796512 3.895568 3.863744
|
| 4500 |
+
3 4480.0 4.208256 4.204688 4.554240 4.529152 4.521568 4.874592 3.952256 3.870208
|
| 4501 |
+
4 4544.0 4.437280 4.431168 4.790432 4.584704 4.579008 4.938656 4.010512 3.978752
|
| 4502 |
+
5 4608.0 4.504352 4.497600 4.870880 4.661536 4.652544 5.031328 4.065504 3.985712
|
| 4503 |
</div>
|
| 4504 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4505 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4506 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4507 |
Downloading hf-xet (3.0MiB)
|
| 4508 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4509 |
+
Downloading sympy (6.0MiB)
|
| 4510 |
+
Downloading pandas (11.8MiB)
|
| 4511 |
Downloading numpy (16.2MiB)
|
|
|
|
| 4512 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4513 |
+
Downloading networkx (1.9MiB)
|
| 4514 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4515 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4516 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4517 |
+
Downloading matplotlib (8.3MiB)
|
| 4518 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4519 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4520 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4521 |
Downloading pillow (6.3MiB)
|
| 4522 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4523 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4524 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4525 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4526 |
+
Downloading fonttools (4.7MiB)
|
| 4527 |
+
Downloading setuptools (1.1MiB)
|
| 4528 |
Downloading triton (148.3MiB)
|
| 4529 |
+
Downloading torch (846.9MiB)
|
| 4530 |
+
Downloading kiwisolver (1.4MiB)
|
| 4531 |
Downloading nvidia-cufile-cu12
|
| 4532 |
Downloading kiwisolver
|
| 4533 |
Downloading hf-xet
|
|
|
|
| 4541 |
Downloading numpy
|
| 4542 |
Downloading nvidia-nvjitlink-cu12
|
| 4543 |
Downloading nvidia-curand-cu12
|
|
|
|
| 4544 |
Downloading pandas
|
| 4545 |
+
Downloading nvidia-cuda-nvrtc-cu12
|
| 4546 |
Downloading triton
|
| 4547 |
Downloading nvidia-cufft-cu12
|
| 4548 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 4552 |
Downloading nvidia-cublas-cu12
|
| 4553 |
Downloading nvidia-cudnn-cu12
|
| 4554 |
Downloading torch
|
| 4555 |
+
Installed 49 packages in 546ms
|
| 4556 |
</div>
|
| 4557 |
</div>
|
| 4558 |
<div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
|
| 4559 |
+
Fetching 20 files: 5%|▌ | 1/20 [00:00<00:04, 4.75it/s]
|
| 4560 |
+
Fetching 20 files: 10%|█ | 2/20 [00:01<00:16, 1.06it/s]
|
| 4561 |
+
Fetching 20 files: 100%|██████████| 20/20 [00:01<00:00, 12.01it/s]
|
| 4562 |
|
| 4563 |
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 4564 |
+
Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:01, 1.98it/s]
|
| 4565 |
+
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.38it/s]
|
| 4566 |
+
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.89it/s]</div>
|
| 4567 |
<div class="cell-artifacts">
|
| 4568 |
<h4>Artifacts:</h4>
|
| 4569 |
<a href="artifacts/benchmark/dump_attention_benchmark/Attention Benchmark.png" class="artifact" target="_blank">dump_attention_benchmark/Attention Benchmark.png</a>
|
moe_benchmarks/megablocks/cells/forward_and_backward_no_kernel.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# /// script
|
| 2 |
+
# requires-python = ">=3.12"
|
| 3 |
+
# dependencies = [
|
| 4 |
+
# "accelerate>=1.10.1",
|
| 5 |
+
# "torch>=2.7.0",
|
| 6 |
+
# "kernels==0.10.0",
|
| 7 |
+
# "transformers@https://github.com/huggingface/transformers.git",
|
| 8 |
+
# "ipdb>=0.13.13",
|
| 9 |
+
# "matplotlib>=3.7.2",
|
| 10 |
+
# "numpy>=1.24.3",
|
| 11 |
+
# ]
|
| 12 |
+
# ///
|
| 13 |
+
|
| 14 |
+
import torch
|
| 15 |
+
from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
|
| 16 |
+
import time
|
| 17 |
+
import torch.nn as nn
|
| 18 |
+
from kernels import register_kernel_mapping, Mode, LayerRepository, replace_kernel_forward_from_hub
|
| 19 |
+
import sys
|
| 20 |
+
import torch.profiler
|
| 21 |
+
import gc
|
| 22 |
+
import logging
|
| 23 |
+
from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm
|
| 24 |
+
|
| 25 |
+
# remove liger kernel for testing
|
| 26 |
+
replace_kernel_forward_from_hub(GptOssRMSNorm, None)
|
| 27 |
+
|
| 28 |
+
# set to debug logging
|
| 29 |
+
logging.basicConfig(level=logging.INFO)
|
| 30 |
+
|
| 31 |
+
def reset_peak_memory_stats():
|
| 32 |
+
"""Clear CUDA cache and reset memory allocation counters."""
|
| 33 |
+
torch.cuda.empty_cache()
|
| 34 |
+
if torch.cuda.is_available():
|
| 35 |
+
torch.cuda.reset_peak_memory_stats()
|
| 36 |
+
gc.collect()
|
| 37 |
+
|
| 38 |
+
def get_memory_stats():
|
| 39 |
+
"""Get current and peak CUDA memory usage."""
|
| 40 |
+
if not torch.cuda.is_available():
|
| 41 |
+
return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
|
| 42 |
+
return {
|
| 43 |
+
"allocated_gb": torch.cuda.memory_allocated() / 1e9,
|
| 44 |
+
"peak_gb": torch.cuda.max_memory_allocated() / 1e9,
|
| 45 |
+
"reserved_gb": torch.cuda.memory_reserved() / 1e9,
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
def override_kernel_layer_name(cls_name: str, value) -> bool:
|
| 49 |
+
"""Helper to dynamically override the kernel_layer_name in a model class."""
|
| 50 |
+
for mod in sys.modules.values():
|
| 51 |
+
if mod is None:
|
| 52 |
+
continue
|
| 53 |
+
obj = getattr(mod, cls_name, None)
|
| 54 |
+
if isinstance(obj, type) and issubclass(obj, nn.Module):
|
| 55 |
+
setattr(obj, "kernel_layer_name", value)
|
| 56 |
+
print(f"Overrode {cls_name}.kernel_layer_name to {value}")
|
| 57 |
+
return True
|
| 58 |
+
return False
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# Init the model the normal way
|
| 62 |
+
model_id = "openai/gpt-oss-20b"
|
| 63 |
+
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
|
| 64 |
+
quantization_config = Mxfp4Config(dequantize=True)
|
| 65 |
+
|
| 66 |
+
model = GptOssForCausalLM.from_pretrained(
|
| 67 |
+
model_id,
|
| 68 |
+
dtype="bfloat16",
|
| 69 |
+
device_map="auto",
|
| 70 |
+
use_kernels=False,
|
| 71 |
+
quantization_config=quantization_config,
|
| 72 |
+
).eval()
|
| 73 |
+
|
| 74 |
+
messages = [
|
| 75 |
+
{"role": "system", "content": "What is Tensor Parallelism?"},
|
| 76 |
+
]
|
| 77 |
+
|
| 78 |
+
inputs = tokenizer.apply_chat_template(
|
| 79 |
+
messages,
|
| 80 |
+
add_generation_prompt=True,
|
| 81 |
+
return_tensors="pt",
|
| 82 |
+
return_dict=True,
|
| 83 |
+
reasoning_effort="low",
|
| 84 |
+
).to("cuda")
|
| 85 |
+
|
| 86 |
+
max_tokens = 128 # Reduced to help with memory usage
|
| 87 |
+
|
| 88 |
+
# Clear memory before backward pass
|
| 89 |
+
reset_peak_memory_stats()
|
| 90 |
+
print(f"Pre-generation memory: {get_memory_stats()}")
|
| 91 |
+
|
| 92 |
+
# forward and backward pass
|
| 93 |
+
with torch.autograd.set_grad_enabled(True):
|
| 94 |
+
start_time = time.perf_counter()
|
| 95 |
+
generated = model.generate(
|
| 96 |
+
**inputs,
|
| 97 |
+
max_new_tokens=max_tokens,
|
| 98 |
+
do_sample=False,
|
| 99 |
+
temperature=None,
|
| 100 |
+
)
|
| 101 |
+
end_time = time.perf_counter()
|
| 102 |
+
print(tokenizer.decode(generated[0], skip_special_tokens=False))
|
| 103 |
+
print(f"Generation took {end_time - start_time:.2f} seconds")
|
| 104 |
+
print(f"Post-generation memory: {get_memory_stats()}")
|
| 105 |
+
|
| 106 |
+
# Use gradient checkpointing to reduce memory usage
|
| 107 |
+
if hasattr(model, 'gradient_checkpointing_enable'):
|
| 108 |
+
model.gradient_checkpointing_enable()
|
| 109 |
+
print("Enabled gradient checkpointing")
|
| 110 |
+
|
| 111 |
+
# Reduce sequence length if needed for memory
|
| 112 |
+
max_seq_len = 512 # Limit sequence length for backward pass
|
| 113 |
+
if generated.size(1) > max_seq_len:
|
| 114 |
+
print(f"Truncating sequence from {generated.size(1)} to {max_seq_len} tokens")
|
| 115 |
+
full_sequence = generated[:, -max_seq_len:]
|
| 116 |
+
else:
|
| 117 |
+
full_sequence = generated
|
| 118 |
+
|
| 119 |
+
# Get model outputs for the full sequence
|
| 120 |
+
model.train() # Enable dropout and other training behaviors
|
| 121 |
+
|
| 122 |
+
try:
|
| 123 |
+
outputs = model(
|
| 124 |
+
input_ids=full_sequence,
|
| 125 |
+
labels=full_sequence, # This will compute loss internally
|
| 126 |
+
return_dict=True
|
| 127 |
+
)
|
| 128 |
+
print(f"Post-forward memory: {get_memory_stats()}")
|
| 129 |
+
|
| 130 |
+
# If model doesn't compute loss, compute it manually
|
| 131 |
+
if outputs.loss is None:
|
| 132 |
+
shift_logits = outputs.logits[..., :-1, :].contiguous()
|
| 133 |
+
shift_labels = full_sequence[..., 1:].contiguous()
|
| 134 |
+
|
| 135 |
+
# Use CrossEntropyLoss with ignore_index for padding tokens
|
| 136 |
+
loss_fct = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -100)
|
| 137 |
+
loss = loss_fct(
|
| 138 |
+
shift_logits.view(-1, shift_logits.size(-1)),
|
| 139 |
+
shift_labels.view(-1)
|
| 140 |
+
)
|
| 141 |
+
else:
|
| 142 |
+
loss = outputs.loss
|
| 143 |
+
|
| 144 |
+
print(f"Loss: {loss.item():.4f}")
|
| 145 |
+
|
| 146 |
+
# Clear intermediate tensors to save memory
|
| 147 |
+
del outputs
|
| 148 |
+
torch.cuda.empty_cache()
|
| 149 |
+
|
| 150 |
+
# Perform backward pass with memory management
|
| 151 |
+
print("Running backward pass...")
|
| 152 |
+
print(f"Pre-backward memory: {get_memory_stats()}")
|
| 153 |
+
|
| 154 |
+
loss.backward()
|
| 155 |
+
print(f"Post-backward memory: {get_memory_stats()}")
|
| 156 |
+
|
| 157 |
+
except torch.cuda.OutOfMemoryError as e:
|
| 158 |
+
print(f"OOM during forward/backward pass: {e}")
|
| 159 |
+
print("Try reducing max_tokens or max_seq_len")
|
| 160 |
+
raise
|
| 161 |
+
|
| 162 |
+
# Calculate gradient statistics and print sample gradients
|
| 163 |
+
total_norm = 0.0
|
| 164 |
+
param_count = 0
|
| 165 |
+
grad_samples = {}
|
| 166 |
+
|
| 167 |
+
for name, p in model.named_parameters():
|
| 168 |
+
if p.grad is not None:
|
| 169 |
+
param_count += 1
|
| 170 |
+
grad_norm = p.grad.data.norm(2).item()
|
| 171 |
+
total_norm += grad_norm ** 2
|
| 172 |
+
|
| 173 |
+
# Collect gradient statistics for key layers
|
| 174 |
+
if any(key in name for key in ['embed', 'lm_head', 'mlp.up', 'mlp.down', 'self_attn.q_proj', 'norm']):
|
| 175 |
+
grad_samples[name] = {
|
| 176 |
+
'norm': grad_norm,
|
| 177 |
+
'mean': p.grad.data.mean().item(),
|
| 178 |
+
'std': p.grad.data.std().item(),
|
| 179 |
+
'max': p.grad.data.max().item(),
|
| 180 |
+
'min': p.grad.data.min().item(),
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
total_norm = total_norm ** 0.5
|
| 184 |
+
|
| 185 |
+
print(f"\nGradient norm: {total_norm:.4f}")
|
| 186 |
+
print(f"Parameters with gradients: {param_count}")
|
| 187 |
+
|
| 188 |
+
# Print sample gradients from important layers
|
| 189 |
+
print("\nSample gradient statistics:")
|
| 190 |
+
for i, (name, stats) in enumerate(list(grad_samples.items())[:10]):
|
| 191 |
+
print(f" {name[:60]:<60} | norm: {stats['norm']:.4e} | mean: {stats['mean']:.4e} | std: {stats['std']:.4e}")
|
| 192 |
+
|
| 193 |
+
# Optional: zero gradients for next iteration
|
| 194 |
+
model.zero_grad()
|
| 195 |
+
model.eval() # Switch back to eval mode
|
| 196 |
+
|
moe_benchmarks/megablocks/megablocks_only.html
CHANGED
|
@@ -3720,126 +3720,217 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3720 |
<h2>Forward</h2>
|
| 3721 |
<h2>Forward and Backward</h2>
|
| 3722 |
<p>Next, we'll attempt to run a forward and backward pass without any custom kernels. This will likely run out of memory since the default implementation is not optimized for memory usage.</p>
|
| 3723 |
-
<
|
| 3724 |
-
<p>Next we can run with Megablocks kernels enabled.</p>
|
| 3725 |
-
<h3>Forward</h3>
|
| 3726 |
-
<p>First, we run a forward pass with Megablocks kernels.</p>
|
| 3727 |
-
<div class="cell cell-failed" id="cell-forward_only">
|
| 3728 |
<div class="cell-header">
|
| 3729 |
<span class="collapse-indicators">
|
| 3730 |
-
<span onclick="toggleCode('
|
| 3731 |
-
<span onclick="toggleOutput('
|
| 3732 |
-
<span id="uv-indicator-
|
| 3733 |
</span> |
|
| 3734 |
-
Cell:
|
| 3735 |
-
| <button class="run-btn" onclick="runCell('
|
| 3736 |
-
<button class="copy-btn" onclick="copyCell('
|
| 3737 |
-
<a href="cells/
|
| 3738 |
</div>
|
| 3739 |
-
<div id="code-
|
| 3740 |
<div class="highlight-with-lines">
|
| 3741 |
-
<div class="line-numbers" id="lines-
|
| 3742 |
-
<a class="line-number" data-cell="
|
| 3743 |
-
<a class="line-number" data-cell="
|
| 3744 |
-
<a class="line-number" data-cell="
|
| 3745 |
-
<a class="line-number" data-cell="
|
| 3746 |
-
<a class="line-number" data-cell="
|
| 3747 |
-
<a class="line-number" data-cell="
|
| 3748 |
-
<a class="line-number" data-cell="
|
| 3749 |
-
<a class="line-number" data-cell="
|
| 3750 |
-
<a class="line-number" data-cell="
|
| 3751 |
-
<a class="line-number" data-cell="
|
| 3752 |
-
<a class="line-number" data-cell="
|
| 3753 |
-
<a class="line-number" data-cell="
|
| 3754 |
-
<a class="line-number" data-cell="
|
| 3755 |
-
<a class="line-number" data-cell="
|
| 3756 |
-
<a class="line-number" data-cell="
|
| 3757 |
-
<a class="line-number" data-cell="
|
| 3758 |
-
<a class="line-number" data-cell="
|
| 3759 |
-
<a class="line-number" data-cell="
|
| 3760 |
-
<a class="line-number" data-cell="
|
| 3761 |
-
<a class="line-number" data-cell="
|
| 3762 |
-
<a class="line-number" data-cell="
|
| 3763 |
-
<a class="line-number" data-cell="
|
| 3764 |
-
<a class="line-number" data-cell="
|
| 3765 |
-
<a class="line-number" data-cell="
|
| 3766 |
-
<a class="line-number" data-cell="
|
| 3767 |
-
<a class="line-number" data-cell="
|
| 3768 |
-
<a class="line-number" data-cell="
|
| 3769 |
-
<a class="line-number" data-cell="
|
| 3770 |
-
<a class="line-number" data-cell="
|
| 3771 |
-
<a class="line-number" data-cell="
|
| 3772 |
-
<a class="line-number" data-cell="
|
| 3773 |
-
<a class="line-number" data-cell="
|
| 3774 |
-
<a class="line-number" data-cell="
|
| 3775 |
-
<a class="line-number" data-cell="
|
| 3776 |
-
<a class="line-number" data-cell="
|
| 3777 |
-
<a class="line-number" data-cell="
|
| 3778 |
-
<a class="line-number" data-cell="
|
| 3779 |
-
<a class="line-number" data-cell="
|
| 3780 |
-
<a class="line-number" data-cell="
|
| 3781 |
-
<a class="line-number" data-cell="
|
| 3782 |
-
<a class="line-number" data-cell="
|
| 3783 |
-
<a class="line-number" data-cell="
|
| 3784 |
-
<a class="line-number" data-cell="
|
| 3785 |
-
<a class="line-number" data-cell="
|
| 3786 |
-
<a class="line-number" data-cell="
|
| 3787 |
-
<a class="line-number" data-cell="
|
| 3788 |
-
<a class="line-number" data-cell="
|
| 3789 |
-
<a class="line-number" data-cell="
|
| 3790 |
-
<a class="line-number" data-cell="
|
| 3791 |
-
<a class="line-number" data-cell="
|
| 3792 |
-
<a class="line-number" data-cell="
|
| 3793 |
-
<a class="line-number" data-cell="
|
| 3794 |
-
<a class="line-number" data-cell="
|
| 3795 |
-
<a class="line-number" data-cell="
|
| 3796 |
-
<a class="line-number" data-cell="
|
| 3797 |
-
<a class="line-number" data-cell="
|
| 3798 |
-
<a class="line-number" data-cell="
|
| 3799 |
-
<a class="line-number" data-cell="
|
| 3800 |
-
<a class="line-number" data-cell="
|
| 3801 |
-
<a class="line-number" data-cell="
|
| 3802 |
-
<a class="line-number" data-cell="
|
| 3803 |
-
<a class="line-number" data-cell="
|
| 3804 |
-
<a class="line-number" data-cell="
|
| 3805 |
-
<a class="line-number" data-cell="
|
| 3806 |
-
<a class="line-number" data-cell="
|
| 3807 |
-
<a class="line-number" data-cell="
|
| 3808 |
-
<a class="line-number" data-cell="
|
| 3809 |
-
<a class="line-number" data-cell="
|
| 3810 |
-
<a class="line-number" data-cell="
|
| 3811 |
-
<a class="line-number" data-cell="
|
| 3812 |
-
<a class="line-number" data-cell="
|
| 3813 |
-
<a class="line-number" data-cell="
|
| 3814 |
-
<a class="line-number" data-cell="
|
| 3815 |
-
<a class="line-number" data-cell="
|
| 3816 |
-
<a class="line-number" data-cell="
|
| 3817 |
-
<a class="line-number" data-cell="
|
| 3818 |
-
<a class="line-number" data-cell="
|
| 3819 |
-
<a class="line-number" data-cell="
|
| 3820 |
-
<a class="line-number" data-cell="
|
| 3821 |
-
<a class="line-number" data-cell="
|
| 3822 |
-
<a class="line-number" data-cell="
|
| 3823 |
-
<a class="line-number" data-cell="
|
| 3824 |
-
<a class="line-number" data-cell="
|
| 3825 |
-
<a class="line-number" data-cell="
|
| 3826 |
-
<a class="line-number" data-cell="
|
| 3827 |
-
<a class="line-number" data-cell="
|
| 3828 |
-
<a class="line-number" data-cell="
|
| 3829 |
-
<a class="line-number" data-cell="
|
| 3830 |
-
<a class="line-number" data-cell="
|
| 3831 |
-
<a class="line-number" data-cell="
|
| 3832 |
-
<a class="line-number" data-cell="
|
| 3833 |
-
<a class="line-number" data-cell="
|
| 3834 |
-
<a class="line-number" data-cell="
|
| 3835 |
-
<a class="line-number" data-cell="
|
| 3836 |
-
<a class="line-number" data-cell="
|
| 3837 |
-
<a class="line-number" data-cell="
|
| 3838 |
-
<a class="line-number" data-cell="
|
| 3839 |
-
<a class="line-number" data-cell="
|
| 3840 |
-
<a class="line-number" data-cell="
|
| 3841 |
-
<a class="line-number" data-cell="
|
| 3842 |
-
<a class="line-number" data-cell="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3843 |
</div>
|
| 3844 |
<div class="code-wrap">
|
| 3845 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
|
@@ -3866,7 +3957,7 @@ Cell: forward_only | 17.22s | FAILED
|
|
| 3866 |
<span class="kn">import</span><span class="w"> </span><span class="nn">logging</span>
|
| 3867 |
<span class="kn">from</span><span class="w"> </span><span class="nn">transformers.models.gpt_oss.modeling_gpt_oss</span><span class="w"> </span><span class="kn">import</span> <span class="n">GptOssRMSNorm</span>
|
| 3868 |
|
| 3869 |
-
|
| 3870 |
<span class="n">replace_kernel_forward_from_hub</span><span class="p">(</span><span class="n">GptOssRMSNorm</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
|
| 3871 |
|
| 3872 |
<span class="c1"># set to debug logging</span>
|
|
@@ -3907,13 +3998,11 @@ Cell: forward_only | 17.22s | FAILED
|
|
| 3907 |
<span class="n">tokenizer</span> <span class="o">=</span> <span class="n">PreTrainedTokenizerFast</span><span class="o">.</span><span class="n">from_pretrained</span><span class="p">(</span><span class="n">model_id</span><span class="p">)</span>
|
| 3908 |
<span class="n">quantization_config</span> <span class="o">=</span> <span class="n">Mxfp4Config</span><span class="p">(</span><span class="n">dequantize</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
|
| 3909 |
|
| 3910 |
-
|
| 3911 |
-
|
| 3912 |
<span class="n">model</span> <span class="o">=</span> <span class="n">GptOssForCausalLM</span><span class="o">.</span><span class="n">from_pretrained</span><span class="p">(</span>
|
| 3913 |
<span class="n">model_id</span><span class="p">,</span>
|
| 3914 |
<span class="n">dtype</span><span class="o">=</span><span class="s2">"bfloat16"</span><span class="p">,</span>
|
| 3915 |
<span class="n">device_map</span><span class="o">=</span><span class="s2">"auto"</span><span class="p">,</span>
|
| 3916 |
-
<span class="n">use_kernels</span><span class="o">=</span><span class="kc">
|
| 3917 |
<span class="n">quantization_config</span><span class="o">=</span><span class="n">quantization_config</span><span class="p">,</span>
|
| 3918 |
<span class="p">)</span><span class="o">.</span><span class="n">eval</span><span class="p">()</span>
|
| 3919 |
|
|
@@ -3929,9 +4018,14 @@ Cell: forward_only | 17.22s | FAILED
|
|
| 3929 |
<span class="n">reasoning_effort</span><span class="o">=</span><span class="s2">"low"</span><span class="p">,</span>
|
| 3930 |
<span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="s2">"cuda"</span><span class="p">)</span>
|
| 3931 |
|
| 3932 |
-
<span class="n">max_tokens</span> <span class="o">=</span> <span class="mi">
|
| 3933 |
|
| 3934 |
-
<span class="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3935 |
<span class="n">start_time</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">perf_counter</span><span class="p">()</span>
|
| 3936 |
<span class="n">generated</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">generate</span><span class="p">(</span>
|
| 3937 |
<span class="o">**</span><span class="n">inputs</span><span class="p">,</span>
|
|
@@ -3940,16 +4034,107 @@ Cell: forward_only | 17.22s | FAILED
|
|
| 3940 |
<span class="n">temperature</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
|
| 3941 |
<span class="p">)</span>
|
| 3942 |
<span class="n">end_time</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">perf_counter</span><span class="p">()</span>
|
| 3943 |
-
|
| 3944 |
-
<span class="nb">print</span><span class="p">(</span><span class="
|
| 3945 |
-
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3946 |
</pre></div>
|
| 3947 |
|
| 3948 |
-
<div class="code-line-highlight" id="line-highlight-
|
| 3949 |
</div>
|
| 3950 |
</div>
|
| 3951 |
</div>
|
| 3952 |
-
<div id="output-
|
| 3953 |
<div class="cell-stderr">warning: The requested interpreter resolved to Python 3.11.13, which is incompatible with the script's Python requirement: `>=3.12`
|
| 3954 |
Updating https://github.com/huggingface/transformers.git (HEAD)
|
| 3955 |
Updated https://github.com/huggingface/transformers.git (449533af73874470e914a203391635e04ac2ffc8)
|
|
@@ -3967,6 +4152,10 @@ Cell: forward_only | 17.22s | FAILED
|
|
| 3967 |
</div>
|
| 3968 |
</div>
|
| 3969 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3970 |
<h2>Forward and Backward</h2>
|
| 3971 |
<p>Next, we run a forward and backward pass with Megablocks kernels enabled. This should be more memory efficient and allow us to complete the backward pass without running out of memory.</p>
|
| 3972 |
</div>
|
|
|
|
| 3720 |
<h2>Forward</h2>
|
| 3721 |
<h2>Forward and Backward</h2>
|
| 3722 |
<p>Next, we'll attempt to run a forward and backward pass without any custom kernels. This will likely run out of memory since the default implementation is not optimized for memory usage.</p>
|
| 3723 |
+
<div class="cell cell-failed" id="cell-forward_and_backward_no_kernel">
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3724 |
<div class="cell-header">
|
| 3725 |
<span class="collapse-indicators">
|
| 3726 |
+
<span onclick="toggleCode('forward_and_backward_no_kernel')" style="cursor: pointer;">▼ code</span>
|
| 3727 |
+
<span onclick="toggleOutput('forward_and_backward_no_kernel')" style="cursor: pointer;">▼ output</span>
|
| 3728 |
+
<span id="uv-indicator-forward_and_backward_no_kernel" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3729 |
</span> |
|
| 3730 |
+
Cell: forward_and_backward_no_kernel | 17.02s | FAILED
|
| 3731 |
+
| <button class="run-btn" onclick="runCell('forward_and_backward_no_kernel')">▶ run</button>
|
| 3732 |
+
<button class="copy-btn" onclick="copyCell('forward_and_backward_no_kernel')">Copy</button>
|
| 3733 |
+
<a href="cells/forward_and_backward_no_kernel.py" target="_blank" class="raw-btn">Raw</a>
|
| 3734 |
</div>
|
| 3735 |
+
<div id="code-forward_and_backward_no_kernel" class="cell-code" data-lines="196">
|
| 3736 |
<div class="highlight-with-lines">
|
| 3737 |
+
<div class="line-numbers" id="lines-forward_and_backward_no_kernel">
|
| 3738 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="1" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 1, true);">1</a>
|
| 3739 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="2" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 2, true);">2</a>
|
| 3740 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="3" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 3, true);">3</a>
|
| 3741 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="4" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 4, true);">4</a>
|
| 3742 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="5" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 5, true);">5</a>
|
| 3743 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="6" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 6, true);">6</a>
|
| 3744 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="7" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 7, true);">7</a>
|
| 3745 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="8" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 8, true);">8</a>
|
| 3746 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="9" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 9, true);">9</a>
|
| 3747 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="10" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 10, true);">10</a>
|
| 3748 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="11" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 11, true);">11</a>
|
| 3749 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="12" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 12, true);">12</a>
|
| 3750 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="13" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 13, true);">13</a>
|
| 3751 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="14" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 14, true);">14</a>
|
| 3752 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="15" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 15, true);">15</a>
|
| 3753 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="16" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 16, true);">16</a>
|
| 3754 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="17" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 17, true);">17</a>
|
| 3755 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="18" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 18, true);">18</a>
|
| 3756 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="19" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 19, true);">19</a>
|
| 3757 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="20" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 20, true);">20</a>
|
| 3758 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="21" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 21, true);">21</a>
|
| 3759 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="22" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 22, true);">22</a>
|
| 3760 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="23" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 23, true);">23</a>
|
| 3761 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="24" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 24, true);">24</a>
|
| 3762 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="25" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 25, true);">25</a>
|
| 3763 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="26" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 26, true);">26</a>
|
| 3764 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="27" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 27, true);">27</a>
|
| 3765 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="28" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 28, true);">28</a>
|
| 3766 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="29" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 29, true);">29</a>
|
| 3767 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="30" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 30, true);">30</a>
|
| 3768 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="31" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 31, true);">31</a>
|
| 3769 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="32" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 32, true);">32</a>
|
| 3770 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="33" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 33, true);">33</a>
|
| 3771 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="34" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 34, true);">34</a>
|
| 3772 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="35" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 35, true);">35</a>
|
| 3773 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="36" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 36, true);">36</a>
|
| 3774 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="37" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 37, true);">37</a>
|
| 3775 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="38" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 38, true);">38</a>
|
| 3776 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="39" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 39, true);">39</a>
|
| 3777 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="40" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 40, true);">40</a>
|
| 3778 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="41" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 41, true);">41</a>
|
| 3779 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="42" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 42, true);">42</a>
|
| 3780 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="43" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 43, true);">43</a>
|
| 3781 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="44" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 44, true);">44</a>
|
| 3782 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="45" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 45, true);">45</a>
|
| 3783 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="46" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 46, true);">46</a>
|
| 3784 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="47" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 47, true);">47</a>
|
| 3785 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="48" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 48, true);">48</a>
|
| 3786 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="49" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 49, true);">49</a>
|
| 3787 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="50" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 50, true);">50</a>
|
| 3788 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="51" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 51, true);">51</a>
|
| 3789 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="52" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 52, true);">52</a>
|
| 3790 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="53" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 53, true);">53</a>
|
| 3791 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="54" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 54, true);">54</a>
|
| 3792 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="55" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 55, true);">55</a>
|
| 3793 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="56" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 56, true);">56</a>
|
| 3794 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="57" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 57, true);">57</a>
|
| 3795 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="58" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 58, true);">58</a>
|
| 3796 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="59" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 59, true);">59</a>
|
| 3797 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="60" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 60, true);">60</a>
|
| 3798 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="61" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 61, true);">61</a>
|
| 3799 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="62" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 62, true);">62</a>
|
| 3800 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="63" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 63, true);">63</a>
|
| 3801 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="64" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 64, true);">64</a>
|
| 3802 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="65" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 65, true);">65</a>
|
| 3803 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="66" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 66, true);">66</a>
|
| 3804 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="67" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 67, true);">67</a>
|
| 3805 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="68" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 68, true);">68</a>
|
| 3806 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="69" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 69, true);">69</a>
|
| 3807 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="70" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 70, true);">70</a>
|
| 3808 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="71" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 71, true);">71</a>
|
| 3809 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="72" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 72, true);">72</a>
|
| 3810 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="73" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 73, true);">73</a>
|
| 3811 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="74" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 74, true);">74</a>
|
| 3812 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="75" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 75, true);">75</a>
|
| 3813 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="76" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 76, true);">76</a>
|
| 3814 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="77" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 77, true);">77</a>
|
| 3815 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="78" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 78, true);">78</a>
|
| 3816 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="79" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 79, true);">79</a>
|
| 3817 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="80" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 80, true);">80</a>
|
| 3818 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="81" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 81, true);">81</a>
|
| 3819 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="82" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 82, true);">82</a>
|
| 3820 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="83" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 83, true);">83</a>
|
| 3821 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="84" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 84, true);">84</a>
|
| 3822 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="85" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 85, true);">85</a>
|
| 3823 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="86" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 86, true);">86</a>
|
| 3824 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="87" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 87, true);">87</a>
|
| 3825 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="88" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 88, true);">88</a>
|
| 3826 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="89" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 89, true);">89</a>
|
| 3827 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="90" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 90, true);">90</a>
|
| 3828 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="91" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 91, true);">91</a>
|
| 3829 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="92" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 92, true);">92</a>
|
| 3830 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="93" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 93, true);">93</a>
|
| 3831 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="94" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 94, true);">94</a>
|
| 3832 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="95" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 95, true);">95</a>
|
| 3833 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="96" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 96, true);">96</a>
|
| 3834 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="97" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 97, true);">97</a>
|
| 3835 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="98" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 98, true);">98</a>
|
| 3836 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="99" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 99, true);">99</a>
|
| 3837 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="100" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 100, true);">100</a>
|
| 3838 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="101" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 101, true);">101</a>
|
| 3839 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="102" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 102, true);">102</a>
|
| 3840 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="103" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 103, true);">103</a>
|
| 3841 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="104" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 104, true);">104</a>
|
| 3842 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="105" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 105, true);">105</a>
|
| 3843 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="106" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 106, true);">106</a>
|
| 3844 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="107" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 107, true);">107</a>
|
| 3845 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="108" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 108, true);">108</a>
|
| 3846 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="109" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 109, true);">109</a>
|
| 3847 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="110" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 110, true);">110</a>
|
| 3848 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="111" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 111, true);">111</a>
|
| 3849 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="112" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 112, true);">112</a>
|
| 3850 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="113" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 113, true);">113</a>
|
| 3851 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="114" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 114, true);">114</a>
|
| 3852 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="115" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 115, true);">115</a>
|
| 3853 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="116" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 116, true);">116</a>
|
| 3854 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="117" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 117, true);">117</a>
|
| 3855 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="118" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 118, true);">118</a>
|
| 3856 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="119" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 119, true);">119</a>
|
| 3857 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="120" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 120, true);">120</a>
|
| 3858 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="121" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 121, true);">121</a>
|
| 3859 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="122" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 122, true);">122</a>
|
| 3860 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="123" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 123, true);">123</a>
|
| 3861 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="124" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 124, true);">124</a>
|
| 3862 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="125" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 125, true);">125</a>
|
| 3863 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="126" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 126, true);">126</a>
|
| 3864 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="127" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 127, true);">127</a>
|
| 3865 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="128" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 128, true);">128</a>
|
| 3866 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="129" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 129, true);">129</a>
|
| 3867 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="130" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 130, true);">130</a>
|
| 3868 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="131" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 131, true);">131</a>
|
| 3869 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="132" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 132, true);">132</a>
|
| 3870 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="133" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 133, true);">133</a>
|
| 3871 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="134" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 134, true);">134</a>
|
| 3872 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="135" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 135, true);">135</a>
|
| 3873 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="136" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 136, true);">136</a>
|
| 3874 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="137" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 137, true);">137</a>
|
| 3875 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="138" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 138, true);">138</a>
|
| 3876 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="139" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 139, true);">139</a>
|
| 3877 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="140" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 140, true);">140</a>
|
| 3878 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="141" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 141, true);">141</a>
|
| 3879 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="142" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 142, true);">142</a>
|
| 3880 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="143" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 143, true);">143</a>
|
| 3881 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="144" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 144, true);">144</a>
|
| 3882 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="145" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 145, true);">145</a>
|
| 3883 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="146" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 146, true);">146</a>
|
| 3884 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="147" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 147, true);">147</a>
|
| 3885 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="148" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 148, true);">148</a>
|
| 3886 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="149" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 149, true);">149</a>
|
| 3887 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="150" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 150, true);">150</a>
|
| 3888 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="151" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 151, true);">151</a>
|
| 3889 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="152" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 152, true);">152</a>
|
| 3890 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="153" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 153, true);">153</a>
|
| 3891 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="154" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 154, true);">154</a>
|
| 3892 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="155" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 155, true);">155</a>
|
| 3893 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="156" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 156, true);">156</a>
|
| 3894 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="157" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 157, true);">157</a>
|
| 3895 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="158" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 158, true);">158</a>
|
| 3896 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="159" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 159, true);">159</a>
|
| 3897 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="160" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 160, true);">160</a>
|
| 3898 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="161" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 161, true);">161</a>
|
| 3899 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="162" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 162, true);">162</a>
|
| 3900 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="163" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 163, true);">163</a>
|
| 3901 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="164" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 164, true);">164</a>
|
| 3902 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="165" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 165, true);">165</a>
|
| 3903 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="166" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 166, true);">166</a>
|
| 3904 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="167" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 167, true);">167</a>
|
| 3905 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="168" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 168, true);">168</a>
|
| 3906 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="169" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 169, true);">169</a>
|
| 3907 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="170" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 170, true);">170</a>
|
| 3908 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="171" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 171, true);">171</a>
|
| 3909 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="172" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 172, true);">172</a>
|
| 3910 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="173" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 173, true);">173</a>
|
| 3911 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="174" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 174, true);">174</a>
|
| 3912 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="175" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 175, true);">175</a>
|
| 3913 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="176" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 176, true);">176</a>
|
| 3914 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="177" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 177, true);">177</a>
|
| 3915 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="178" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 178, true);">178</a>
|
| 3916 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="179" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 179, true);">179</a>
|
| 3917 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="180" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 180, true);">180</a>
|
| 3918 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="181" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 181, true);">181</a>
|
| 3919 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="182" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 182, true);">182</a>
|
| 3920 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="183" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 183, true);">183</a>
|
| 3921 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="184" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 184, true);">184</a>
|
| 3922 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="185" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 185, true);">185</a>
|
| 3923 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="186" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 186, true);">186</a>
|
| 3924 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="187" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 187, true);">187</a>
|
| 3925 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="188" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 188, true);">188</a>
|
| 3926 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="189" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 189, true);">189</a>
|
| 3927 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="190" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 190, true);">190</a>
|
| 3928 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="191" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 191, true);">191</a>
|
| 3929 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="192" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 192, true);">192</a>
|
| 3930 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="193" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 193, true);">193</a>
|
| 3931 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="194" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 194, true);">194</a>
|
| 3932 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="195" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 195, true);">195</a>
|
| 3933 |
+
<a class="line-number" data-cell="forward_and_backward_no_kernel" data-line="196" href="#cell-forward_and_backward_no_kernel" onclick="event.preventDefault(); selectCellLine('forward_and_backward_no_kernel', 196, true);">196</a>
|
| 3934 |
</div>
|
| 3935 |
<div class="code-wrap">
|
| 3936 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
|
|
|
| 3957 |
<span class="kn">import</span><span class="w"> </span><span class="nn">logging</span>
|
| 3958 |
<span class="kn">from</span><span class="w"> </span><span class="nn">transformers.models.gpt_oss.modeling_gpt_oss</span><span class="w"> </span><span class="kn">import</span> <span class="n">GptOssRMSNorm</span>
|
| 3959 |
|
| 3960 |
+
<span class="c1"># remove liger kernel for testing </span>
|
| 3961 |
<span class="n">replace_kernel_forward_from_hub</span><span class="p">(</span><span class="n">GptOssRMSNorm</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
|
| 3962 |
|
| 3963 |
<span class="c1"># set to debug logging</span>
|
|
|
|
| 3998 |
<span class="n">tokenizer</span> <span class="o">=</span> <span class="n">PreTrainedTokenizerFast</span><span class="o">.</span><span class="n">from_pretrained</span><span class="p">(</span><span class="n">model_id</span><span class="p">)</span>
|
| 3999 |
<span class="n">quantization_config</span> <span class="o">=</span> <span class="n">Mxfp4Config</span><span class="p">(</span><span class="n">dequantize</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
|
| 4000 |
|
|
|
|
|
|
|
| 4001 |
<span class="n">model</span> <span class="o">=</span> <span class="n">GptOssForCausalLM</span><span class="o">.</span><span class="n">from_pretrained</span><span class="p">(</span>
|
| 4002 |
<span class="n">model_id</span><span class="p">,</span>
|
| 4003 |
<span class="n">dtype</span><span class="o">=</span><span class="s2">"bfloat16"</span><span class="p">,</span>
|
| 4004 |
<span class="n">device_map</span><span class="o">=</span><span class="s2">"auto"</span><span class="p">,</span>
|
| 4005 |
+
<span class="n">use_kernels</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
|
| 4006 |
<span class="n">quantization_config</span><span class="o">=</span><span class="n">quantization_config</span><span class="p">,</span>
|
| 4007 |
<span class="p">)</span><span class="o">.</span><span class="n">eval</span><span class="p">()</span>
|
| 4008 |
|
|
|
|
| 4018 |
<span class="n">reasoning_effort</span><span class="o">=</span><span class="s2">"low"</span><span class="p">,</span>
|
| 4019 |
<span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="s2">"cuda"</span><span class="p">)</span>
|
| 4020 |
|
| 4021 |
+
<span class="n">max_tokens</span> <span class="o">=</span> <span class="mi">128</span> <span class="c1"># Reduced to help with memory usage</span>
|
| 4022 |
|
| 4023 |
+
<span class="c1"># Clear memory before backward pass</span>
|
| 4024 |
+
<span class="n">reset_peak_memory_stats</span><span class="p">()</span>
|
| 4025 |
+
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Pre-generation memory: </span><span class="si">{</span><span class="n">get_memory_stats</span><span class="p">()</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
|
| 4026 |
+
|
| 4027 |
+
<span class="c1"># forward and backward pass</span>
|
| 4028 |
+
<span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">autograd</span><span class="o">.</span><span class="n">set_grad_enabled</span><span class="p">(</span><span class="kc">True</span><span class="p">):</span>
|
| 4029 |
<span class="n">start_time</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">perf_counter</span><span class="p">()</span>
|
| 4030 |
<span class="n">generated</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">generate</span><span class="p">(</span>
|
| 4031 |
<span class="o">**</span><span class="n">inputs</span><span class="p">,</span>
|
|
|
|
| 4034 |
<span class="n">temperature</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
|
| 4035 |
<span class="p">)</span>
|
| 4036 |
<span class="n">end_time</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">perf_counter</span><span class="p">()</span>
|
| 4037 |
+
<span class="nb">print</span><span class="p">(</span><span class="n">tokenizer</span><span class="o">.</span><span class="n">decode</span><span class="p">(</span><span class="n">generated</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">skip_special_tokens</span><span class="o">=</span><span class="kc">False</span><span class="p">))</span>
|
| 4038 |
+
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Generation took </span><span class="si">{</span><span class="n">end_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">start_time</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2"> seconds"</span><span class="p">)</span>
|
| 4039 |
+
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Post-generation memory: </span><span class="si">{</span><span class="n">get_memory_stats</span><span class="p">()</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
|
| 4040 |
+
|
| 4041 |
+
<span class="c1"># Use gradient checkpointing to reduce memory usage</span>
|
| 4042 |
+
<span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="s1">'gradient_checkpointing_enable'</span><span class="p">):</span>
|
| 4043 |
+
<span class="n">model</span><span class="o">.</span><span class="n">gradient_checkpointing_enable</span><span class="p">()</span>
|
| 4044 |
+
<span class="nb">print</span><span class="p">(</span><span class="s2">"Enabled gradient checkpointing"</span><span class="p">)</span>
|
| 4045 |
+
|
| 4046 |
+
<span class="c1"># Reduce sequence length if needed for memory</span>
|
| 4047 |
+
<span class="n">max_seq_len</span> <span class="o">=</span> <span class="mi">512</span> <span class="c1"># Limit sequence length for backward pass</span>
|
| 4048 |
+
<span class="k">if</span> <span class="n">generated</span><span class="o">.</span><span class="n">size</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> <span class="o">></span> <span class="n">max_seq_len</span><span class="p">:</span>
|
| 4049 |
+
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Truncating sequence from </span><span class="si">{</span><span class="n">generated</span><span class="o">.</span><span class="n">size</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="si">}</span><span class="s2"> to </span><span class="si">{</span><span class="n">max_seq_len</span><span class="si">}</span><span class="s2"> tokens"</span><span class="p">)</span>
|
| 4050 |
+
<span class="n">full_sequence</span> <span class="o">=</span> <span class="n">generated</span><span class="p">[:,</span> <span class="o">-</span><span class="n">max_seq_len</span><span class="p">:]</span>
|
| 4051 |
+
<span class="k">else</span><span class="p">:</span>
|
| 4052 |
+
<span class="n">full_sequence</span> <span class="o">=</span> <span class="n">generated</span>
|
| 4053 |
+
|
| 4054 |
+
<span class="c1"># Get model outputs for the full sequence</span>
|
| 4055 |
+
<span class="n">model</span><span class="o">.</span><span class="n">train</span><span class="p">()</span> <span class="c1"># Enable dropout and other training behaviors</span>
|
| 4056 |
+
|
| 4057 |
+
<span class="k">try</span><span class="p">:</span>
|
| 4058 |
+
<span class="n">outputs</span> <span class="o">=</span> <span class="n">model</span><span class="p">(</span>
|
| 4059 |
+
<span class="n">input_ids</span><span class="o">=</span><span class="n">full_sequence</span><span class="p">,</span>
|
| 4060 |
+
<span class="n">labels</span><span class="o">=</span><span class="n">full_sequence</span><span class="p">,</span> <span class="c1"># This will compute loss internally</span>
|
| 4061 |
+
<span class="n">return_dict</span><span class="o">=</span><span class="kc">True</span>
|
| 4062 |
+
<span class="p">)</span>
|
| 4063 |
+
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Post-forward memory: </span><span class="si">{</span><span class="n">get_memory_stats</span><span class="p">()</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
|
| 4064 |
+
|
| 4065 |
+
<span class="c1"># If model doesn't compute loss, compute it manually</span>
|
| 4066 |
+
<span class="k">if</span> <span class="n">outputs</span><span class="o">.</span><span class="n">loss</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
|
| 4067 |
+
<span class="n">shift_logits</span> <span class="o">=</span> <span class="n">outputs</span><span class="o">.</span><span class="n">logits</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="p">:</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="p">:]</span><span class="o">.</span><span class="n">contiguous</span><span class="p">()</span>
|
| 4068 |
+
<span class="n">shift_labels</span> <span class="o">=</span> <span class="n">full_sequence</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="mi">1</span><span class="p">:]</span><span class="o">.</span><span class="n">contiguous</span><span class="p">()</span>
|
| 4069 |
+
|
| 4070 |
+
<span class="c1"># Use CrossEntropyLoss with ignore_index for padding tokens</span>
|
| 4071 |
+
<span class="n">loss_fct</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">CrossEntropyLoss</span><span class="p">(</span><span class="n">ignore_index</span><span class="o">=</span><span class="n">tokenizer</span><span class="o">.</span><span class="n">pad_token_id</span> <span class="k">if</span> <span class="n">tokenizer</span><span class="o">.</span><span class="n">pad_token_id</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> <span class="o">-</span><span class="mi">100</span><span class="p">)</span>
|
| 4072 |
+
<span class="n">loss</span> <span class="o">=</span> <span class="n">loss_fct</span><span class="p">(</span>
|
| 4073 |
+
<span class="n">shift_logits</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="n">shift_logits</span><span class="o">.</span><span class="n">size</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)),</span>
|
| 4074 |
+
<span class="n">shift_labels</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>
|
| 4075 |
+
<span class="p">)</span>
|
| 4076 |
+
<span class="k">else</span><span class="p">:</span>
|
| 4077 |
+
<span class="n">loss</span> <span class="o">=</span> <span class="n">outputs</span><span class="o">.</span><span class="n">loss</span>
|
| 4078 |
+
|
| 4079 |
+
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Loss: </span><span class="si">{</span><span class="n">loss</span><span class="o">.</span><span class="n">item</span><span class="p">()</span><span class="si">:</span><span class="s2">.4f</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
|
| 4080 |
+
|
| 4081 |
+
<span class="c1"># Clear intermediate tensors to save memory</span>
|
| 4082 |
+
<span class="k">del</span> <span class="n">outputs</span>
|
| 4083 |
+
<span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">empty_cache</span><span class="p">()</span>
|
| 4084 |
+
|
| 4085 |
+
<span class="c1"># Perform backward pass with memory management</span>
|
| 4086 |
+
<span class="nb">print</span><span class="p">(</span><span class="s2">"Running backward pass..."</span><span class="p">)</span>
|
| 4087 |
+
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Pre-backward memory: </span><span class="si">{</span><span class="n">get_memory_stats</span><span class="p">()</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
|
| 4088 |
+
|
| 4089 |
+
<span class="n">loss</span><span class="o">.</span><span class="n">backward</span><span class="p">()</span>
|
| 4090 |
+
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Post-backward memory: </span><span class="si">{</span><span class="n">get_memory_stats</span><span class="p">()</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
|
| 4091 |
+
|
| 4092 |
+
<span class="k">except</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">OutOfMemoryError</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
|
| 4093 |
+
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"OOM during forward/backward pass: </span><span class="si">{</span><span class="n">e</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
|
| 4094 |
+
<span class="nb">print</span><span class="p">(</span><span class="s2">"Try reducing max_tokens or max_seq_len"</span><span class="p">)</span>
|
| 4095 |
+
<span class="k">raise</span>
|
| 4096 |
+
|
| 4097 |
+
<span class="c1"># Calculate gradient statistics and print sample gradients</span>
|
| 4098 |
+
<span class="n">total_norm</span> <span class="o">=</span> <span class="mf">0.0</span>
|
| 4099 |
+
<span class="n">param_count</span> <span class="o">=</span> <span class="mi">0</span>
|
| 4100 |
+
<span class="n">grad_samples</span> <span class="o">=</span> <span class="p">{}</span>
|
| 4101 |
+
|
| 4102 |
+
<span class="k">for</span> <span class="n">name</span><span class="p">,</span> <span class="n">p</span> <span class="ow">in</span> <span class="n">model</span><span class="o">.</span><span class="n">named_parameters</span><span class="p">():</span>
|
| 4103 |
+
<span class="k">if</span> <span class="n">p</span><span class="o">.</span><span class="n">grad</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
|
| 4104 |
+
<span class="n">param_count</span> <span class="o">+=</span> <span class="mi">1</span>
|
| 4105 |
+
<span class="n">grad_norm</span> <span class="o">=</span> <span class="n">p</span><span class="o">.</span><span class="n">grad</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">norm</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">item</span><span class="p">()</span>
|
| 4106 |
+
<span class="n">total_norm</span> <span class="o">+=</span> <span class="n">grad_norm</span> <span class="o">**</span> <span class="mi">2</span>
|
| 4107 |
+
|
| 4108 |
+
<span class="c1"># Collect gradient statistics for key layers</span>
|
| 4109 |
+
<span class="k">if</span> <span class="nb">any</span><span class="p">(</span><span class="n">key</span> <span class="ow">in</span> <span class="n">name</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="p">[</span><span class="s1">'embed'</span><span class="p">,</span> <span class="s1">'lm_head'</span><span class="p">,</span> <span class="s1">'mlp.up'</span><span class="p">,</span> <span class="s1">'mlp.down'</span><span class="p">,</span> <span class="s1">'self_attn.q_proj'</span><span class="p">,</span> <span class="s1">'norm'</span><span class="p">]):</span>
|
| 4110 |
+
<span class="n">grad_samples</span><span class="p">[</span><span class="n">name</span><span class="p">]</span> <span class="o">=</span> <span class="p">{</span>
|
| 4111 |
+
<span class="s1">'norm'</span><span class="p">:</span> <span class="n">grad_norm</span><span class="p">,</span>
|
| 4112 |
+
<span class="s1">'mean'</span><span class="p">:</span> <span class="n">p</span><span class="o">.</span><span class="n">grad</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span><span class="o">.</span><span class="n">item</span><span class="p">(),</span>
|
| 4113 |
+
<span class="s1">'std'</span><span class="p">:</span> <span class="n">p</span><span class="o">.</span><span class="n">grad</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">std</span><span class="p">()</span><span class="o">.</span><span class="n">item</span><span class="p">(),</span>
|
| 4114 |
+
<span class="s1">'max'</span><span class="p">:</span> <span class="n">p</span><span class="o">.</span><span class="n">grad</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">max</span><span class="p">()</span><span class="o">.</span><span class="n">item</span><span class="p">(),</span>
|
| 4115 |
+
<span class="s1">'min'</span><span class="p">:</span> <span class="n">p</span><span class="o">.</span><span class="n">grad</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">min</span><span class="p">()</span><span class="o">.</span><span class="n">item</span><span class="p">(),</span>
|
| 4116 |
+
<span class="p">}</span>
|
| 4117 |
+
|
| 4118 |
+
<span class="n">total_norm</span> <span class="o">=</span> <span class="n">total_norm</span> <span class="o">**</span> <span class="mf">0.5</span>
|
| 4119 |
+
|
| 4120 |
+
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="se">\n</span><span class="s2">Gradient norm: </span><span class="si">{</span><span class="n">total_norm</span><span class="si">:</span><span class="s2">.4f</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
|
| 4121 |
+
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Parameters with gradients: </span><span class="si">{</span><span class="n">param_count</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
|
| 4122 |
+
|
| 4123 |
+
<span class="c1"># Print sample gradients from important layers</span>
|
| 4124 |
+
<span class="nb">print</span><span class="p">(</span><span class="s2">"</span><span class="se">\n</span><span class="s2">Sample gradient statistics:"</span><span class="p">)</span>
|
| 4125 |
+
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">stats</span><span class="p">)</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">grad_samples</span><span class="o">.</span><span class="n">items</span><span class="p">())[:</span><span class="mi">10</span><span class="p">]):</span>
|
| 4126 |
+
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">" </span><span class="si">{</span><span class="n">name</span><span class="p">[:</span><span class="mi">60</span><span class="p">]</span><span class="si">:</span><span class="s2"><60</span><span class="si">}</span><span class="s2"> | norm: </span><span class="si">{</span><span class="n">stats</span><span class="p">[</span><span class="s1">'norm'</span><span class="p">]</span><span class="si">:</span><span class="s2">.4e</span><span class="si">}</span><span class="s2"> | mean: </span><span class="si">{</span><span class="n">stats</span><span class="p">[</span><span class="s1">'mean'</span><span class="p">]</span><span class="si">:</span><span class="s2">.4e</span><span class="si">}</span><span class="s2"> | std: </span><span class="si">{</span><span class="n">stats</span><span class="p">[</span><span class="s1">'std'</span><span class="p">]</span><span class="si">:</span><span class="s2">.4e</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
|
| 4127 |
+
|
| 4128 |
+
<span class="c1"># Optional: zero gradients for next iteration</span>
|
| 4129 |
+
<span class="n">model</span><span class="o">.</span><span class="n">zero_grad</span><span class="p">()</span>
|
| 4130 |
+
<span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">()</span> <span class="c1"># Switch back to eval mode</span>
|
| 4131 |
</pre></div>
|
| 4132 |
|
| 4133 |
+
<div class="code-line-highlight" id="line-highlight-forward_and_backward_no_kernel"></div>
|
| 4134 |
</div>
|
| 4135 |
</div>
|
| 4136 |
</div>
|
| 4137 |
+
<div id="output-forward_and_backward_no_kernel" class="cell-output">
|
| 4138 |
<div class="cell-stderr">warning: The requested interpreter resolved to Python 3.11.13, which is incompatible with the script's Python requirement: `>=3.12`
|
| 4139 |
Updating https://github.com/huggingface/transformers.git (HEAD)
|
| 4140 |
Updated https://github.com/huggingface/transformers.git (449533af73874470e914a203391635e04ac2ffc8)
|
|
|
|
| 4152 |
</div>
|
| 4153 |
</div>
|
| 4154 |
|
| 4155 |
+
<h1>Kernels</h1>
|
| 4156 |
+
<p>Next we can run with Megablocks kernels enabled.</p>
|
| 4157 |
+
<h2>Forward</h2>
|
| 4158 |
+
<p>First, we run a forward pass with Megablocks kernels.</p>
|
| 4159 |
<h2>Forward and Backward</h2>
|
| 4160 |
<p>Next, we run a forward and backward pass with Megablocks kernels enabled. This should be more memory efficient and allow us to complete the backward pass without running out of memory.</p>
|
| 4161 |
</div>
|
moe_benchmarks/megablocks_yamoe/artifacts/binned_run/binned_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms": 36.
|
| 13 |
-
"min_ms":
|
| 14 |
-
"max_ms":
|
| 15 |
-
"std_ms": 1.
|
| 16 |
-
"p50_ms": 36.
|
| 17 |
-
"p95_ms":
|
| 18 |
-
"p99_ms":
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
"output_sum": 3.97190523147583
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 36.727162420011155,
|
| 13 |
+
"min_ms": 33.18469400005597,
|
| 14 |
+
"max_ms": 39.16655900002297,
|
| 15 |
+
"std_ms": 1.746945665597604,
|
| 16 |
+
"p50_ms": 36.9490834999624,
|
| 17 |
+
"p95_ms": 38.664125449997755,
|
| 18 |
+
"p99_ms": 38.97265620000894,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 2722.7804548688473,
|
| 21 |
+
"throughput_variance": 132.97041596249102
|
| 22 |
},
|
| 23 |
"output_sum": 3.97190523147583
|
| 24 |
}
|
moe_benchmarks/megablocks_yamoe/artifacts/gptoss_run/gptoss_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms": 45.
|
| 13 |
-
"min_ms":
|
| 14 |
-
"max_ms":
|
| 15 |
-
"std_ms":
|
| 16 |
-
"p50_ms":
|
| 17 |
-
"p95_ms":
|
| 18 |
-
"p99_ms":
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
"output_sum": 11.53223705291748
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 45.42313148000403,
|
| 13 |
+
"min_ms": 38.7863710000147,
|
| 14 |
+
"max_ms": 51.6831769999726,
|
| 15 |
+
"std_ms": 3.6706212937228724,
|
| 16 |
+
"p50_ms": 45.48181749999003,
|
| 17 |
+
"p95_ms": 50.886562249957024,
|
| 18 |
+
"p99_ms": 51.415023030010616,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 2201.521487879398,
|
| 21 |
+
"throughput_variance": 179.47325216420964
|
| 22 |
},
|
| 23 |
"output_sum": 11.53223705291748
|
| 24 |
}
|
moe_benchmarks/megablocks_yamoe/artifacts/gptoss_training_run/gptoss_training_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms":
|
| 13 |
-
"min_ms":
|
| 14 |
-
"max_ms":
|
| 15 |
-
"std_ms":
|
| 16 |
-
"p50_ms":
|
| 17 |
-
"p95_ms":
|
| 18 |
-
"p99_ms":
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
"output_sum": 11.53223705291748
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 47.223061700003655,
|
| 13 |
+
"min_ms": 40.87468699992769,
|
| 14 |
+
"max_ms": 51.39806599993335,
|
| 15 |
+
"std_ms": 3.1109522065149875,
|
| 16 |
+
"p50_ms": 47.68390750001572,
|
| 17 |
+
"p95_ms": 51.25916855005812,
|
| 18 |
+
"p99_ms": 51.36193830001844,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 2117.6094137071223,
|
| 21 |
+
"throughput_variance": 143.64050741882156
|
| 22 |
},
|
| 23 |
"output_sum": 11.53223705291748
|
| 24 |
}
|
moe_benchmarks/megablocks_yamoe/artifacts/megablocks_run/megablocks_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms": 3.
|
| 13 |
-
"min_ms": 0.
|
| 14 |
-
"max_ms": 8.
|
| 15 |
-
"std_ms": 3.
|
| 16 |
-
"p50_ms": 0.
|
| 17 |
-
"p95_ms": 8.
|
| 18 |
-
"p99_ms": 8.
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
"output_sum": 6.4738850593566895
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 3.829988559998583,
|
| 13 |
+
"min_ms": 0.8193480000500131,
|
| 14 |
+
"max_ms": 8.440041999961068,
|
| 15 |
+
"std_ms": 3.6590563761858443,
|
| 16 |
+
"p50_ms": 0.8861204999561778,
|
| 17 |
+
"p95_ms": 8.437156600047047,
|
| 18 |
+
"p99_ms": 8.439223699980403,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 26109.738562779676,
|
| 21 |
+
"throughput_variance": 51610.70870547587
|
| 22 |
},
|
| 23 |
"output_sum": 6.4738850593566895
|
| 24 |
}
|
moe_benchmarks/megablocks_yamoe/artifacts/visualization/moe_performance_comparison.png
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
moe_benchmarks/megablocks_yamoe/artifacts/yamoe_run/yamoe_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms": 4.
|
| 13 |
-
"min_ms": 4.
|
| 14 |
-
"max_ms": 4.
|
| 15 |
-
"std_ms": 0.
|
| 16 |
-
"p50_ms": 4.
|
| 17 |
-
"p95_ms": 4.
|
| 18 |
-
"p99_ms": 4.
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
"output_sum": 3.97190523147583
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 4.248629200003506,
|
| 13 |
+
"min_ms": 4.131733000008353,
|
| 14 |
+
"max_ms": 4.277987000023131,
|
| 15 |
+
"std_ms": 0.02059489131384697,
|
| 16 |
+
"p50_ms": 4.252681999957986,
|
| 17 |
+
"p95_ms": 4.26323250002838,
|
| 18 |
+
"p99_ms": 4.27199430000428,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 23537.003417459324,
|
| 21 |
+
"throughput_variance": 116.25267146840179
|
| 22 |
},
|
| 23 |
"output_sum": 3.97190523147583
|
| 24 |
}
|
moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc
CHANGED
|
Binary files a/moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc and b/moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc differ
|
|
|
moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc
CHANGED
|
Binary files a/moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc and b/moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc differ
|
|
|
moe_benchmarks/megablocks_yamoe/cells/setup2.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# /// script
|
| 2 |
+
# requires-python = ">=3.12"
|
| 3 |
+
# dependencies = [
|
| 4 |
+
# "accelerate>=1.10.1",
|
| 5 |
+
# "torch>=2.7.0",
|
| 6 |
+
# "kernels==0.10.0",
|
| 7 |
+
# "transformers@https://github.com/huggingface/transformers.git",
|
| 8 |
+
# "ipdb>=0.13.13",
|
| 9 |
+
# "matplotlib>=3.7.2",
|
| 10 |
+
# "numpy>=1.24.3",
|
| 11 |
+
# ]
|
| 12 |
+
# ///
|
| 13 |
+
|
| 14 |
+
import torch
|
| 15 |
+
from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
|
| 16 |
+
import time
|
| 17 |
+
import torch.nn as nn
|
| 18 |
+
from kernels import register_kernel_mapping, Mode, LayerRepository
|
| 19 |
+
import sys
|
| 20 |
+
import torch.profiler
|
| 21 |
+
import gc
|
| 22 |
+
import logging
|
| 23 |
+
|
| 24 |
+
# set logging level to INFO
|
| 25 |
+
logging.basicConfig(level=logging.INFO)
|
| 26 |
+
|
| 27 |
+
def reset_peak_memory_stats():
    """Release cached CUDA memory and reset the peak-allocation counter.

    Python garbage collection runs first so that tensors kept alive only by
    reference cycles are destroyed before the CUDA caching allocator is asked
    to give back its unused blocks. Every CUDA call is skipped when no GPU is
    available, so the helper can be called unconditionally.
    """
    # Collect first: empty_cache() can only release blocks whose owning
    # tensors have already been freed.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
|
| 33 |
+
|
| 34 |
+
def get_memory_stats():
    """Report CUDA memory usage, expressed in units of 1e9 bytes.

    Returns:
        A dict with the keys ``allocated_gb``, ``peak_gb`` and
        ``reserved_gb``. All three values are 0 when CUDA is unavailable.
    """
    if torch.cuda.is_available():
        bytes_per_gb = 1e9
        allocated = torch.cuda.memory_allocated()
        peak = torch.cuda.max_memory_allocated()
        reserved = torch.cuda.memory_reserved()
        return {
            "allocated_gb": allocated / bytes_per_gb,
            "peak_gb": peak / bytes_per_gb,
            "reserved_gb": reserved / bytes_per_gb,
        }
    return dict(allocated_gb=0, peak_gb=0, reserved_gb=0)
|
| 43 |
+
|
| 44 |
+
def override_kernel_layer_name(cls_name: str, value) -> bool:
    """Set ``kernel_layer_name`` on the first loaded ``nn.Module`` subclass named *cls_name*.

    Every module currently present in ``sys.modules`` is inspected for an
    attribute called *cls_name*. The first one that is a class deriving from
    ``nn.Module`` gets its ``kernel_layer_name`` class attribute set to
    *value*, and the search stops.

    Args:
        cls_name: Name of the class to look for in the loaded modules.
        value: Value to assign to the class's ``kernel_layer_name``.

    Returns:
        True if a matching class was found and patched, False otherwise.
    """
    # Iterate over a snapshot: getattr() on a lazily-loading module can import
    # further modules, which would mutate sys.modules mid-iteration and raise
    # "RuntimeError: dictionary changed size during iteration".
    for mod in list(sys.modules.values()):
        if mod is None:
            continue
        try:
            obj = getattr(mod, cls_name, None)
        except Exception:
            # Module-level __getattr__ hooks may raise errors other than
            # AttributeError (e.g. a failed optional import). Such a module
            # cannot supply the class, so skip it rather than abort the scan.
            continue
        if isinstance(obj, type) and issubclass(obj, nn.Module):
            setattr(obj, "kernel_layer_name", value)
            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
            return True
    return False
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
# Init the model the normal way
model_id = "openai/gpt-oss-20b"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
# NOTE(review): dequantize=True presumably converts the MXFP4 checkpoint
# weights to the requested dtype at load time - confirm against the
# transformers Mxfp4Config documentation.
quantization_config = Mxfp4Config(dequantize=True)


# NOTE(review): register_kernel_mapping, LayerRepository and Mode are already
# imported at the top of the file; only replace_kernel_forward_from_hub is new.
from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode

# NOTE(review): GptOssMLP is imported but not used in this script.
from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm

# NOTE(review): passing None as the layer name presumably opts GptOssRMSNorm
# out of hub-kernel replacement - confirm against the kernels library docs.
replace_kernel_forward_from_hub(GptOssRMSNorm, None)  # direct, type-safe
# Map the "Yamoe" layer name, on CUDA devices in inference mode, to the
# "Yamoe" layer of the drbh/yamoe hub repository pinned at revision v0.3.0.
custom_mapping = {
    "Yamoe": {
        "cuda": {
            Mode.INFERENCE: LayerRepository(
                repo_id="drbh/yamoe",
                layer_name="Yamoe",
                revision="v0.3.0",
            )
        }
    }
}
register_kernel_mapping(custom_mapping)


# Load the model in bfloat16 with hub kernels enabled and switch to eval mode.
model = GptOssForCausalLM.from_pretrained(
    model_id,
    dtype="bfloat16",
    device_map="auto",
    use_kernels=True,
    quantization_config=quantization_config,
).eval()

# NOTE(review): the question is sent with the "system" role and there is no
# "user" turn - confirm this is intentional.
messages = [
    {"role": "system", "content": "What is Tensor Parallelism?"},
]

# Render the chat template to tensors and move them to the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="low",
).to("cuda")

max_tokens = 256

# Time a single greedy generation (do_sample=False) of up to max_tokens new
# tokens using wall-clock time from time.perf_counter().
with torch.inference_mode():
    start_time = time.perf_counter()
    generated = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=False,
        temperature=None,
    )
    end_time = time.perf_counter()

# Print the full decoded sequence, special tokens included, then the timing.
print(tokenizer.decode(generated[0], skip_special_tokens=False))
print(f"Generation took {end_time - start_time:.2f} seconds")
|
moe_benchmarks/megablocks_yamoe/megablocks_yamoe.html
CHANGED
|
@@ -3718,138 +3718,136 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3718 |
<h1>Comparison of Megablocks and Yamoe Kernels</h1>
|
| 3719 |
<p>This note compares the performance of the Megablocks and Yamoe kernels on the GPT-OSS-20B model.</p>
|
| 3720 |
<h2>Megablocks kernel</h2>
|
| 3721 |
-
<
|
| 3722 |
-
<div class="cell cell-failed" id="cell-setup">
|
| 3723 |
<div class="cell-header">
|
| 3724 |
<span class="collapse-indicators">
|
| 3725 |
-
<span onclick="toggleCode('
|
| 3726 |
-
<span onclick="toggleOutput('
|
| 3727 |
-
<span id="uv-indicator-
|
| 3728 |
</span> |
|
| 3729 |
-
Cell:
|
| 3730 |
-
| <button class="run-btn" onclick="runCell('
|
| 3731 |
-
<button class="copy-btn" onclick="copyCell('
|
| 3732 |
-
<a href="cells/
|
| 3733 |
</div>
|
| 3734 |
-
<div id="code-
|
| 3735 |
<div class="highlight-with-lines">
|
| 3736 |
-
<div class="line-numbers" id="lines-
|
| 3737 |
-
<a class="line-number" data-cell="
|
| 3738 |
-
<a class="line-number" data-cell="
|
| 3739 |
-
<a class="line-number" data-cell="
|
| 3740 |
-
<a class="line-number" data-cell="
|
| 3741 |
-
<a class="line-number" data-cell="
|
| 3742 |
-
<a class="line-number" data-cell="
|
| 3743 |
-
<a class="line-number" data-cell="
|
| 3744 |
-
<a class="line-number" data-cell="
|
| 3745 |
-
<a class="line-number" data-cell="
|
| 3746 |
-
<a class="line-number" data-cell="
|
| 3747 |
-
<a class="line-number" data-cell="
|
| 3748 |
-
<a class="line-number" data-cell="
|
| 3749 |
-
<a class="line-number" data-cell="
|
| 3750 |
-
<a class="line-number" data-cell="
|
| 3751 |
-
<a class="line-number" data-cell="
|
| 3752 |
-
<a class="line-number" data-cell="
|
| 3753 |
-
<a class="line-number" data-cell="
|
| 3754 |
-
<a class="line-number" data-cell="
|
| 3755 |
-
<a class="line-number" data-cell="
|
| 3756 |
-
<a class="line-number" data-cell="
|
| 3757 |
-
<a class="line-number" data-cell="
|
| 3758 |
-
<a class="line-number" data-cell="
|
| 3759 |
-
<a class="line-number" data-cell="
|
| 3760 |
-
<a class="line-number" data-cell="
|
| 3761 |
-
<a class="line-number" data-cell="
|
| 3762 |
-
<a class="line-number" data-cell="
|
| 3763 |
-
<a class="line-number" data-cell="
|
| 3764 |
-
<a class="line-number" data-cell="
|
| 3765 |
-
<a class="line-number" data-cell="
|
| 3766 |
-
<a class="line-number" data-cell="
|
| 3767 |
-
<a class="line-number" data-cell="
|
| 3768 |
-
<a class="line-number" data-cell="
|
| 3769 |
-
<a class="line-number" data-cell="
|
| 3770 |
-
<a class="line-number" data-cell="
|
| 3771 |
-
<a class="line-number" data-cell="
|
| 3772 |
-
<a class="line-number" data-cell="
|
| 3773 |
-
<a class="line-number" data-cell="
|
| 3774 |
-
<a class="line-number" data-cell="
|
| 3775 |
-
<a class="line-number" data-cell="
|
| 3776 |
-
<a class="line-number" data-cell="
|
| 3777 |
-
<a class="line-number" data-cell="
|
| 3778 |
-
<a class="line-number" data-cell="
|
| 3779 |
-
<a class="line-number" data-cell="
|
| 3780 |
-
<a class="line-number" data-cell="
|
| 3781 |
-
<a class="line-number" data-cell="
|
| 3782 |
-
<a class="line-number" data-cell="
|
| 3783 |
-
<a class="line-number" data-cell="
|
| 3784 |
-
<a class="line-number" data-cell="
|
| 3785 |
-
<a class="line-number" data-cell="
|
| 3786 |
-
<a class="line-number" data-cell="
|
| 3787 |
-
<a class="line-number" data-cell="
|
| 3788 |
-
<a class="line-number" data-cell="
|
| 3789 |
-
<a class="line-number" data-cell="
|
| 3790 |
-
<a class="line-number" data-cell="
|
| 3791 |
-
<a class="line-number" data-cell="
|
| 3792 |
-
<a class="line-number" data-cell="
|
| 3793 |
-
<a class="line-number" data-cell="
|
| 3794 |
-
<a class="line-number" data-cell="
|
| 3795 |
-
<a class="line-number" data-cell="
|
| 3796 |
-
<a class="line-number" data-cell="
|
| 3797 |
-
<a class="line-number" data-cell="
|
| 3798 |
-
<a class="line-number" data-cell="
|
| 3799 |
-
<a class="line-number" data-cell="
|
| 3800 |
-
<a class="line-number" data-cell="
|
| 3801 |
-
<a class="line-number" data-cell="
|
| 3802 |
-
<a class="line-number" data-cell="
|
| 3803 |
-
<a class="line-number" data-cell="
|
| 3804 |
-
<a class="line-number" data-cell="
|
| 3805 |
-
<a class="line-number" data-cell="
|
| 3806 |
-
<a class="line-number" data-cell="
|
| 3807 |
-
<a class="line-number" data-cell="
|
| 3808 |
-
<a class="line-number" data-cell="
|
| 3809 |
-
<a class="line-number" data-cell="
|
| 3810 |
-
<a class="line-number" data-cell="
|
| 3811 |
-
<a class="line-number" data-cell="
|
| 3812 |
-
<a class="line-number" data-cell="
|
| 3813 |
-
<a class="line-number" data-cell="
|
| 3814 |
-
<a class="line-number" data-cell="
|
| 3815 |
-
<a class="line-number" data-cell="
|
| 3816 |
-
<a class="line-number" data-cell="
|
| 3817 |
-
<a class="line-number" data-cell="
|
| 3818 |
-
<a class="line-number" data-cell="
|
| 3819 |
-
<a class="line-number" data-cell="
|
| 3820 |
-
<a class="line-number" data-cell="
|
| 3821 |
-
<a class="line-number" data-cell="
|
| 3822 |
-
<a class="line-number" data-cell="
|
| 3823 |
-
<a class="line-number" data-cell="
|
| 3824 |
-
<a class="line-number" data-cell="
|
| 3825 |
-
<a class="line-number" data-cell="
|
| 3826 |
-
<a class="line-number" data-cell="
|
| 3827 |
-
<a class="line-number" data-cell="
|
| 3828 |
-
<a class="line-number" data-cell="
|
| 3829 |
-
<a class="line-number" data-cell="
|
| 3830 |
-
<a class="line-number" data-cell="
|
| 3831 |
-
<a class="line-number" data-cell="
|
| 3832 |
-
<a class="line-number" data-cell="
|
| 3833 |
-
<a class="line-number" data-cell="
|
| 3834 |
-
<a class="line-number" data-cell="
|
| 3835 |
-
<a class="line-number" data-cell="
|
| 3836 |
-
<a class="line-number" data-cell="
|
| 3837 |
-
<a class="line-number" data-cell="
|
| 3838 |
-
<a class="line-number" data-cell="
|
| 3839 |
-
<a class="line-number" data-cell="
|
| 3840 |
-
<a class="line-number" data-cell="
|
| 3841 |
-
<a class="line-number" data-cell="
|
| 3842 |
-
<a class="line-number" data-cell="
|
| 3843 |
-
<a class="line-number" data-cell="
|
| 3844 |
-
<a class="line-number" data-cell="
|
| 3845 |
-
<a class="line-number" data-cell="
|
| 3846 |
-
<a class="line-number" data-cell="
|
| 3847 |
-
<a class="line-number" data-cell="
|
| 3848 |
-
<a class="line-number" data-cell="
|
| 3849 |
-
<a class="line-number" data-cell="
|
| 3850 |
-
<a class="line-number" data-cell="
|
| 3851 |
-
<a class="line-number" data-cell="
|
| 3852 |
-
<a class="line-number" data-cell="setup" data-line="116" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 116, true);">116</a>
|
| 3853 |
</div>
|
| 3854 |
<div class="code-wrap">
|
| 3855 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
|
@@ -3918,8 +3916,7 @@ Cell: setup | 17.01s | FAILED
|
|
| 3918 |
|
| 3919 |
<span class="kn">from</span><span class="w"> </span><span class="nn">transformers.models.gpt_oss.modeling_gpt_oss</span><span class="w"> </span><span class="kn">import</span> <span class="n">GptOssMLP</span><span class="p">,</span> <span class="n">GptOssRMSNorm</span>
|
| 3920 |
|
| 3921 |
-
<span class="n">replace_kernel_forward_from_hub</span><span class="p">(</span><span class="n">
|
| 3922 |
-
<span class="n">replace_kernel_forward_from_hub</span><span class="p">(</span><span class="n">GptOssRMSNorm</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
|
| 3923 |
<span class="n">custom_mapping</span> <span class="o">=</span> <span class="p">{</span>
|
| 3924 |
<span class="s2">"Yamoe"</span><span class="p">:</span> <span class="p">{</span>
|
| 3925 |
<span class="s2">"cuda"</span><span class="p">:</span> <span class="p">{</span>
|
|
@@ -3970,11 +3967,11 @@ Cell: setup | 17.01s | FAILED
|
|
| 3970 |
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Generation took </span><span class="si">{</span><span class="n">end_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">start_time</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2"> seconds"</span><span class="p">)</span>
|
| 3971 |
</pre></div>
|
| 3972 |
|
| 3973 |
-
<div class="code-line-highlight" id="line-highlight-
|
| 3974 |
</div>
|
| 3975 |
</div>
|
| 3976 |
</div>
|
| 3977 |
-
<div id="output-
|
| 3978 |
<div class="cell-stderr">warning: The requested interpreter resolved to Python 3.11.13, which is incompatible with the script's Python requirement: `>=3.12`
|
| 3979 |
Updating https://github.com/huggingface/transformers.git (HEAD)
|
| 3980 |
Updated https://github.com/huggingface/transformers.git (449533af73874470e914a203391635e04ac2ffc8)
|
|
@@ -3991,6 +3988,8 @@ Cell: setup | 17.01s | FAILED
|
|
| 3991 |
</div>
|
| 3992 |
</div>
|
| 3993 |
</div>
|
|
|
|
|
|
|
| 3994 |
</div>
|
| 3995 |
|
| 3996 |
</body>
|
|
|
|
| 3718 |
<h1>Comparison of Megablocks and Yamoe Kernels</h1>
|
| 3719 |
<p>This note compares the performance of the Megablocks and Yamoe kernels on the GPT-OSS-20B model.</p>
|
| 3720 |
<h2>Megablocks kernel</h2>
|
| 3721 |
+
<div class="cell cell-failed" id="cell-setup2">
|
|
|
|
| 3722 |
<div class="cell-header">
|
| 3723 |
<span class="collapse-indicators">
|
| 3724 |
+
<span onclick="toggleCode('setup2')" style="cursor: pointer;">▼ code</span>
|
| 3725 |
+
<span onclick="toggleOutput('setup2')" style="cursor: pointer;">▼ output</span>
|
| 3726 |
+
<span id="uv-indicator-setup2" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3727 |
</span> |
|
| 3728 |
+
Cell: setup2 | 16.98s | FAILED
|
| 3729 |
+
| <button class="run-btn" onclick="runCell('setup2')">▶ run</button>
|
| 3730 |
+
<button class="copy-btn" onclick="copyCell('setup2')">Copy</button>
|
| 3731 |
+
<a href="cells/setup2.py" target="_blank" class="raw-btn">Raw</a>
|
| 3732 |
</div>
|
| 3733 |
+
<div id="code-setup2" class="cell-code" data-lines="115">
|
| 3734 |
<div class="highlight-with-lines">
|
| 3735 |
+
<div class="line-numbers" id="lines-setup2">
|
| 3736 |
+
<a class="line-number" data-cell="setup2" data-line="1" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 1, true);">1</a>
|
| 3737 |
+
<a class="line-number" data-cell="setup2" data-line="2" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 2, true);">2</a>
|
| 3738 |
+
<a class="line-number" data-cell="setup2" data-line="3" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 3, true);">3</a>
|
| 3739 |
+
<a class="line-number" data-cell="setup2" data-line="4" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 4, true);">4</a>
|
| 3740 |
+
<a class="line-number" data-cell="setup2" data-line="5" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 5, true);">5</a>
|
| 3741 |
+
<a class="line-number" data-cell="setup2" data-line="6" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 6, true);">6</a>
|
| 3742 |
+
<a class="line-number" data-cell="setup2" data-line="7" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 7, true);">7</a>
|
| 3743 |
+
<a class="line-number" data-cell="setup2" data-line="8" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 8, true);">8</a>
|
| 3744 |
+
<a class="line-number" data-cell="setup2" data-line="9" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 9, true);">9</a>
|
| 3745 |
+
<a class="line-number" data-cell="setup2" data-line="10" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 10, true);">10</a>
|
| 3746 |
+
<a class="line-number" data-cell="setup2" data-line="11" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 11, true);">11</a>
|
| 3747 |
+
<a class="line-number" data-cell="setup2" data-line="12" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 12, true);">12</a>
|
| 3748 |
+
<a class="line-number" data-cell="setup2" data-line="13" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 13, true);">13</a>
|
| 3749 |
+
<a class="line-number" data-cell="setup2" data-line="14" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 14, true);">14</a>
|
| 3750 |
+
<a class="line-number" data-cell="setup2" data-line="15" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 15, true);">15</a>
|
| 3751 |
+
<a class="line-number" data-cell="setup2" data-line="16" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 16, true);">16</a>
|
| 3752 |
+
<a class="line-number" data-cell="setup2" data-line="17" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 17, true);">17</a>
|
| 3753 |
+
<a class="line-number" data-cell="setup2" data-line="18" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 18, true);">18</a>
|
| 3754 |
+
<a class="line-number" data-cell="setup2" data-line="19" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 19, true);">19</a>
|
| 3755 |
+
<a class="line-number" data-cell="setup2" data-line="20" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 20, true);">20</a>
|
| 3756 |
+
<a class="line-number" data-cell="setup2" data-line="21" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 21, true);">21</a>
|
| 3757 |
+
<a class="line-number" data-cell="setup2" data-line="22" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 22, true);">22</a>
|
| 3758 |
+
<a class="line-number" data-cell="setup2" data-line="23" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 23, true);">23</a>
|
| 3759 |
+
<a class="line-number" data-cell="setup2" data-line="24" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 24, true);">24</a>
|
| 3760 |
+
<a class="line-number" data-cell="setup2" data-line="25" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 25, true);">25</a>
|
| 3761 |
+
<a class="line-number" data-cell="setup2" data-line="26" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 26, true);">26</a>
|
| 3762 |
+
<a class="line-number" data-cell="setup2" data-line="27" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 27, true);">27</a>
|
| 3763 |
+
<a class="line-number" data-cell="setup2" data-line="28" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 28, true);">28</a>
|
| 3764 |
+
<a class="line-number" data-cell="setup2" data-line="29" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 29, true);">29</a>
|
| 3765 |
+
<a class="line-number" data-cell="setup2" data-line="30" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 30, true);">30</a>
|
| 3766 |
+
<a class="line-number" data-cell="setup2" data-line="31" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 31, true);">31</a>
|
| 3767 |
+
<a class="line-number" data-cell="setup2" data-line="32" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 32, true);">32</a>
|
| 3768 |
+
<a class="line-number" data-cell="setup2" data-line="33" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 33, true);">33</a>
|
| 3769 |
+
<a class="line-number" data-cell="setup2" data-line="34" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 34, true);">34</a>
|
| 3770 |
+
<a class="line-number" data-cell="setup2" data-line="35" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 35, true);">35</a>
|
| 3771 |
+
<a class="line-number" data-cell="setup2" data-line="36" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 36, true);">36</a>
|
| 3772 |
+
<a class="line-number" data-cell="setup2" data-line="37" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 37, true);">37</a>
|
| 3773 |
+
<a class="line-number" data-cell="setup2" data-line="38" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 38, true);">38</a>
|
| 3774 |
+
<a class="line-number" data-cell="setup2" data-line="39" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 39, true);">39</a>
|
| 3775 |
+
<a class="line-number" data-cell="setup2" data-line="40" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 40, true);">40</a>
|
| 3776 |
+
<a class="line-number" data-cell="setup2" data-line="41" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 41, true);">41</a>
|
| 3777 |
+
<a class="line-number" data-cell="setup2" data-line="42" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 42, true);">42</a>
|
| 3778 |
+
<a class="line-number" data-cell="setup2" data-line="43" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 43, true);">43</a>
|
| 3779 |
+
<a class="line-number" data-cell="setup2" data-line="44" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 44, true);">44</a>
|
| 3780 |
+
<a class="line-number" data-cell="setup2" data-line="45" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 45, true);">45</a>
|
| 3781 |
+
<a class="line-number" data-cell="setup2" data-line="46" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 46, true);">46</a>
|
| 3782 |
+
<a class="line-number" data-cell="setup2" data-line="47" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 47, true);">47</a>
|
| 3783 |
+
<a class="line-number" data-cell="setup2" data-line="48" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 48, true);">48</a>
|
| 3784 |
+
<a class="line-number" data-cell="setup2" data-line="49" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 49, true);">49</a>
|
| 3785 |
+
<a class="line-number" data-cell="setup2" data-line="50" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 50, true);">50</a>
|
| 3786 |
+
<a class="line-number" data-cell="setup2" data-line="51" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 51, true);">51</a>
|
| 3787 |
+
<a class="line-number" data-cell="setup2" data-line="52" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 52, true);">52</a>
|
| 3788 |
+
<a class="line-number" data-cell="setup2" data-line="53" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 53, true);">53</a>
|
| 3789 |
+
<a class="line-number" data-cell="setup2" data-line="54" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 54, true);">54</a>
|
| 3790 |
+
<a class="line-number" data-cell="setup2" data-line="55" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 55, true);">55</a>
|
| 3791 |
+
<a class="line-number" data-cell="setup2" data-line="56" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 56, true);">56</a>
|
| 3792 |
+
<a class="line-number" data-cell="setup2" data-line="57" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 57, true);">57</a>
|
| 3793 |
+
<a class="line-number" data-cell="setup2" data-line="58" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 58, true);">58</a>
|
| 3794 |
+
<a class="line-number" data-cell="setup2" data-line="59" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 59, true);">59</a>
|
| 3795 |
+
<a class="line-number" data-cell="setup2" data-line="60" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 60, true);">60</a>
|
| 3796 |
+
<a class="line-number" data-cell="setup2" data-line="61" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 61, true);">61</a>
|
| 3797 |
+
<a class="line-number" data-cell="setup2" data-line="62" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 62, true);">62</a>
|
| 3798 |
+
<a class="line-number" data-cell="setup2" data-line="63" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 63, true);">63</a>
|
| 3799 |
+
<a class="line-number" data-cell="setup2" data-line="64" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 64, true);">64</a>
|
| 3800 |
+
<a class="line-number" data-cell="setup2" data-line="65" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 65, true);">65</a>
|
| 3801 |
+
<a class="line-number" data-cell="setup2" data-line="66" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 66, true);">66</a>
|
| 3802 |
+
<a class="line-number" data-cell="setup2" data-line="67" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 67, true);">67</a>
|
| 3803 |
+
<a class="line-number" data-cell="setup2" data-line="68" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 68, true);">68</a>
|
| 3804 |
+
<a class="line-number" data-cell="setup2" data-line="69" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 69, true);">69</a>
|
| 3805 |
+
<a class="line-number" data-cell="setup2" data-line="70" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 70, true);">70</a>
|
| 3806 |
+
<a class="line-number" data-cell="setup2" data-line="71" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 71, true);">71</a>
|
| 3807 |
+
<a class="line-number" data-cell="setup2" data-line="72" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 72, true);">72</a>
|
| 3808 |
+
<a class="line-number" data-cell="setup2" data-line="73" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 73, true);">73</a>
|
| 3809 |
+
<a class="line-number" data-cell="setup2" data-line="74" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 74, true);">74</a>
|
| 3810 |
+
<a class="line-number" data-cell="setup2" data-line="75" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 75, true);">75</a>
|
| 3811 |
+
<a class="line-number" data-cell="setup2" data-line="76" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 76, true);">76</a>
|
| 3812 |
+
<a class="line-number" data-cell="setup2" data-line="77" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 77, true);">77</a>
|
| 3813 |
+
<a class="line-number" data-cell="setup2" data-line="78" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 78, true);">78</a>
|
| 3814 |
+
<a class="line-number" data-cell="setup2" data-line="79" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 79, true);">79</a>
|
| 3815 |
+
<a class="line-number" data-cell="setup2" data-line="80" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 80, true);">80</a>
|
| 3816 |
+
<a class="line-number" data-cell="setup2" data-line="81" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 81, true);">81</a>
|
| 3817 |
+
<a class="line-number" data-cell="setup2" data-line="82" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 82, true);">82</a>
|
| 3818 |
+
<a class="line-number" data-cell="setup2" data-line="83" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 83, true);">83</a>
|
| 3819 |
+
<a class="line-number" data-cell="setup2" data-line="84" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 84, true);">84</a>
|
| 3820 |
+
<a class="line-number" data-cell="setup2" data-line="85" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 85, true);">85</a>
|
| 3821 |
+
<a class="line-number" data-cell="setup2" data-line="86" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 86, true);">86</a>
|
| 3822 |
+
<a class="line-number" data-cell="setup2" data-line="87" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 87, true);">87</a>
|
| 3823 |
+
<a class="line-number" data-cell="setup2" data-line="88" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 88, true);">88</a>
|
| 3824 |
+
<a class="line-number" data-cell="setup2" data-line="89" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 89, true);">89</a>
|
| 3825 |
+
<a class="line-number" data-cell="setup2" data-line="90" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 90, true);">90</a>
|
| 3826 |
+
<a class="line-number" data-cell="setup2" data-line="91" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 91, true);">91</a>
|
| 3827 |
+
<a class="line-number" data-cell="setup2" data-line="92" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 92, true);">92</a>
|
| 3828 |
+
<a class="line-number" data-cell="setup2" data-line="93" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 93, true);">93</a>
|
| 3829 |
+
<a class="line-number" data-cell="setup2" data-line="94" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 94, true);">94</a>
|
| 3830 |
+
<a class="line-number" data-cell="setup2" data-line="95" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 95, true);">95</a>
|
| 3831 |
+
<a class="line-number" data-cell="setup2" data-line="96" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 96, true);">96</a>
|
| 3832 |
+
<a class="line-number" data-cell="setup2" data-line="97" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 97, true);">97</a>
|
| 3833 |
+
<a class="line-number" data-cell="setup2" data-line="98" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 98, true);">98</a>
|
| 3834 |
+
<a class="line-number" data-cell="setup2" data-line="99" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 99, true);">99</a>
|
| 3835 |
+
<a class="line-number" data-cell="setup2" data-line="100" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 100, true);">100</a>
|
| 3836 |
+
<a class="line-number" data-cell="setup2" data-line="101" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 101, true);">101</a>
|
| 3837 |
+
<a class="line-number" data-cell="setup2" data-line="102" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 102, true);">102</a>
|
| 3838 |
+
<a class="line-number" data-cell="setup2" data-line="103" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 103, true);">103</a>
|
| 3839 |
+
<a class="line-number" data-cell="setup2" data-line="104" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 104, true);">104</a>
|
| 3840 |
+
<a class="line-number" data-cell="setup2" data-line="105" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 105, true);">105</a>
|
| 3841 |
+
<a class="line-number" data-cell="setup2" data-line="106" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 106, true);">106</a>
|
| 3842 |
+
<a class="line-number" data-cell="setup2" data-line="107" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 107, true);">107</a>
|
| 3843 |
+
<a class="line-number" data-cell="setup2" data-line="108" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 108, true);">108</a>
|
| 3844 |
+
<a class="line-number" data-cell="setup2" data-line="109" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 109, true);">109</a>
|
| 3845 |
+
<a class="line-number" data-cell="setup2" data-line="110" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 110, true);">110</a>
|
| 3846 |
+
<a class="line-number" data-cell="setup2" data-line="111" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 111, true);">111</a>
|
| 3847 |
+
<a class="line-number" data-cell="setup2" data-line="112" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 112, true);">112</a>
|
| 3848 |
+
<a class="line-number" data-cell="setup2" data-line="113" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 113, true);">113</a>
|
| 3849 |
+
<a class="line-number" data-cell="setup2" data-line="114" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 114, true);">114</a>
|
| 3850 |
+
<a class="line-number" data-cell="setup2" data-line="115" href="#cell-setup2" onclick="event.preventDefault(); selectCellLine('setup2', 115, true);">115</a>
|
|
|
|
| 3851 |
</div>
|
| 3852 |
<div class="code-wrap">
|
| 3853 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
|
|
|
| 3916 |
|
| 3917 |
<span class="kn">from</span><span class="w"> </span><span class="nn">transformers.models.gpt_oss.modeling_gpt_oss</span><span class="w"> </span><span class="kn">import</span> <span class="n">GptOssMLP</span><span class="p">,</span> <span class="n">GptOssRMSNorm</span>
|
| 3918 |
|
| 3919 |
+
<span class="n">replace_kernel_forward_from_hub</span><span class="p">(</span><span class="n">GptOssRMSNorm</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span> <span class="c1"># direct, type-safe</span>
|
|
|
|
| 3920 |
<span class="n">custom_mapping</span> <span class="o">=</span> <span class="p">{</span>
|
| 3921 |
<span class="s2">"Yamoe"</span><span class="p">:</span> <span class="p">{</span>
|
| 3922 |
<span class="s2">"cuda"</span><span class="p">:</span> <span class="p">{</span>
|
|
|
|
| 3967 |
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Generation took </span><span class="si">{</span><span class="n">end_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">start_time</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2"> seconds"</span><span class="p">)</span>
|
| 3968 |
</pre></div>
|
| 3969 |
|
| 3970 |
+
<div class="code-line-highlight" id="line-highlight-setup2"></div>
|
| 3971 |
</div>
|
| 3972 |
</div>
|
| 3973 |
</div>
|
| 3974 |
+
<div id="output-setup2" class="cell-output">
|
| 3975 |
<div class="cell-stderr">warning: The requested interpreter resolved to Python 3.11.13, which is incompatible with the script's Python requirement: `>=3.12`
|
| 3976 |
Updating https://github.com/huggingface/transformers.git (HEAD)
|
| 3977 |
Updated https://github.com/huggingface/transformers.git (449533af73874470e914a203391635e04ac2ffc8)
|
|
|
|
| 3988 |
</div>
|
| 3989 |
</div>
|
| 3990 |
</div>
|
| 3991 |
+
|
| 3992 |
+
<h2>Yamoe Kernel</h2>
|
| 3993 |
</div>
|
| 3994 |
|
| 3995 |
</body>
|
moe_benchmarks/megablocks_yamoe/torch_profile.html
CHANGED
|
@@ -3720,7 +3720,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3720 |
<span onclick="toggleOutput('utils')" style="cursor: pointer;">▼ output</span>
|
| 3721 |
<span id="uv-indicator-utils" onclick="toggleUvLogsFromHeader('utils')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3722 |
</span> |
|
| 3723 |
-
Cell: utils | deps: torch, numpy |
|
| 3724 |
| <button class="run-btn" onclick="runCell('utils')">▶ run</button>
|
| 3725 |
<button class="copy-btn" onclick="copyCell('utils')">Copy</button>
|
| 3726 |
<a href="cells/utils.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3795,22 +3795,22 @@ Cell: utils | deps: torch, numpy | 35.49s
|
|
| 3795 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3796 |
<div class="uv-logs-content" style="display: none;">
|
| 3797 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
| 3798 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3799 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3800 |
-
Downloading
|
| 3801 |
-
Downloading
|
| 3802 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3803 |
-
Downloading torch (846.9MiB)
|
| 3804 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3805 |
Downloading setuptools (1.1MiB)
|
| 3806 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3807 |
-
Downloading
|
| 3808 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3809 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
|
|
| 3810 |
Downloading numpy (16.2MiB)
|
|
|
|
| 3811 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3812 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3813 |
-
Downloading
|
|
|
|
| 3814 |
Downloading triton (148.3MiB)
|
| 3815 |
Downloading nvidia-cufile-cu12
|
| 3816 |
Downloading setuptools
|
|
@@ -3830,7 +3830,7 @@ Downloading triton (148.3MiB)
|
|
| 3830 |
Downloading nvidia-cublas-cu12
|
| 3831 |
Downloading nvidia-cudnn-cu12
|
| 3832 |
Downloading torch
|
| 3833 |
-
Installed 26 packages in
|
| 3834 |
</div>
|
| 3835 |
</div>
|
| 3836 |
</div>
|
|
@@ -3843,7 +3843,7 @@ Installed 26 packages in 461ms
|
|
| 3843 |
<span onclick="toggleOutput('bench_utils')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
<span id="uv-indicator-bench_utils" onclick="toggleUvLogsFromHeader('bench_utils')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3845 |
</span> |
|
| 3846 |
-
Cell: bench_utils | deps: torch, numpy | 34.
|
| 3847 |
| <button class="run-btn" onclick="runCell('bench_utils')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('bench_utils')">Copy</button>
|
| 3849 |
<a href="cells/bench_utils.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4331,24 +4331,24 @@ Cell: bench_utils | deps: torch, numpy | 34.17s
|
|
| 4331 |
<div class="uv-install-logs" id="uv-logs-bench_utils">
|
| 4332 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4333 |
<div class="uv-logs-content" style="display: none;">
|
| 4334 |
-
Downloading
|
| 4335 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4336 |
-
Downloading sympy (6.0MiB)
|
| 4337 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4338 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4339 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4340 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4341 |
-
Downloading networkx (1.9MiB)
|
| 4342 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4343 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4344 |
-
Downloading torch (846.9MiB)
|
| 4345 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4346 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4347 |
-
Downloading numpy (16.2MiB)
|
| 4348 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
|
|
|
| 4349 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4350 |
-
Downloading nvidia-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4351 |
Downloading triton (148.3MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4352 |
Downloading nvidia-cufile-cu12
|
| 4353 |
Downloading setuptools
|
| 4354 |
Downloading networkx
|
|
@@ -4367,7 +4367,7 @@ Downloading triton (148.3MiB)
|
|
| 4367 |
Downloading nvidia-cublas-cu12
|
| 4368 |
Downloading nvidia-cudnn-cu12
|
| 4369 |
Downloading torch
|
| 4370 |
-
Installed 26 packages in
|
| 4371 |
</div>
|
| 4372 |
</div>
|
| 4373 |
</div>
|
|
@@ -4381,7 +4381,7 @@ Installed 26 packages in 507ms
|
|
| 4381 |
<span onclick="toggleOutput('config')" style="cursor: pointer;">▼ output</span>
|
| 4382 |
<span id="uv-indicator-config" onclick="toggleUvLogsFromHeader('config')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4383 |
</span> |
|
| 4384 |
-
Cell: config | deps: torch, numpy |
|
| 4385 |
| <button class="run-btn" onclick="runCell('config')">▶ run</button>
|
| 4386 |
<button class="copy-btn" onclick="copyCell('config')">Copy</button>
|
| 4387 |
<a href="cells/config.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4441,23 +4441,23 @@ Cell: config | deps: torch, numpy | 34.91s
|
|
| 4441 |
<div class="uv-install-logs" id="uv-logs-config">
|
| 4442 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4443 |
<div class="uv-logs-content" style="display: none;">
|
| 4444 |
-
Downloading
|
| 4445 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4446 |
-
Downloading setuptools (1.1MiB)
|
| 4447 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4448 |
-
Downloading
|
| 4449 |
-
Downloading
|
| 4450 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4451 |
-
Downloading numpy (16.2MiB)
|
| 4452 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4453 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4454 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4455 |
-
Downloading nvidia-
|
| 4456 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4457 |
Downloading networkx (1.9MiB)
|
|
|
|
| 4458 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4459 |
-
Downloading nvidia-cuda-
|
| 4460 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4461 |
Downloading triton (148.3MiB)
|
| 4462 |
Downloading nvidia-cufile-cu12
|
| 4463 |
Downloading setuptools
|
|
@@ -4474,10 +4474,10 @@ Downloading triton (148.3MiB)
|
|
| 4474 |
Downloading nvidia-cusparselt-cu12
|
| 4475 |
Downloading nvidia-cusparse-cu12
|
| 4476 |
Downloading nvidia-nccl-cu12
|
| 4477 |
-
Downloading nvidia-cudnn-cu12
|
| 4478 |
Downloading nvidia-cublas-cu12
|
|
|
|
| 4479 |
Downloading torch
|
| 4480 |
-
Installed 26 packages in
|
| 4481 |
</div>
|
| 4482 |
</div>
|
| 4483 |
</div>
|
|
@@ -4490,7 +4490,7 @@ Installed 26 packages in 572ms
|
|
| 4490 |
<span onclick="toggleOutput('save_data')" style="cursor: pointer;">▼ output</span>
|
| 4491 |
<span id="uv-indicator-save_data" onclick="toggleUvLogsFromHeader('save_data')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4492 |
</span> |
|
| 4493 |
-
Cell: save_data | deps: torch, numpy |
|
| 4494 |
| <button class="run-btn" onclick="runCell('save_data')">▶ run</button>
|
| 4495 |
<button class="copy-btn" onclick="copyCell('save_data')">Copy</button>
|
| 4496 |
<a href="cells/save_data.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4585,24 +4585,24 @@ Down sum: 206.729263
|
|
| 4585 |
<div class="uv-install-logs" id="uv-logs-save_data">
|
| 4586 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4587 |
<div class="uv-logs-content" style="display: none;">
|
| 4588 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4589 |
Downloading sympy (6.0MiB)
|
| 4590 |
-
Downloading nvidia-
|
| 4591 |
-
Downloading triton (148.3MiB)
|
| 4592 |
Downloading numpy (16.2MiB)
|
| 4593 |
-
Downloading networkx (1.9MiB)
|
| 4594 |
Downloading torch (846.9MiB)
|
| 4595 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
| 4596 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4597 |
-
Downloading nvidia-
|
| 4598 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4599 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4600 |
Downloading setuptools (1.1MiB)
|
|
|
|
| 4601 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4602 |
-
Downloading nvidia-
|
| 4603 |
-
Downloading
|
| 4604 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4605 |
-
Downloading nvidia-
|
|
|
|
|
|
|
| 4606 |
Downloading nvidia-cufile-cu12
|
| 4607 |
Downloading setuptools
|
| 4608 |
Downloading networkx
|
|
@@ -4621,17 +4621,17 @@ Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
| 4621 |
Downloading nvidia-cublas-cu12
|
| 4622 |
Downloading nvidia-cudnn-cu12
|
| 4623 |
Downloading torch
|
| 4624 |
-
Installed 26 packages in
|
| 4625 |
</div>
|
| 4626 |
</div>
|
| 4627 |
<div class="cell-artifacts">
|
| 4628 |
<h4>Artifacts:</h4>
|
| 4629 |
-
<a href="artifacts/save_data/
|
| 4630 |
<a href="artifacts/save_data/router_weight.pt" class="artifact" target="_blank">router_weight.pt</a>
|
| 4631 |
<a href="artifacts/save_data/down_proj_bias.pt" class="artifact" target="_blank">down_proj_bias.pt</a>
|
| 4632 |
<a href="artifacts/save_data/gate_up_proj.pt" class="artifact" target="_blank">gate_up_proj.pt</a>
|
| 4633 |
-
<a href="artifacts/save_data/down_proj.pt" class="artifact" target="_blank">down_proj.pt</a>
|
| 4634 |
<a href="artifacts/save_data/gate_up_proj_bias.pt" class="artifact" target="_blank">gate_up_proj_bias.pt</a>
|
|
|
|
| 4635 |
</div>
|
| 4636 |
</div>
|
| 4637 |
</div>
|
|
@@ -4645,7 +4645,7 @@ Installed 26 packages in 455ms
|
|
| 4645 |
<span onclick="toggleOutput('yamoe_run')" style="cursor: pointer;">▼ output</span>
|
| 4646 |
<span id="uv-indicator-yamoe_run" onclick="toggleUvLogsFromHeader('yamoe_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4647 |
</span> |
|
| 4648 |
-
Cell: yamoe_run | deps: torch, kernels, numpy | 38.
|
| 4649 |
| <button class="run-btn" onclick="runCell('yamoe_run')">▶ run</button>
|
| 4650 |
<button class="copy-btn" onclick="copyCell('yamoe_run')">Copy</button>
|
| 4651 |
<a href="cells/yamoe_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4938,10 +4938,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 4938 |
|
| 4939 |
Warming up (10 iterations)...
|
| 4940 |
Benchmarking (50 iterations)...
|
| 4941 |
-
Progress: 20% complete (avg: 4.
|
| 4942 |
-
Progress: 40% complete (avg: 4.
|
| 4943 |
Progress: 60% complete (avg: 4.248 ms)
|
| 4944 |
-
Progress: 80% complete (avg: 4.
|
| 4945 |
|
| 4946 |
Output tensors:
|
| 4947 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
|
|
@@ -4951,19 +4951,19 @@ Output tensors:
|
|
| 4951 |
Iterations: 50
|
| 4952 |
|
| 4953 |
Latency Statistics:
|
| 4954 |
-
Average: 4.
|
| 4955 |
-
Min: 4.
|
| 4956 |
-
Max: 4.
|
| 4957 |
-
Std Dev: 0.
|
| 4958 |
|
| 4959 |
Percentiles:
|
| 4960 |
-
P50 (median): 4.
|
| 4961 |
-
P95: 4.
|
| 4962 |
-
P99: 4.
|
| 4963 |
|
| 4964 |
Throughput:
|
| 4965 |
-
Tokens/sec:
|
| 4966 |
-
Std Dev:
|
| 4967 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 4968 |
|
| 4969 |
Saved benchmark results to yamoe_results.json
|
|
@@ -4973,25 +4973,25 @@ Output sum: 3.971905
|
|
| 4973 |
<div class="uv-install-logs" id="uv-logs-yamoe_run">
|
| 4974 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4975 |
<div class="uv-logs-content" style="display: none;">
|
| 4976 |
-
Downloading nvidia-
|
| 4977 |
-
Downloading numpy (16.2MiB)
|
| 4978 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4979 |
-
Downloading hf-xet (3.0MiB)
|
| 4980 |
-
Downloading networkx (1.9MiB)
|
| 4981 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4982 |
Downloading setuptools (1.1MiB)
|
|
|
|
|
|
|
| 4983 |
Downloading sympy (6.0MiB)
|
| 4984 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4985 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4986 |
-
Downloading
|
| 4987 |
-
Downloading nvidia-
|
|
|
|
| 4988 |
Downloading triton (148.3MiB)
|
|
|
|
| 4989 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4990 |
-
Downloading
|
| 4991 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4992 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4993 |
-
Downloading nvidia-
|
| 4994 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
|
|
|
|
|
|
| 4995 |
Downloading nvidia-cufile-cu12
|
| 4996 |
Downloading hf-xet
|
| 4997 |
Downloading setuptools
|
|
@@ -5011,14 +5011,14 @@ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
| 5011 |
Downloading nvidia-cublas-cu12
|
| 5012 |
Downloading nvidia-cudnn-cu12
|
| 5013 |
Downloading torch
|
| 5014 |
-
Installed 37 packages in
|
| 5015 |
</div>
|
| 5016 |
</div>
|
| 5017 |
<div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s]
|
| 5018 |
-
Fetching 6 files: 17%|█▋ | 1/6 [00:00<00:01, 3.
|
| 5019 |
-
Fetching 6 files: 33%|███▎ | 2/6 [00:00<00:01, 3.
|
| 5020 |
-
Fetching 6 files: 50%|█████ | 3/6 [00:00<00:
|
| 5021 |
-
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00,
|
| 5022 |
<div class="cell-artifacts">
|
| 5023 |
<h4>Artifacts:</h4>
|
| 5024 |
<a href="artifacts/yamoe_run/yamoe_results.json" class="artifact" target="_blank">yamoe_results.json</a>
|
|
@@ -5035,7 +5035,7 @@ Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 6.1
|
|
| 5035 |
<span onclick="toggleOutput('binned_run')" style="cursor: pointer;">▼ output</span>
|
| 5036 |
<span id="uv-indicator-binned_run" onclick="toggleUvLogsFromHeader('binned_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5037 |
</span> |
|
| 5038 |
-
Cell: binned_run | deps: torch, numpy | 39.
|
| 5039 |
| <button class="run-btn" onclick="runCell('binned_run')">▶ run</button>
|
| 5040 |
<button class="copy-btn" onclick="copyCell('binned_run')">Copy</button>
|
| 5041 |
<a href="cells/binned_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -5449,10 +5449,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 5449 |
|
| 5450 |
Warming up (10 iterations)...
|
| 5451 |
Benchmarking (50 iterations)...
|
| 5452 |
-
Progress: 20% complete (avg: 38.
|
| 5453 |
-
Progress: 40% complete (avg:
|
| 5454 |
-
Progress: 60% complete (avg: 37.
|
| 5455 |
-
Progress: 80% complete (avg: 37.
|
| 5456 |
|
| 5457 |
Output tensors:
|
| 5458 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
|
|
@@ -5462,19 +5462,19 @@ Output tensors:
|
|
| 5462 |
Iterations: 50
|
| 5463 |
|
| 5464 |
Latency Statistics:
|
| 5465 |
-
Average: 36.
|
| 5466 |
-
Min:
|
| 5467 |
-
Max:
|
| 5468 |
-
Std Dev: 1.
|
| 5469 |
|
| 5470 |
Percentiles:
|
| 5471 |
-
P50 (median): 36.
|
| 5472 |
-
P95:
|
| 5473 |
-
P99:
|
| 5474 |
|
| 5475 |
Throughput:
|
| 5476 |
-
Tokens/sec:
|
| 5477 |
-
Std Dev:
|
| 5478 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 5479 |
|
| 5480 |
Saved benchmark results to binned_results.json
|
|
@@ -5484,24 +5484,24 @@ Output sum: 3.971905
|
|
| 5484 |
<div class="uv-install-logs" id="uv-logs-binned_run">
|
| 5485 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 5486 |
<div class="uv-logs-content" style="display: none;">
|
| 5487 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 5488 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 5489 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 5490 |
-
Downloading setuptools (1.1MiB)
|
| 5491 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 5492 |
Downloading networkx (1.9MiB)
|
| 5493 |
-
Downloading
|
| 5494 |
-
Downloading nvidia-
|
| 5495 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 5496 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 5497 |
-
Downloading
|
| 5498 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 5499 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 5500 |
-
Downloading sympy (6.0MiB)
|
| 5501 |
-
Downloading triton (148.3MiB)
|
| 5502 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
|
|
|
|
|
|
|
|
|
| 5503 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 5504 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5505 |
Downloading nvidia-cufile-cu12
|
| 5506 |
Downloading setuptools
|
| 5507 |
Downloading networkx
|
|
@@ -5520,7 +5520,7 @@ Downloading nvidia-cufft-cu12 (184.2MiB)
|
|
| 5520 |
Downloading nvidia-cublas-cu12
|
| 5521 |
Downloading nvidia-cudnn-cu12
|
| 5522 |
Downloading torch
|
| 5523 |
-
Installed 26 packages in
|
| 5524 |
</div>
|
| 5525 |
</div>
|
| 5526 |
<div class="cell-artifacts">
|
|
@@ -5539,7 +5539,7 @@ Installed 26 packages in 442ms
|
|
| 5539 |
<span onclick="toggleOutput('gptoss_run')" style="cursor: pointer;">▼ output</span>
|
| 5540 |
<span id="uv-indicator-gptoss_run" onclick="toggleUvLogsFromHeader('gptoss_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5541 |
</span> |
|
| 5542 |
-
Cell: gptoss_run | deps: torch, numpy | 39.
|
| 5543 |
| <button class="run-btn" onclick="runCell('gptoss_run')">▶ run</button>
|
| 5544 |
<button class="copy-btn" onclick="copyCell('gptoss_run')">Copy</button>
|
| 5545 |
<a href="cells/gptoss_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -5857,10 +5857,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 5857 |
|
| 5858 |
Warming up (10 iterations)...
|
| 5859 |
Benchmarking (50 iterations)...
|
| 5860 |
-
Progress: 20% complete (avg:
|
| 5861 |
-
Progress: 40% complete (avg:
|
| 5862 |
-
Progress: 60% complete (avg: 47.
|
| 5863 |
-
Progress: 80% complete (avg: 46.
|
| 5864 |
|
| 5865 |
Output tensors:
|
| 5866 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
|
|
@@ -5870,19 +5870,19 @@ Output tensors:
|
|
| 5870 |
Iterations: 50
|
| 5871 |
|
| 5872 |
Latency Statistics:
|
| 5873 |
-
Average: 45.
|
| 5874 |
-
Min:
|
| 5875 |
-
Max:
|
| 5876 |
-
Std Dev:
|
| 5877 |
|
| 5878 |
Percentiles:
|
| 5879 |
-
P50 (median):
|
| 5880 |
-
P95:
|
| 5881 |
-
P99:
|
| 5882 |
|
| 5883 |
Throughput:
|
| 5884 |
-
Tokens/sec:
|
| 5885 |
-
Std Dev:
|
| 5886 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 5887 |
|
| 5888 |
Saved benchmark results to gptoss_results.json
|
|
@@ -5892,24 +5892,24 @@ Output sum: 11.532237
|
|
| 5892 |
<div class="uv-install-logs" id="uv-logs-gptoss_run">
|
| 5893 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 5894 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
| 5895 |
Downloading numpy (16.2MiB)
|
| 5896 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 5897 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 5898 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 5899 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 5900 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 5901 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 5902 |
-
Downloading networkx (1.9MiB)
|
| 5903 |
Downloading setuptools (1.1MiB)
|
|
|
|
|
|
|
| 5904 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 5905 |
-
Downloading nvidia-
|
| 5906 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 5907 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 5908 |
Downloading sympy (6.0MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5909 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 5910 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 5911 |
Downloading torch (846.9MiB)
|
| 5912 |
-
Downloading triton (148.3MiB)
|
| 5913 |
Downloading nvidia-cufile-cu12
|
| 5914 |
Downloading setuptools
|
| 5915 |
Downloading networkx
|
|
@@ -5947,7 +5947,7 @@ Installed 26 packages in 443ms
|
|
| 5947 |
<span onclick="toggleOutput('gptoss_training_run')" style="cursor: pointer;">▼ output</span>
|
| 5948 |
<span id="uv-indicator-gptoss_training_run" onclick="toggleUvLogsFromHeader('gptoss_training_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5949 |
</span> |
|
| 5950 |
-
Cell: gptoss_training_run | deps: torch, numpy |
|
| 5951 |
| <button class="run-btn" onclick="runCell('gptoss_training_run')">▶ run</button>
|
| 5952 |
<button class="copy-btn" onclick="copyCell('gptoss_training_run')">Copy</button>
|
| 5953 |
<a href="cells/gptoss_training_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -6248,10 +6248,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 6248 |
|
| 6249 |
Warming up (10 iterations)...
|
| 6250 |
Benchmarking (50 iterations)...
|
| 6251 |
-
Progress: 20% complete (avg:
|
| 6252 |
-
Progress: 40% complete (avg:
|
| 6253 |
-
Progress: 60% complete (avg:
|
| 6254 |
-
Progress: 80% complete (avg:
|
| 6255 |
|
| 6256 |
Output tensors:
|
| 6257 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
|
|
@@ -6261,19 +6261,19 @@ Output tensors:
|
|
| 6261 |
Iterations: 50
|
| 6262 |
|
| 6263 |
Latency Statistics:
|
| 6264 |
-
Average:
|
| 6265 |
-
Min:
|
| 6266 |
-
Max:
|
| 6267 |
-
Std Dev:
|
| 6268 |
|
| 6269 |
Percentiles:
|
| 6270 |
-
P50 (median):
|
| 6271 |
-
P95:
|
| 6272 |
-
P99:
|
| 6273 |
|
| 6274 |
Throughput:
|
| 6275 |
-
Tokens/sec:
|
| 6276 |
-
Std Dev:
|
| 6277 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 6278 |
|
| 6279 |
Saved benchmark results to gptoss_training_results.json
|
|
@@ -6283,30 +6283,30 @@ Output sum: 11.532237
|
|
| 6283 |
<div class="uv-install-logs" id="uv-logs-gptoss_training_run">
|
| 6284 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6285 |
<div class="uv-logs-content" style="display: none;">
|
| 6286 |
-
Downloading
|
| 6287 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 6288 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 6289 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 6290 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 6291 |
Downloading numpy (16.2MiB)
|
| 6292 |
-
Downloading
|
| 6293 |
Downloading networkx (1.9MiB)
|
| 6294 |
Downloading setuptools (1.1MiB)
|
| 6295 |
-
Downloading nvidia-
|
|
|
|
| 6296 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 6297 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 6298 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 6299 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
| 6300 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 6301 |
-
Downloading nvidia-
|
|
|
|
|
|
|
| 6302 |
Downloading triton (148.3MiB)
|
| 6303 |
-
Downloading
|
| 6304 |
Downloading nvidia-cufile-cu12
|
| 6305 |
Downloading setuptools
|
| 6306 |
Downloading networkx
|
| 6307 |
Downloading nvidia-cuda-cupti-cu12
|
| 6308 |
-
Downloading numpy
|
| 6309 |
Downloading sympy
|
|
|
|
| 6310 |
Downloading nvidia-nvjitlink-cu12
|
| 6311 |
Downloading nvidia-curand-cu12
|
| 6312 |
Downloading nvidia-cuda-nvrtc-cu12
|
|
@@ -6319,7 +6319,7 @@ Downloading torch (846.9MiB)
|
|
| 6319 |
Downloading nvidia-cublas-cu12
|
| 6320 |
Downloading nvidia-cudnn-cu12
|
| 6321 |
Downloading torch
|
| 6322 |
-
Installed 26 packages in
|
| 6323 |
</div>
|
| 6324 |
</div>
|
| 6325 |
<div class="cell-artifacts">
|
|
@@ -6338,7 +6338,7 @@ Installed 26 packages in 544ms
|
|
| 6338 |
<span onclick="toggleOutput('megablocks_run')" style="cursor: pointer;">▼ output</span>
|
| 6339 |
<span id="uv-indicator-megablocks_run" onclick="toggleUvLogsFromHeader('megablocks_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 6340 |
</span> |
|
| 6341 |
-
Cell: megablocks_run | deps: torch, numpy, kernels | 47.
|
| 6342 |
| <button class="run-btn" onclick="runCell('megablocks_run')">▶ run</button>
|
| 6343 |
<button class="copy-btn" onclick="copyCell('megablocks_run')">Copy</button>
|
| 6344 |
<a href="cells/megablocks_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -6567,10 +6567,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 6567 |
|
| 6568 |
Warming up (10 iterations)...
|
| 6569 |
Benchmarking (50 iterations)...
|
| 6570 |
-
Progress: 20% complete (avg: 0.
|
| 6571 |
-
Progress: 40% complete (avg: 0.
|
| 6572 |
-
Progress: 60% complete (avg: 0.
|
| 6573 |
-
Progress: 80% complete (avg: 2.
|
| 6574 |
|
| 6575 |
Output tensors:
|
| 6576 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.061104, 0.055115], mean=0.000056, std=0.013535, norm=4.593927
|
|
@@ -6580,19 +6580,19 @@ Output tensors:
|
|
| 6580 |
Iterations: 50
|
| 6581 |
|
| 6582 |
Latency Statistics:
|
| 6583 |
-
Average: 3.
|
| 6584 |
-
Min: 0.
|
| 6585 |
-
Max: 8.
|
| 6586 |
-
Std Dev: 3.
|
| 6587 |
|
| 6588 |
Percentiles:
|
| 6589 |
-
P50 (median): 0.
|
| 6590 |
-
P95: 8.
|
| 6591 |
-
P99: 8.
|
| 6592 |
|
| 6593 |
Throughput:
|
| 6594 |
-
Tokens/sec:
|
| 6595 |
-
Std Dev:
|
| 6596 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 6597 |
|
| 6598 |
Saved benchmark results to megablocks_results.json
|
|
@@ -6602,25 +6602,25 @@ Output sum: 6.473885
|
|
| 6602 |
<div class="uv-install-logs" id="uv-logs-megablocks_run">
|
| 6603 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6604 |
<div class="uv-logs-content" style="display: none;">
|
| 6605 |
-
Downloading
|
| 6606 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
|
|
|
|
|
|
| 6607 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 6608 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 6609 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
|
|
|
|
|
|
| 6610 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 6611 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 6612 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 6613 |
Downloading triton (148.3MiB)
|
| 6614 |
Downloading networkx (1.9MiB)
|
| 6615 |
Downloading hf-xet (3.0MiB)
|
| 6616 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 6617 |
-
Downloading setuptools (1.1MiB)
|
| 6618 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 6619 |
-
Downloading sympy (6.0MiB)
|
| 6620 |
-
Downloading torch (846.9MiB)
|
| 6621 |
Downloading numpy (16.2MiB)
|
| 6622 |
-
Downloading nvidia-
|
|
|
|
| 6623 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
|
|
|
|
|
| 6624 |
Downloading nvidia-cufile-cu12
|
| 6625 |
Downloading hf-xet
|
| 6626 |
Downloading setuptools
|
|
@@ -6640,19 +6640,19 @@ Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
| 6640 |
Downloading nvidia-cublas-cu12
|
| 6641 |
Downloading nvidia-cudnn-cu12
|
| 6642 |
Downloading torch
|
| 6643 |
-
Installed 37 packages in
|
| 6644 |
</div>
|
| 6645 |
</div>
|
| 6646 |
<div class="cell-stderr">Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s]
|
| 6647 |
-
Fetching 66 files: 2%|▏ | 1/66 [00:00<00:
|
| 6648 |
-
Fetching 66 files:
|
| 6649 |
-
Fetching 66 files:
|
| 6650 |
-
Fetching 66 files:
|
| 6651 |
-
Fetching 66 files:
|
| 6652 |
-
Fetching 66 files:
|
| 6653 |
-
Fetching 66 files:
|
| 6654 |
-
Fetching 66 files:
|
| 6655 |
-
Fetching 66 files: 100%|██████████| 66/66 [00:02<00:00,
|
| 6656 |
<div class="cell-artifacts">
|
| 6657 |
<h4>Artifacts:</h4>
|
| 6658 |
<a href="artifacts/megablocks_run/megablocks_results.json" class="artifact" target="_blank">megablocks_results.json</a>
|
|
@@ -6669,7 +6669,7 @@ Fetching 66 files: 100%|██████████| 66/66 [00:02<00:00, 3
|
|
| 6669 |
<span onclick="toggleOutput('visualization')" style="cursor: pointer;">▼ output</span>
|
| 6670 |
<span id="uv-indicator-visualization" onclick="toggleUvLogsFromHeader('visualization')" style="cursor: pointer;">▶ uv-logs</span>
|
| 6671 |
</span> |
|
| 6672 |
-
Cell: visualization | deps: matplotlib | 3.
|
| 6673 |
| <button class="run-btn" onclick="runCell('visualization')">▶ run</button>
|
| 6674 |
<button class="copy-btn" onclick="copyCell('visualization')">Copy</button>
|
| 6675 |
<a href="cells/visualization.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -6915,30 +6915,30 @@ Loaded /repo/moe_benchmarks/megablocks_yamoe/.uvnote/cache/0febdf3420999533bc2e1
|
|
| 6915 |
Performance Summary:
|
| 6916 |
Implementation Avg (ms) P95 (ms) Tokens/sec Relative Speed
|
| 6917 |
--------------------------------------------------------------------------------
|
| 6918 |
-
megablocks_results 3.
|
| 6919 |
-
yamoe_results 4.25 4.
|
| 6920 |
-
binned_results 36.
|
| 6921 |
-
gptoss_results 45.
|
| 6922 |
-
gptoss_training_results
|
| 6923 |
-
|
| 6924 |
-
Fastest: megablocks_results (3.
|
| 6925 |
-
Slowest: gptoss_training_results (
|
| 6926 |
-
Max Speedup:
|
| 6927 |
</div>
|
| 6928 |
<div class="uv-install-logs" id="uv-logs-visualization">
|
| 6929 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6930 |
<div class="uv-logs-content" style="display: none;">
|
| 6931 |
-
Downloading pillow (6.3MiB)
|
| 6932 |
-
Downloading kiwisolver (1.4MiB)
|
| 6933 |
-
Downloading numpy (16.2MiB)
|
| 6934 |
Downloading matplotlib (8.3MiB)
|
|
|
|
|
|
|
| 6935 |
Downloading fonttools (4.7MiB)
|
|
|
|
| 6936 |
Downloading kiwisolver
|
| 6937 |
Downloading pillow
|
| 6938 |
Downloading fonttools
|
| 6939 |
Downloading matplotlib
|
| 6940 |
Downloading numpy
|
| 6941 |
-
Installed 11 packages in
|
| 6942 |
</div>
|
| 6943 |
</div>
|
| 6944 |
<div class="cell-artifacts">
|
|
|
|
| 3720 |
<span onclick="toggleOutput('utils')" style="cursor: pointer;">▼ output</span>
|
| 3721 |
<span id="uv-indicator-utils" onclick="toggleUvLogsFromHeader('utils')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3722 |
</span> |
|
| 3723 |
+
Cell: utils | deps: torch, numpy | 34.73s
|
| 3724 |
| <button class="run-btn" onclick="runCell('utils')">▶ run</button>
|
| 3725 |
<button class="copy-btn" onclick="copyCell('utils')">Copy</button>
|
| 3726 |
<a href="cells/utils.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3795 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3796 |
<div class="uv-logs-content" style="display: none;">
|
| 3797 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3798 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3799 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3800 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3801 |
+
Downloading networkx (1.9MiB)
|
| 3802 |
+
Downloading sympy (6.0MiB)
|
|
|
|
|
|
|
|
|
|
| 3803 |
Downloading setuptools (1.1MiB)
|
| 3804 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3805 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
|
|
|
| 3806 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3807 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3808 |
Downloading numpy (16.2MiB)
|
| 3809 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3810 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3811 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3812 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3813 |
+
Downloading torch (846.9MiB)
|
| 3814 |
Downloading triton (148.3MiB)
|
| 3815 |
Downloading nvidia-cufile-cu12
|
| 3816 |
Downloading setuptools
|
|
|
|
| 3830 |
Downloading nvidia-cublas-cu12
|
| 3831 |
Downloading nvidia-cudnn-cu12
|
| 3832 |
Downloading torch
|
| 3833 |
+
Installed 26 packages in 456ms
|
| 3834 |
</div>
|
| 3835 |
</div>
|
| 3836 |
</div>
|
|
|
|
| 3843 |
<span onclick="toggleOutput('bench_utils')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
<span id="uv-indicator-bench_utils" onclick="toggleUvLogsFromHeader('bench_utils')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3845 |
</span> |
|
| 3846 |
+
Cell: bench_utils | deps: torch, numpy | 34.06s
|
| 3847 |
| <button class="run-btn" onclick="runCell('bench_utils')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('bench_utils')">Copy</button>
|
| 3849 |
<a href="cells/bench_utils.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4331 |
<div class="uv-install-logs" id="uv-logs-bench_utils">
|
| 4332 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4333 |
<div class="uv-logs-content" style="display: none;">
|
| 4334 |
+
Downloading networkx (1.9MiB)
|
| 4335 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
|
|
|
| 4336 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4337 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4338 |
+
Downloading setuptools (1.1MiB)
|
| 4339 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4340 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4341 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4342 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4343 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4344 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4345 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4346 |
+
Downloading sympy (6.0MiB)
|
| 4347 |
Downloading triton (148.3MiB)
|
| 4348 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4349 |
+
Downloading numpy (16.2MiB)
|
| 4350 |
+
Downloading torch (846.9MiB)
|
| 4351 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4352 |
Downloading nvidia-cufile-cu12
|
| 4353 |
Downloading setuptools
|
| 4354 |
Downloading networkx
|
|
|
|
| 4367 |
Downloading nvidia-cublas-cu12
|
| 4368 |
Downloading nvidia-cudnn-cu12
|
| 4369 |
Downloading torch
|
| 4370 |
+
Installed 26 packages in 448ms
|
| 4371 |
</div>
|
| 4372 |
</div>
|
| 4373 |
</div>
|
|
|
|
| 4381 |
<span onclick="toggleOutput('config')" style="cursor: pointer;">▼ output</span>
|
| 4382 |
<span id="uv-indicator-config" onclick="toggleUvLogsFromHeader('config')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4383 |
</span> |
|
| 4384 |
+
Cell: config | deps: torch, numpy | 35.66s
|
| 4385 |
| <button class="run-btn" onclick="runCell('config')">▶ run</button>
|
| 4386 |
<button class="copy-btn" onclick="copyCell('config')">Copy</button>
|
| 4387 |
<a href="cells/config.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4441 |
<div class="uv-install-logs" id="uv-logs-config">
|
| 4442 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4443 |
<div class="uv-logs-content" style="display: none;">
|
| 4444 |
+
Downloading numpy (16.2MiB)
|
|
|
|
|
|
|
| 4445 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4446 |
+
Downloading sympy (6.0MiB)
|
| 4447 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4448 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
|
|
| 4449 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4450 |
+
Downloading setuptools (1.1MiB)
|
| 4451 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4452 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4453 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4454 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4455 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4456 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
| 4457 |
Downloading networkx (1.9MiB)
|
| 4458 |
+
Downloading torch (846.9MiB)
|
| 4459 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4460 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
|
|
|
| 4461 |
Downloading triton (148.3MiB)
|
| 4462 |
Downloading nvidia-cufile-cu12
|
| 4463 |
Downloading setuptools
|
|
|
|
| 4474 |
Downloading nvidia-cusparselt-cu12
|
| 4475 |
Downloading nvidia-cusparse-cu12
|
| 4476 |
Downloading nvidia-nccl-cu12
|
|
|
|
| 4477 |
Downloading nvidia-cublas-cu12
|
| 4478 |
+
Downloading nvidia-cudnn-cu12
|
| 4479 |
Downloading torch
|
| 4480 |
+
Installed 26 packages in 456ms
|
| 4481 |
</div>
|
| 4482 |
</div>
|
| 4483 |
</div>
|
|
|
|
| 4490 |
<span onclick="toggleOutput('save_data')" style="cursor: pointer;">▼ output</span>
|
| 4491 |
<span id="uv-indicator-save_data" onclick="toggleUvLogsFromHeader('save_data')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4492 |
</span> |
|
| 4493 |
+
Cell: save_data | deps: torch, numpy | 38.92s
|
| 4494 |
| <button class="run-btn" onclick="runCell('save_data')">▶ run</button>
|
| 4495 |
<button class="copy-btn" onclick="copyCell('save_data')">Copy</button>
|
| 4496 |
<a href="cells/save_data.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4585 |
<div class="uv-install-logs" id="uv-logs-save_data">
|
| 4586 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4587 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
| 4588 |
Downloading sympy (6.0MiB)
|
| 4589 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
|
|
| 4590 |
Downloading numpy (16.2MiB)
|
|
|
|
| 4591 |
Downloading torch (846.9MiB)
|
| 4592 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4593 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4594 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4595 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4596 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
| 4597 |
Downloading setuptools (1.1MiB)
|
| 4598 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4599 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4600 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4601 |
+
Downloading networkx (1.9MiB)
|
| 4602 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4603 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4604 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4605 |
+
Downloading triton (148.3MiB)
|
| 4606 |
Downloading nvidia-cufile-cu12
|
| 4607 |
Downloading setuptools
|
| 4608 |
Downloading networkx
|
|
|
|
| 4621 |
Downloading nvidia-cublas-cu12
|
| 4622 |
Downloading nvidia-cudnn-cu12
|
| 4623 |
Downloading torch
|
| 4624 |
+
Installed 26 packages in 453ms
|
| 4625 |
</div>
|
| 4626 |
</div>
|
| 4627 |
<div class="cell-artifacts">
|
| 4628 |
<h4>Artifacts:</h4>
|
| 4629 |
+
<a href="artifacts/save_data/down_proj.pt" class="artifact" target="_blank">down_proj.pt</a>
|
| 4630 |
<a href="artifacts/save_data/router_weight.pt" class="artifact" target="_blank">router_weight.pt</a>
|
| 4631 |
<a href="artifacts/save_data/down_proj_bias.pt" class="artifact" target="_blank">down_proj_bias.pt</a>
|
| 4632 |
<a href="artifacts/save_data/gate_up_proj.pt" class="artifact" target="_blank">gate_up_proj.pt</a>
|
|
|
|
| 4633 |
<a href="artifacts/save_data/gate_up_proj_bias.pt" class="artifact" target="_blank">gate_up_proj_bias.pt</a>
|
| 4634 |
+
<a href="artifacts/save_data/router_bias.pt" class="artifact" target="_blank">router_bias.pt</a>
|
| 4635 |
</div>
|
| 4636 |
</div>
|
| 4637 |
</div>
|
|
|
|
| 4645 |
<span onclick="toggleOutput('yamoe_run')" style="cursor: pointer;">▼ output</span>
|
| 4646 |
<span id="uv-indicator-yamoe_run" onclick="toggleUvLogsFromHeader('yamoe_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4647 |
</span> |
|
| 4648 |
+
Cell: yamoe_run | deps: torch, kernels, numpy | 38.89s
|
| 4649 |
| <button class="run-btn" onclick="runCell('yamoe_run')">▶ run</button>
|
| 4650 |
<button class="copy-btn" onclick="copyCell('yamoe_run')">Copy</button>
|
| 4651 |
<a href="cells/yamoe_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4938 |
|
| 4939 |
Warming up (10 iterations)...
|
| 4940 |
Benchmarking (50 iterations)...
|
| 4941 |
+
Progress: 20% complete (avg: 4.249 ms)
|
| 4942 |
+
Progress: 40% complete (avg: 4.247 ms)
|
| 4943 |
Progress: 60% complete (avg: 4.248 ms)
|
| 4944 |
+
Progress: 80% complete (avg: 4.248 ms)
|
| 4945 |
|
| 4946 |
Output tensors:
|
| 4947 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
|
|
|
|
| 4951 |
Iterations: 50
|
| 4952 |
|
| 4953 |
Latency Statistics:
|
| 4954 |
+
Average: 4.249 ms
|
| 4955 |
+
Min: 4.132 ms
|
| 4956 |
+
Max: 4.278 ms
|
| 4957 |
+
Std Dev: 0.021 ms
|
| 4958 |
|
| 4959 |
Percentiles:
|
| 4960 |
+
P50 (median): 4.253 ms
|
| 4961 |
+
P95: 4.263 ms
|
| 4962 |
+
P99: 4.272 ms
|
| 4963 |
|
| 4964 |
Throughput:
|
| 4965 |
+
Tokens/sec: 23537.0
|
| 4966 |
+
Std Dev: 116.3
|
| 4967 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 4968 |
|
| 4969 |
Saved benchmark results to yamoe_results.json
|
|
|
|
| 4973 |
<div class="uv-install-logs" id="uv-logs-yamoe_run">
|
| 4974 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4975 |
<div class="uv-logs-content" style="display: none;">
|
| 4976 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4977 |
Downloading setuptools (1.1MiB)
|
| 4978 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4979 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4980 |
Downloading sympy (6.0MiB)
|
|
|
|
| 4981 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4982 |
+
Downloading networkx (1.9MiB)
|
| 4983 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4984 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4985 |
Downloading triton (148.3MiB)
|
| 4986 |
+
Downloading hf-xet (3.0MiB)
|
| 4987 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4988 |
+
Downloading numpy (16.2MiB)
|
|
|
|
| 4989 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4990 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4991 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4992 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4993 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4994 |
+
Downloading torch (846.9MiB)
|
| 4995 |
Downloading nvidia-cufile-cu12
|
| 4996 |
Downloading hf-xet
|
| 4997 |
Downloading setuptools
|
|
|
|
| 5011 |
Downloading nvidia-cublas-cu12
|
| 5012 |
Downloading nvidia-cudnn-cu12
|
| 5013 |
Downloading torch
|
| 5014 |
+
Installed 37 packages in 450ms
|
| 5015 |
</div>
|
| 5016 |
</div>
|
| 5017 |
<div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s]
|
| 5018 |
+
Fetching 6 files: 17%|█▋ | 1/6 [00:00<00:01, 3.63it/s]
|
| 5019 |
+
Fetching 6 files: 33%|███▎ | 2/6 [00:00<00:01, 3.84it/s]
|
| 5020 |
+
Fetching 6 files: 50%|█████ | 3/6 [00:00<00:00, 3.44it/s]
|
| 5021 |
+
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 7.04it/s]</div>
|
| 5022 |
<div class="cell-artifacts">
|
| 5023 |
<h4>Artifacts:</h4>
|
| 5024 |
<a href="artifacts/yamoe_run/yamoe_results.json" class="artifact" target="_blank">yamoe_results.json</a>
|
|
|
|
| 5035 |
<span onclick="toggleOutput('binned_run')" style="cursor: pointer;">▼ output</span>
|
| 5036 |
<span id="uv-indicator-binned_run" onclick="toggleUvLogsFromHeader('binned_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5037 |
</span> |
|
| 5038 |
+
Cell: binned_run | deps: torch, numpy | 39.04s
|
| 5039 |
| <button class="run-btn" onclick="runCell('binned_run')">▶ run</button>
|
| 5040 |
<button class="copy-btn" onclick="copyCell('binned_run')">Copy</button>
|
| 5041 |
<a href="cells/binned_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 5449 |
|
| 5450 |
Warming up (10 iterations)...
|
| 5451 |
Benchmarking (50 iterations)...
|
| 5452 |
+
Progress: 20% complete (avg: 38.510 ms)
|
| 5453 |
+
Progress: 40% complete (avg: 38.423 ms)
|
| 5454 |
+
Progress: 60% complete (avg: 37.889 ms)
|
| 5455 |
+
Progress: 80% complete (avg: 37.386 ms)
|
| 5456 |
|
| 5457 |
Output tensors:
|
| 5458 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
|
|
|
|
| 5462 |
Iterations: 50
|
| 5463 |
|
| 5464 |
Latency Statistics:
|
| 5465 |
+
Average: 36.727 ms
|
| 5466 |
+
Min: 33.185 ms
|
| 5467 |
+
Max: 39.167 ms
|
| 5468 |
+
Std Dev: 1.747 ms
|
| 5469 |
|
| 5470 |
Percentiles:
|
| 5471 |
+
P50 (median): 36.949 ms
|
| 5472 |
+
P95: 38.664 ms
|
| 5473 |
+
P99: 38.973 ms
|
| 5474 |
|
| 5475 |
Throughput:
|
| 5476 |
+
Tokens/sec: 2722.8
|
| 5477 |
+
Std Dev: 133.0
|
| 5478 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 5479 |
|
| 5480 |
Saved benchmark results to binned_results.json
|
|
|
|
| 5484 |
<div class="uv-install-logs" id="uv-logs-binned_run">
|
| 5485 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 5486 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5487 |
Downloading networkx (1.9MiB)
|
| 5488 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 5489 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
|
|
| 5490 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 5491 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 5492 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
|
|
|
|
|
|
| 5493 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 5494 |
+
Downloading numpy (16.2MiB)
|
| 5495 |
+
Downloading setuptools (1.1MiB)
|
| 5496 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 5497 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 5498 |
+
Downloading sympy (6.0MiB)
|
| 5499 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 5500 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 5501 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 5502 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 5503 |
+
Downloading triton (148.3MiB)
|
| 5504 |
+
Downloading torch (846.9MiB)
|
| 5505 |
Downloading nvidia-cufile-cu12
|
| 5506 |
Downloading setuptools
|
| 5507 |
Downloading networkx
|
|
|
|
| 5520 |
Downloading nvidia-cublas-cu12
|
| 5521 |
Downloading nvidia-cudnn-cu12
|
| 5522 |
Downloading torch
|
| 5523 |
+
Installed 26 packages in 450ms
|
| 5524 |
</div>
|
| 5525 |
</div>
|
| 5526 |
<div class="cell-artifacts">
|
|
|
|
| 5539 |
<span onclick="toggleOutput('gptoss_run')" style="cursor: pointer;">▼ output</span>
|
| 5540 |
<span id="uv-indicator-gptoss_run" onclick="toggleUvLogsFromHeader('gptoss_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5541 |
</span> |
|
| 5542 |
+
Cell: gptoss_run | deps: torch, numpy | 39.34s
|
| 5543 |
| <button class="run-btn" onclick="runCell('gptoss_run')">▶ run</button>
|
| 5544 |
<button class="copy-btn" onclick="copyCell('gptoss_run')">Copy</button>
|
| 5545 |
<a href="cells/gptoss_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 5857 |
|
| 5858 |
Warming up (10 iterations)...
|
| 5859 |
Benchmarking (50 iterations)...
|
| 5860 |
+
Progress: 20% complete (avg: 50.553 ms)
|
| 5861 |
+
Progress: 40% complete (avg: 49.257 ms)
|
| 5862 |
+
Progress: 60% complete (avg: 47.930 ms)
|
| 5863 |
+
Progress: 80% complete (avg: 46.616 ms)
|
| 5864 |
|
| 5865 |
Output tensors:
|
| 5866 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
|
|
|
|
| 5870 |
Iterations: 50
|
| 5871 |
|
| 5872 |
Latency Statistics:
|
| 5873 |
+
Average: 45.423 ms
|
| 5874 |
+
Min: 38.786 ms
|
| 5875 |
+
Max: 51.683 ms
|
| 5876 |
+
Std Dev: 3.671 ms
|
| 5877 |
|
| 5878 |
Percentiles:
|
| 5879 |
+
P50 (median): 45.482 ms
|
| 5880 |
+
P95: 50.887 ms
|
| 5881 |
+
P99: 51.415 ms
|
| 5882 |
|
| 5883 |
Throughput:
|
| 5884 |
+
Tokens/sec: 2201.5
|
| 5885 |
+
Std Dev: 179.5
|
| 5886 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 5887 |
|
| 5888 |
Saved benchmark results to gptoss_results.json
|
|
|
|
| 5892 |
<div class="uv-install-logs" id="uv-logs-gptoss_run">
|
| 5893 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 5894 |
<div class="uv-logs-content" style="display: none;">
|
| 5895 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 5896 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 5897 |
Downloading numpy (16.2MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5898 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
|
|
|
|
|
|
| 5899 |
Downloading setuptools (1.1MiB)
|
| 5900 |
+
Downloading triton (148.3MiB)
|
| 5901 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 5902 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 5903 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
|
|
|
|
|
|
| 5904 |
Downloading sympy (6.0MiB)
|
| 5905 |
+
Downloading networkx (1.9MiB)
|
| 5906 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 5907 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 5908 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 5909 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 5910 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 5911 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
|
|
| 5912 |
Downloading torch (846.9MiB)
|
|
|
|
| 5913 |
Downloading nvidia-cufile-cu12
|
| 5914 |
Downloading setuptools
|
| 5915 |
Downloading networkx
|
|
|
|
| 5947 |
<span onclick="toggleOutput('gptoss_training_run')" style="cursor: pointer;">▼ output</span>
|
| 5948 |
<span id="uv-indicator-gptoss_training_run" onclick="toggleUvLogsFromHeader('gptoss_training_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5949 |
</span> |
|
| 5950 |
+
Cell: gptoss_training_run | deps: torch, numpy | 40.47s
|
| 5951 |
| <button class="run-btn" onclick="runCell('gptoss_training_run')">▶ run</button>
|
| 5952 |
<button class="copy-btn" onclick="copyCell('gptoss_training_run')">Copy</button>
|
| 5953 |
<a href="cells/gptoss_training_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 6248 |
|
| 6249 |
Warming up (10 iterations)...
|
| 6250 |
Benchmarking (50 iterations)...
|
| 6251 |
+
Progress: 20% complete (avg: 50.806 ms)
|
| 6252 |
+
Progress: 40% complete (avg: 50.412 ms)
|
| 6253 |
+
Progress: 60% complete (avg: 49.460 ms)
|
| 6254 |
+
Progress: 80% complete (avg: 48.277 ms)
|
| 6255 |
|
| 6256 |
Output tensors:
|
| 6257 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
|
|
|
|
| 6261 |
Iterations: 50
|
| 6262 |
|
| 6263 |
Latency Statistics:
|
| 6264 |
+
Average: 47.223 ms
|
| 6265 |
+
Min: 40.875 ms
|
| 6266 |
+
Max: 51.398 ms
|
| 6267 |
+
Std Dev: 3.111 ms
|
| 6268 |
|
| 6269 |
Percentiles:
|
| 6270 |
+
P50 (median): 47.684 ms
|
| 6271 |
+
P95: 51.259 ms
|
| 6272 |
+
P99: 51.362 ms
|
| 6273 |
|
| 6274 |
Throughput:
|
| 6275 |
+
Tokens/sec: 2117.6
|
| 6276 |
+
Std Dev: 143.6
|
| 6277 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 6278 |
|
| 6279 |
Saved benchmark results to gptoss_training_results.json
|
|
|
|
| 6283 |
<div class="uv-install-logs" id="uv-logs-gptoss_training_run">
|
| 6284 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6285 |
<div class="uv-logs-content" style="display: none;">
|
| 6286 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6287 |
Downloading numpy (16.2MiB)
|
| 6288 |
+
Downloading sympy (6.0MiB)
|
| 6289 |
Downloading networkx (1.9MiB)
|
| 6290 |
Downloading setuptools (1.1MiB)
|
| 6291 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 6292 |
+
Downloading torch (846.9MiB)
|
| 6293 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 6294 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 6295 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 6296 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 6297 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 6298 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 6299 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 6300 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 6301 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 6302 |
Downloading triton (148.3MiB)
|
| 6303 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 6304 |
Downloading nvidia-cufile-cu12
|
| 6305 |
Downloading setuptools
|
| 6306 |
Downloading networkx
|
| 6307 |
Downloading nvidia-cuda-cupti-cu12
|
|
|
|
| 6308 |
Downloading sympy
|
| 6309 |
+
Downloading numpy
|
| 6310 |
Downloading nvidia-nvjitlink-cu12
|
| 6311 |
Downloading nvidia-curand-cu12
|
| 6312 |
Downloading nvidia-cuda-nvrtc-cu12
|
|
|
|
| 6319 |
Downloading nvidia-cublas-cu12
|
| 6320 |
Downloading nvidia-cudnn-cu12
|
| 6321 |
Downloading torch
|
| 6322 |
+
Installed 26 packages in 451ms
|
| 6323 |
</div>
|
| 6324 |
</div>
|
| 6325 |
<div class="cell-artifacts">
|
|
|
|
| 6338 |
<span onclick="toggleOutput('megablocks_run')" style="cursor: pointer;">▼ output</span>
|
| 6339 |
<span id="uv-indicator-megablocks_run" onclick="toggleUvLogsFromHeader('megablocks_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 6340 |
</span> |
|
| 6341 |
+
Cell: megablocks_run | deps: torch, numpy, kernels | 47.67s
|
| 6342 |
| <button class="run-btn" onclick="runCell('megablocks_run')">▶ run</button>
|
| 6343 |
<button class="copy-btn" onclick="copyCell('megablocks_run')">Copy</button>
|
| 6344 |
<a href="cells/megablocks_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 6567 |
|
| 6568 |
Warming up (10 iterations)...
|
| 6569 |
Benchmarking (50 iterations)...
|
| 6570 |
+
Progress: 20% complete (avg: 0.875 ms)
|
| 6571 |
+
Progress: 40% complete (avg: 0.853 ms)
|
| 6572 |
+
Progress: 60% complete (avg: 0.859 ms)
|
| 6573 |
+
Progress: 80% complete (avg: 2.680 ms)
|
| 6574 |
|
| 6575 |
Output tensors:
|
| 6576 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.061104, 0.055115], mean=0.000056, std=0.013535, norm=4.593927
|
|
|
|
| 6580 |
Iterations: 50
|
| 6581 |
|
| 6582 |
Latency Statistics:
|
| 6583 |
+
Average: 3.830 ms
|
| 6584 |
+
Min: 0.819 ms
|
| 6585 |
+
Max: 8.440 ms
|
| 6586 |
+
Std Dev: 3.659 ms
|
| 6587 |
|
| 6588 |
Percentiles:
|
| 6589 |
+
P50 (median): 0.886 ms
|
| 6590 |
+
P95: 8.437 ms
|
| 6591 |
+
P99: 8.439 ms
|
| 6592 |
|
| 6593 |
Throughput:
|
| 6594 |
+
Tokens/sec: 26109.7
|
| 6595 |
+
Std Dev: 51610.7
|
| 6596 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 6597 |
|
| 6598 |
Saved benchmark results to megablocks_results.json
|
|
|
|
| 6602 |
<div class="uv-install-logs" id="uv-logs-megablocks_run">
|
| 6603 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6604 |
<div class="uv-logs-content" style="display: none;">
|
| 6605 |
+
Downloading sympy (6.0MiB)
|
| 6606 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 6607 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 6608 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 6609 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
| 6610 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 6611 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 6612 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 6613 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 6614 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
|
|
|
| 6615 |
Downloading triton (148.3MiB)
|
| 6616 |
Downloading networkx (1.9MiB)
|
| 6617 |
Downloading hf-xet (3.0MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6618 |
Downloading numpy (16.2MiB)
|
| 6619 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 6620 |
+
Downloading setuptools (1.1MiB)
|
| 6621 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 6622 |
+
Downloading torch (846.9MiB)
|
| 6623 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 6624 |
Downloading nvidia-cufile-cu12
|
| 6625 |
Downloading hf-xet
|
| 6626 |
Downloading setuptools
|
|
|
|
| 6640 |
Downloading nvidia-cublas-cu12
|
| 6641 |
Downloading nvidia-cudnn-cu12
|
| 6642 |
Downloading torch
|
| 6643 |
+
Installed 37 packages in 453ms
|
| 6644 |
</div>
|
| 6645 |
</div>
|
| 6646 |
<div class="cell-stderr">Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s]
|
| 6647 |
+
Fetching 66 files: 2%|▏ | 1/66 [00:00<00:11, 5.69it/s]
|
| 6648 |
+
Fetching 66 files: 8%|▊ | 5/66 [00:00<00:02, 20.65it/s]
|
| 6649 |
+
Fetching 66 files: 14%|█▎ | 9/66 [00:00<00:04, 12.19it/s]
|
| 6650 |
+
Fetching 66 files: 26%|██▌ | 17/66 [00:01<00:03, 13.56it/s]
|
| 6651 |
+
Fetching 66 files: 61%|██████ | 40/66 [00:01<00:00, 41.01it/s]
|
| 6652 |
+
Fetching 66 files: 74%|███████▍ | 49/66 [00:01<00:00, 41.96it/s]
|
| 6653 |
+
Fetching 66 files: 86%|████████▋ | 57/66 [00:01<00:00, 43.75it/s]
|
| 6654 |
+
Fetching 66 files: 97%|█████████▋| 64/66 [00:01<00:00, 42.69it/s]
|
| 6655 |
+
Fetching 66 files: 100%|██████████| 66/66 [00:02<00:00, 31.94it/s]</div>
|
| 6656 |
<div class="cell-artifacts">
|
| 6657 |
<h4>Artifacts:</h4>
|
| 6658 |
<a href="artifacts/megablocks_run/megablocks_results.json" class="artifact" target="_blank">megablocks_results.json</a>
|
|
|
|
| 6669 |
<span onclick="toggleOutput('visualization')" style="cursor: pointer;">▼ output</span>
|
| 6670 |
<span id="uv-indicator-visualization" onclick="toggleUvLogsFromHeader('visualization')" style="cursor: pointer;">▶ uv-logs</span>
|
| 6671 |
</span> |
|
| 6672 |
+
Cell: visualization | deps: matplotlib | 3.13s
|
| 6673 |
| <button class="run-btn" onclick="runCell('visualization')">▶ run</button>
|
| 6674 |
<button class="copy-btn" onclick="copyCell('visualization')">Copy</button>
|
| 6675 |
<a href="cells/visualization.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 6915 |
Performance Summary:
|
| 6916 |
Implementation Avg (ms) P95 (ms) Tokens/sec Relative Speed
|
| 6917 |
--------------------------------------------------------------------------------
|
| 6918 |
+
megablocks_results 3.83 8.44 26110 1.00x
|
| 6919 |
+
yamoe_results 4.25 4.26 23537 0.90x
|
| 6920 |
+
binned_results 36.73 38.66 2723 0.10x
|
| 6921 |
+
gptoss_results 45.42 50.89 2202 0.08x
|
| 6922 |
+
gptoss_training_results 47.22 51.26 2118 0.08x
|
| 6923 |
+
|
| 6924 |
+
Fastest: megablocks_results (3.83ms avg)
|
| 6925 |
+
Slowest: gptoss_training_results (47.22ms avg)
|
| 6926 |
+
Max Speedup: 12.3x
|
| 6927 |
</div>
|
| 6928 |
<div class="uv-install-logs" id="uv-logs-visualization">
|
| 6929 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6930 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
|
|
|
| 6931 |
Downloading matplotlib (8.3MiB)
|
| 6932 |
+
Downloading numpy (16.2MiB)
|
| 6933 |
+
Downloading pillow (6.3MiB)
|
| 6934 |
Downloading fonttools (4.7MiB)
|
| 6935 |
+
Downloading kiwisolver (1.4MiB)
|
| 6936 |
Downloading kiwisolver
|
| 6937 |
Downloading pillow
|
| 6938 |
Downloading fonttools
|
| 6939 |
Downloading matplotlib
|
| 6940 |
Downloading numpy
|
| 6941 |
+
Installed 11 packages in 49ms
|
| 6942 |
</div>
|
| 6943 |
</div>
|
| 6944 |
<div class="cell-artifacts">
|