Upload folder using huggingface_hub
Browse files- megablocks/cells/forward_only.py +101 -0
- megablocks/cells/nv.py +3 -0
- megablocks/megablocks_only.html +551 -105
- megablocks_yamoe/artifacts/binned_run/binned_results.json +9 -9
- megablocks_yamoe/artifacts/gptoss_run/gptoss_results.json +9 -9
- megablocks_yamoe/artifacts/gptoss_training_run/gptoss_training_results.json +9 -9
- megablocks_yamoe/artifacts/yamoe_run/yamoe_results.json +10 -10
- megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc +0 -0
- megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc +0 -0
- megablocks_yamoe/cells/megablocks_run.py +1 -1
- megablocks_yamoe/megablocks_yamoe.html +80 -76
- megablocks_yamoe/torch_profile.html +217 -220
megablocks/cells/forward_only.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# /// script
|
| 2 |
+
# requires-python = ">=3.12"
|
| 3 |
+
# dependencies = [
|
| 4 |
+
# "accelerate>=1.10.1",
|
| 5 |
+
# "torch>=2.7.0",
|
| 6 |
+
# "kernels==0.10.0",
|
| 7 |
+
# "transformers@https://github.com/huggingface/transformers.git",
|
| 8 |
+
# "ipdb>=0.13.13",
|
| 9 |
+
# "matplotlib>=3.7.2",
|
| 10 |
+
# "numpy>=1.24.3",
|
| 11 |
+
# ]
|
| 12 |
+
# ///
|
| 13 |
+
|
| 14 |
+
import torch
|
| 15 |
+
from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
|
| 16 |
+
import time
|
| 17 |
+
import torch.nn as nn
|
| 18 |
+
from kernels import register_kernel_mapping, Mode, LayerRepository, replace_kernel_forward_from_hub
|
| 19 |
+
import sys
|
| 20 |
+
import torch.profiler
|
| 21 |
+
import gc
|
| 22 |
+
import logging
|
| 23 |
+
from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# Disable the hub kernel mapping for RMSNorm: passing None makes
# GptOssRMSNorm fall back to its stock (non-hub) forward implementation.
replace_kernel_forward_from_hub(GptOssRMSNorm, None)

# Emit INFO-level logs so kernel-selection messages from `kernels` are visible.
logging.basicConfig(level=logging.INFO)
|
| 30 |
+
|
| 31 |
+
def reset_peak_memory_stats():
    """Clear the CUDA cache, reset peak-memory counters, and run GC.

    Fix: the original called ``torch.cuda.empty_cache()`` unconditionally
    while guarding only ``reset_peak_memory_stats()`` behind
    ``torch.cuda.is_available()``. Both CUDA calls now sit under the same
    guard, so the function is uniformly safe on CPU-only machines.
    """
    if torch.cuda.is_available():
        # Release cached blocks back to the driver and zero the peak
        # allocation counters so the next measurement starts clean.
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
    # Collect Python-level garbage regardless of device.
    gc.collect()
|
| 37 |
+
|
| 38 |
+
def get_memory_stats():
    """Report CUDA memory usage (current, peak, reserved) in gigabytes.

    Returns a dict with keys ``allocated_gb``, ``peak_gb`` and
    ``reserved_gb``; all zeros when no CUDA device is available.
    """
    if not torch.cuda.is_available():
        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
    gb = 1e9  # decimal gigabytes, matching the original reporting unit
    return {
        "allocated_gb": torch.cuda.memory_allocated() / gb,
        "peak_gb": torch.cuda.max_memory_allocated() / gb,
        "reserved_gb": torch.cuda.memory_reserved() / gb,
    }
|
| 47 |
+
|
| 48 |
+
def override_kernel_layer_name(cls_name: str, value) -> bool:
    """Patch ``kernel_layer_name`` on a dynamically located model class.

    Scans every loaded module for an attribute named *cls_name* that is an
    ``nn.Module`` subclass, sets its ``kernel_layer_name`` class attribute
    to *value*, and stops at the first match.

    Returns True if a class was patched, False if none was found.
    """
    for module in sys.modules.values():
        if module is None:
            continue
        candidate = getattr(module, cls_name, None)
        if not (isinstance(candidate, type) and issubclass(candidate, nn.Module)):
            continue
        setattr(candidate, "kernel_layer_name", value)
        print(f"Overrode {cls_name}.kernel_layer_name to {value}")
        return True
    return False
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# --- Load model and tokenizer through the standard HF path ---------------
model_id = "openai/gpt-oss-20b"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
# Dequantize the MXFP4 checkpoint to full precision on load.
quantization_config = Mxfp4Config(dequantize=True)

model = GptOssForCausalLM.from_pretrained(
    model_id,
    dtype="bfloat16",
    device_map="auto",
    use_kernels=True,  # pull MoE kernels from the hub
    quantization_config=quantization_config,
).eval()

# --- Build the prompt ----------------------------------------------------
messages = [
    {"role": "system", "content": "What is Tensor Parallelism?"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="low",
).to("cuda")

max_tokens = 256

# --- Greedy generation, timed with a wall-clock counter ------------------
with torch.inference_mode():
    start_time = time.perf_counter()
    generated = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=False,  # greedy decoding; temperature unset on purpose
        temperature=None,
    )
    end_time = time.perf_counter()

print(tokenizer.decode(generated[0], skip_special_tokens=False))
print(f"Generation took {end_time - start_time:.2f} seconds")
|
megablocks/cells/nv.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import subprocess
|
| 2 |
+
|
| 3 |
+
print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
|
megablocks/megablocks_only.html
CHANGED
|
@@ -3715,7 +3715,74 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3715 |
</div>
|
| 3716 |
|
| 3717 |
<div class="main-content">
|
| 3718 |
-
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3719 |
<p>First, we run the model without any custom kernels to get a reference point.</p>
|
| 3720 |
<h2>Forward</h2>
|
| 3721 |
<div class="cell" id="cell-no_kernels">
|
|
@@ -3725,7 +3792,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3725 |
<span onclick="toggleOutput('no_kernels')" style="cursor: pointer;">▼ output</span>
|
| 3726 |
<span id="uv-indicator-no_kernels" onclick="toggleUvLogsFromHeader('no_kernels')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3727 |
</span> |
|
| 3728 |
-
Cell: no_kernels |
|
| 3729 |
| <button class="run-btn" onclick="runCell('no_kernels')">▶ run</button>
|
| 3730 |
<button class="copy-btn" onclick="copyCell('no_kernels')">Copy</button>
|
| 3731 |
<a href="cells/no_kernels.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3972,37 +4039,37 @@ Downloading cpython-3.13.7-linux-x86_64-gnu (download) (32.0MiB)
|
|
| 3972 |
Downloading cpython-3.13.7-linux-x86_64-gnu (download)
|
| 3973 |
Updating https://github.com/huggingface/transformers.git (HEAD)
|
| 3974 |
Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
|
|
|
|
| 3975 |
Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
|
| 3976 |
-
Downloading nvidia-
|
| 3977 |
-
Downloading
|
| 3978 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3979 |
-
Downloading fonttools (4.7MiB)
|
| 3980 |
Downloading hf-xet (3.0MiB)
|
| 3981 |
-
Downloading
|
| 3982 |
-
Downloading
|
| 3983 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3984 |
-
Downloading triton (148.4MiB)
|
| 3985 |
-
Downloading pygments (1.2MiB)
|
| 3986 |
-
Downloading kiwisolver (1.4MiB)
|
| 3987 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3988 |
Downloading pillow (6.3MiB)
|
| 3989 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3990 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3991 |
-
Downloading sympy (6.0MiB)
|
| 3992 |
-
Downloading jedi (1.5MiB)
|
| 3993 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3994 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3995 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3996 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
|
|
|
|
|
|
|
|
|
| 3997 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3998 |
-
Downloading nvidia-
|
| 3999 |
-
Downloading
|
|
|
|
| 4000 |
Downloading matplotlib (8.3MiB)
|
|
|
|
|
|
|
| 4001 |
Downloading nvidia-cufile-cu12
|
| 4002 |
Downloading kiwisolver
|
| 4003 |
Downloading pygments
|
| 4004 |
-
Downloading tokenizers
|
| 4005 |
Downloading hf-xet
|
|
|
|
| 4006 |
Downloading networkx
|
| 4007 |
Downloading fonttools
|
| 4008 |
Downloading pillow
|
|
@@ -4012,8 +4079,8 @@ Downloading matplotlib (8.3MiB)
|
|
| 4012 |
Downloading sympy
|
| 4013 |
Built transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
|
| 4014 |
Downloading nvidia-nvjitlink-cu12
|
| 4015 |
-
Downloading jedi
|
| 4016 |
Downloading nvidia-curand-cu12
|
|
|
|
| 4017 |
Downloading nvidia-cuda-nvrtc-cu12
|
| 4018 |
Downloading triton
|
| 4019 |
Downloading nvidia-cufft-cu12
|
|
@@ -4024,13 +4091,13 @@ Downloading matplotlib (8.3MiB)
|
|
| 4024 |
Downloading nvidia-cublas-cu12
|
| 4025 |
Downloading nvidia-cudnn-cu12
|
| 4026 |
Downloading torch
|
| 4027 |
-
Installed 69 packages in
|
| 4028 |
</div>
|
| 4029 |
</div>
|
| 4030 |
<div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4031 |
-
Fetching 3 files: 33%|███▎ | 1/3 [00:07<00:
|
| 4032 |
-
Fetching 3 files: 67%|██████▋ | 2/3 [00:
|
| 4033 |
-
Fetching 3 files: 100%|██████████| 3/3 [00:
|
| 4034 |
|
| 4035 |
Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4036 |
Loading checkpoint shards: 33%|███▎ | 1/3 [00:02<00:04, 2.35s/it]
|
|
@@ -4049,7 +4116,7 @@ Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00
|
|
| 4049 |
<span onclick="toggleOutput('forward_and_backward_no_kernel')" style="cursor: pointer;">▼ output</span>
|
| 4050 |
<span id="uv-indicator-forward_and_backward_no_kernel" onclick="toggleUvLogsFromHeader('forward_and_backward_no_kernel')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4051 |
</span> |
|
| 4052 |
-
Cell: forward_and_backward_no_kernel |
|
| 4053 |
| <button class="run-btn" onclick="runCell('forward_and_backward_no_kernel')">▶ run</button>
|
| 4054 |
<button class="copy-btn" onclick="copyCell('forward_and_backward_no_kernel')">Copy</button>
|
| 4055 |
<a href="cells/forward_and_backward_no_kernel.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4475,14 +4542,14 @@ What is Tensor Parallelism?
|
|
| 4475 |
## 1. Why Tensor Parallelism?
|
| 4476 |
|
| 4477 |
- **Memory constraints**: Modern
|
| 4478 |
-
Generation took 13.
|
| 4479 |
Post-generation memory: {'allocated_gb': 9.398670336, 'peak_gb': 9.514059776, 'reserved_gb': 17.188257792}
|
| 4480 |
Enabled gradient checkpointing
|
| 4481 |
Post-forward memory: {'allocated_gb': 9.487933952, 'peak_gb': 9.514059776, 'reserved_gb': 17.188257792}
|
| 4482 |
Loss: 1.9761
|
| 4483 |
Running backward pass...
|
| 4484 |
Pre-backward memory: {'allocated_gb': 9.405890048, 'peak_gb': 9.514059776, 'reserved_gb': 17.177772032}
|
| 4485 |
-
OOM during forward/backward pass: CUDA out of memory. Tried to allocate 508.00 MiB. GPU 2 has a total capacity of 22.30 GiB of which 118.69 MiB is free. Process
|
| 4486 |
Try reducing max_tokens or max_seq_len
|
| 4487 |
</div>
|
| 4488 |
<div class="uv-install-logs" id="uv-logs-forward_and_backward_no_kernel">
|
|
@@ -4492,37 +4559,37 @@ Downloading cpython-3.13.7-linux-x86_64-gnu (download) (32.0MiB)
|
|
| 4492 |
Downloading cpython-3.13.7-linux-x86_64-gnu (download)
|
| 4493 |
Updating https://github.com/huggingface/transformers.git (HEAD)
|
| 4494 |
Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
|
| 4495 |
-
Downloading
|
|
|
|
| 4496 |
Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
|
| 4497 |
Downloading pygments (1.2MiB)
|
| 4498 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4499 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4500 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4501 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4502 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4503 |
Downloading jedi (1.5MiB)
|
| 4504 |
-
Downloading
|
| 4505 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4506 |
Downloading hf-xet (3.0MiB)
|
|
|
|
|
|
|
| 4507 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4508 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4509 |
-
Downloading numpy (15.9MiB)
|
| 4510 |
-
Downloading pillow (6.3MiB)
|
| 4511 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4512 |
-
Downloading
|
| 4513 |
-
Downloading nvidia-
|
|
|
|
| 4514 |
Downloading triton (148.4MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4515 |
Downloading kiwisolver (1.4MiB)
|
| 4516 |
-
Downloading nvidia-
|
| 4517 |
-
Downloading tokenizers (3.1MiB)
|
| 4518 |
-
Downloading matplotlib (8.3MiB)
|
| 4519 |
-
Downloading fonttools (4.7MiB)
|
| 4520 |
-
Downloading torch (846.8MiB)
|
| 4521 |
Downloading nvidia-cufile-cu12
|
| 4522 |
Downloading kiwisolver
|
| 4523 |
Downloading pygments
|
| 4524 |
-
Downloading hf-xet
|
| 4525 |
Downloading tokenizers
|
|
|
|
| 4526 |
Downloading networkx
|
| 4527 |
Downloading fonttools
|
| 4528 |
Downloading pillow
|
|
@@ -4532,28 +4599,28 @@ Downloading torch (846.8MiB)
|
|
| 4532 |
Downloading sympy
|
| 4533 |
Built transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
|
| 4534 |
Downloading nvidia-nvjitlink-cu12
|
| 4535 |
-
Downloading jedi
|
| 4536 |
Downloading nvidia-curand-cu12
|
|
|
|
| 4537 |
Downloading nvidia-cuda-nvrtc-cu12
|
| 4538 |
Downloading triton
|
| 4539 |
Downloading nvidia-cufft-cu12
|
| 4540 |
Downloading nvidia-cusolver-cu12
|
| 4541 |
-
Downloading nvidia-cusparselt-cu12
|
| 4542 |
Downloading nvidia-cusparse-cu12
|
|
|
|
| 4543 |
Downloading nvidia-nccl-cu12
|
| 4544 |
Downloading nvidia-cublas-cu12
|
| 4545 |
Downloading nvidia-cudnn-cu12
|
| 4546 |
Downloading torch
|
| 4547 |
-
Installed 69 packages in
|
| 4548 |
</div>
|
| 4549 |
</div>
|
| 4550 |
<div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4551 |
-
Fetching 3 files: 33%|███▎ | 1/3 [00:
|
| 4552 |
-
Fetching 3 files: 67%|██████▋ | 2/3 [00:08<00:03, 3.
|
| 4553 |
-
Fetching 3 files: 100%|██████████| 3/3 [00:08<00:00, 2.
|
| 4554 |
|
| 4555 |
Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4556 |
-
Loading checkpoint shards: 33%|███▎ | 1/3 [00:02<00:04, 2.
|
| 4557 |
Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04<00:02, 2.25s/it]
|
| 4558 |
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.80s/it]
|
| 4559 |
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.93s/it]
|
|
@@ -4562,14 +4629,14 @@ Traceback (most recent call last):
|
|
| 4562 |
File "/repo/moe_benchmarks/megablocks/.uvnote/cells/forward_and_backward_no_kernel.py", line 154, in <module>
|
| 4563 |
loss.backward()
|
| 4564 |
~~~~~~~~~~~~~^^
|
| 4565 |
-
File "/tmp/uvnote-run-
|
| 4566 |
torch.autograd.backward(
|
| 4567 |
~~~~~~~~~~~~~~~~~~~~~~~^
|
| 4568 |
self, gradient, retain_graph, create_graph, inputs=inputs
|
| 4569 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 4570 |
)
|
| 4571 |
^
|
| 4572 |
-
File "/tmp/uvnote-run-
|
| 4573 |
_engine_run_backward(
|
| 4574 |
~~~~~~~~~~~~~~~~~~~~^
|
| 4575 |
tensors,
|
|
@@ -4579,19 +4646,19 @@ Traceback (most recent call last):
|
|
| 4579 |
^^^^^^^^^^^^^^^^^^^^^
|
| 4580 |
)
|
| 4581 |
^
|
| 4582 |
-
File "/tmp/uvnote-run-
|
| 4583 |
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
|
| 4584 |
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 4585 |
t_outputs, *args, **kwargs
|
| 4586 |
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 4587 |
) # Calls into the C++ engine to run the backward pass
|
| 4588 |
^
|
| 4589 |
-
File "/tmp/uvnote-run-
|
| 4590 |
return user_fn(self, *args)
|
| 4591 |
-
File "/tmp/uvnote-run-
|
| 4592 |
torch.autograd.backward(outputs_with_grad, args_with_grad)
|
| 4593 |
~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 4594 |
-
File "/tmp/uvnote-run-
|
| 4595 |
_engine_run_backward(
|
| 4596 |
~~~~~~~~~~~~~~~~~~~~^
|
| 4597 |
tensors,
|
|
@@ -4601,14 +4668,14 @@ Traceback (most recent call last):
|
|
| 4601 |
^^^^^^^^^^^^^^^^^^^^^
|
| 4602 |
)
|
| 4603 |
^
|
| 4604 |
-
File "/tmp/uvnote-run-
|
| 4605 |
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
|
| 4606 |
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 4607 |
t_outputs, *args, **kwargs
|
| 4608 |
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 4609 |
) # Calls into the C++ engine to run the backward pass
|
| 4610 |
^
|
| 4611 |
-
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 508.00 MiB. GPU 2 has a total capacity of 22.30 GiB of which 118.69 MiB is free. Process
|
| 4612 |
</div>
|
| 4613 |
</div>
|
| 4614 |
|
|
@@ -4616,6 +4683,384 @@ torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 508.00 MiB. GPU 2
|
|
| 4616 |
<p>Next we can run with Megablocks kernels enabled.</p>
|
| 4617 |
<h3>Forward</h3>
|
| 4618 |
<p>First, we run a forward pass with Megablocks kernels.</p>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4619 |
<h2>Forward and Backward</h2>
|
| 4620 |
<p>Next, we run a forward and backward pass with Megablocks kernels enabled. This should be more memory efficient and allow us to complete the backward pass without running out of memory.</p>
|
| 4621 |
<div class="cell" id="cell-forward_and_backward">
|
|
@@ -4625,7 +5070,7 @@ torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 508.00 MiB. GPU 2
|
|
| 4625 |
<span onclick="toggleOutput('forward_and_backward')" style="cursor: pointer;">▼ output</span>
|
| 4626 |
<span id="uv-indicator-forward_and_backward" onclick="toggleUvLogsFromHeader('forward_and_backward')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4627 |
</span> |
|
| 4628 |
-
Cell: forward_and_backward |
|
| 4629 |
| <button class="run-btn" onclick="runCell('forward_and_backward')">▶ run</button>
|
| 4630 |
<button class="copy-btn" onclick="copyCell('forward_and_backward')">Copy</button>
|
| 4631 |
<a href="cells/forward_and_backward.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -5045,7 +5490,7 @@ Reasoning: low
|
|
| 5045 |
What is Tensor Parallelism?
|
| 5046 |
|
| 5047 |
<|end|><|start|>assistant<|channel|>analysis<|message|>We need to explain what Tensor Parallelism is. It's a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices. Provide details: how it works, benefits, challenges, typical frameworks, etc. Also mention difference from data parallelism, pipeline parallelism. Provide example: splitting a weight matrix across GPUs, each GPU holds a slice, compute partial results, then gather. Provide mention of communication overhead, scaling, etc. Also mention that it's used in large models like GPT-3, Megatron-LM, DeepSpeed. Provide references. Also mention that it's
|
| 5048 |
-
Generation took 17.
|
| 5049 |
Post-generation memory: {'allocated_gb': 9.398670336, 'peak_gb': 9.67278848, 'reserved_gb': 17.188257792}
|
| 5050 |
Enabled gradient checkpointing
|
| 5051 |
Post-forward memory: {'allocated_gb': 9.487933952, 'peak_gb': 9.67278848, 'reserved_gb': 17.188257792}
|
|
@@ -5076,85 +5521,86 @@ Downloading cpython-3.13.7-linux-x86_64-gnu (download) (32.0MiB)
|
|
| 5076 |
Downloading cpython-3.13.7-linux-x86_64-gnu (download)
|
| 5077 |
Updating https://github.com/huggingface/transformers.git (HEAD)
|
| 5078 |
Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5079 |
Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
|
|
|
|
|
|
|
|
|
|
| 5080 |
Downloading networkx (1.9MiB)
|
| 5081 |
-
Downloading pygments (1.2MiB)
|
| 5082 |
-
Downloading jedi (1.5MiB)
|
| 5083 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 5084 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 5085 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 5086 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 5087 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 5088 |
-
Downloading numpy (15.9MiB)
|
| 5089 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 5090 |
-
Downloading hf-xet (3.0MiB)
|
| 5091 |
Downloading pillow (6.3MiB)
|
| 5092 |
-
Downloading nvidia-
|
| 5093 |
-
Downloading
|
|
|
|
| 5094 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 5095 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 5096 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 5097 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 5098 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 5099 |
-
Downloading matplotlib (8.3MiB)
|
| 5100 |
-
Downloading tokenizers (3.1MiB)
|
| 5101 |
-
Downloading torch (846.8MiB)
|
| 5102 |
Downloading kiwisolver (1.4MiB)
|
| 5103 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
| 5104 |
Downloading triton (148.4MiB)
|
| 5105 |
Downloading nvidia-cufile-cu12
|
| 5106 |
Downloading kiwisolver
|
| 5107 |
Downloading pygments
|
| 5108 |
-
Downloading networkx
|
| 5109 |
Downloading hf-xet
|
| 5110 |
Downloading tokenizers
|
| 5111 |
-
Downloading
|
| 5112 |
Downloading fonttools
|
| 5113 |
-
Downloading sympy
|
| 5114 |
Downloading pillow
|
| 5115 |
Downloading matplotlib
|
| 5116 |
Downloading nvidia-cuda-cupti-cu12
|
| 5117 |
Downloading numpy
|
| 5118 |
-
|
| 5119 |
Downloading nvidia-nvjitlink-cu12
|
|
|
|
|
|
|
| 5120 |
Downloading nvidia-curand-cu12
|
| 5121 |
Downloading nvidia-cuda-nvrtc-cu12
|
| 5122 |
Downloading triton
|
| 5123 |
Downloading nvidia-cufft-cu12
|
| 5124 |
Downloading nvidia-cusolver-cu12
|
| 5125 |
-
Downloading nvidia-cusparselt-cu12
|
| 5126 |
Downloading nvidia-cusparse-cu12
|
|
|
|
| 5127 |
Downloading nvidia-nccl-cu12
|
| 5128 |
Downloading nvidia-cublas-cu12
|
| 5129 |
Downloading nvidia-cudnn-cu12
|
| 5130 |
Downloading torch
|
| 5131 |
-
Installed 69 packages in
|
| 5132 |
</div>
|
| 5133 |
</div>
|
| 5134 |
<div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s]
|
| 5135 |
-
Fetching 3 files: 33%|███▎ | 1/3 [00:07<00:15, 7.
|
| 5136 |
-
Fetching 3 files: 67%|██████▋ | 2/3 [00:09<00:04, 4.
|
| 5137 |
-
Fetching 3 files: 100%|██████████| 3/3 [00:09<00:00, 3.
|
| 5138 |
You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
|
| 5139 |
|
| 5140 |
Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
|
| 5141 |
-
Loading checkpoint shards: 33%|███▎ | 1/3 [00:02<00:04, 2.
|
| 5142 |
-
Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04<00:02, 2.
|
| 5143 |
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.80s/it]
|
| 5144 |
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.93s/it]
|
| 5145 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5146 |
|
| 5147 |
Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s]
|
| 5148 |
-
Fetching 66 files: 2%|▏ | 1/66 [00:00<00:13, 4.
|
| 5149 |
-
Fetching 66 files: 14%|█▎ | 9/66 [00:00<00:
|
| 5150 |
-
Fetching 66 files:
|
| 5151 |
-
Fetching 66 files:
|
| 5152 |
-
Fetching 66 files:
|
| 5153 |
-
Fetching 66 files:
|
| 5154 |
-
Fetching 66 files:
|
| 5155 |
-
Fetching 66 files:
|
| 5156 |
-
Fetching 66 files:
|
| 5157 |
-
/
|
|
|
|
| 5158 |
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 5159 |
warnings.warn(
|
| 5160 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
|
@@ -5181,7 +5627,7 @@ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks
|
|
| 5181 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5182 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5183 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5184 |
-
/tmp/uvnote-run-
|
| 5185 |
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 5186 |
warnings.warn(
|
| 5187 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
|
@@ -5208,7 +5654,7 @@ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks
|
|
| 5208 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5209 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5210 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5211 |
-
/tmp/uvnote-run-
|
| 5212 |
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 5213 |
warnings.warn(
|
| 5214 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
|
@@ -5236,7 +5682,7 @@ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks
|
|
| 5236 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5237 |
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
|
| 5238 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5239 |
-
/tmp/uvnote-run-
|
| 5240 |
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 5241 |
warnings.warn(
|
| 5242 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
|
|
|
| 3715 |
</div>
|
| 3716 |
|
| 3717 |
<div class="main-content">
|
| 3718 |
+
<div class="cell" id="cell-nv">
|
| 3719 |
+
<div class="cell-header">
|
| 3720 |
+
<span class="collapse-indicators">
|
| 3721 |
+
<span onclick="toggleCode('nv')" style="cursor: pointer;">▼ code</span>
|
| 3722 |
+
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3723 |
+
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3724 |
+
</span> |
|
| 3725 |
+
Cell: nv | 0.71s
|
| 3726 |
+
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3727 |
+
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3728 |
+
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
| 3729 |
+
</div>
|
| 3730 |
+
<div id="code-nv" class="cell-code" data-lines="3">
|
| 3731 |
+
<div class="highlight-with-lines">
|
| 3732 |
+
<div class="line-numbers" id="lines-nv">
|
| 3733 |
+
<a class="line-number" data-cell="nv" data-line="1" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 1, true);">1</a>
|
| 3734 |
+
<a class="line-number" data-cell="nv" data-line="2" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 2, true);">2</a>
|
| 3735 |
+
<a class="line-number" data-cell="nv" data-line="3" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 3, true);">3</a>
|
| 3736 |
+
</div>
|
| 3737 |
+
<div class="code-wrap">
|
| 3738 |
+
<div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">subprocess</span>
|
| 3739 |
+
|
| 3740 |
+
<span class="nb">print</span><span class="p">(</span><span class="n">subprocess</span><span class="o">.</span><span class="n">run</span><span class="p">([</span><span class="s2">"nvidia-smi"</span><span class="p">],</span> <span class="n">capture_output</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">text</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">stdout</span><span class="p">)</span>
|
| 3741 |
+
</pre></div>
|
| 3742 |
+
|
| 3743 |
+
<div class="code-line-highlight" id="line-highlight-nv"></div>
|
| 3744 |
+
</div>
|
| 3745 |
+
</div>
|
| 3746 |
+
</div>
|
| 3747 |
+
<div id="output-nv" class="cell-output">
|
| 3748 |
+
<div class="cell-stdout">Wed Sep 24 20:58:22 2025
|
| 3749 |
+
+-----------------------------------------------------------------------------------------+
|
| 3750 |
+
| NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
|
| 3751 |
+
|-----------------------------------------+------------------------+----------------------+
|
| 3752 |
+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3753 |
+
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3754 |
+
| | | MIG M. |
|
| 3755 |
+
|=========================================+========================+======================|
|
| 3756 |
+
| 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
|
| 3757 |
+
| 0% 32C P8 27W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3758 |
+
| | | N/A |
|
| 3759 |
+
+-----------------------------------------+------------------------+----------------------+
|
| 3760 |
+
| 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
|
| 3761 |
+
| 0% 32C P8 25W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3762 |
+
| | | N/A |
|
| 3763 |
+
+-----------------------------------------+------------------------+----------------------+
|
| 3764 |
+
| 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
|
| 3765 |
+
| 0% 32C P8 28W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3766 |
+
| | | N/A |
|
| 3767 |
+
+-----------------------------------------+------------------------+----------------------+
|
| 3768 |
+
| 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
|
| 3769 |
+
| 0% 32C P8 27W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3770 |
+
| | | N/A |
|
| 3771 |
+
+-----------------------------------------+------------------------+----------------------+
|
| 3772 |
+
|
| 3773 |
+
+-----------------------------------------------------------------------------------------+
|
| 3774 |
+
| Processes: |
|
| 3775 |
+
| GPU GI CI PID Type Process name GPU Memory |
|
| 3776 |
+
| ID ID Usage |
|
| 3777 |
+
|=========================================================================================|
|
| 3778 |
+
| No running processes found |
|
| 3779 |
+
+-----------------------------------------------------------------------------------------+
|
| 3780 |
+
|
| 3781 |
+
</div>
|
| 3782 |
+
</div>
|
| 3783 |
+
</div>
|
| 3784 |
+
|
| 3785 |
+
<h1>No Kernels</h1>
|
| 3786 |
<p>First, we run the model without any custom kernels to get a reference point.</p>
|
| 3787 |
<h2>Forward</h2>
|
| 3788 |
<div class="cell" id="cell-no_kernels">
|
|
|
|
| 3792 |
<span onclick="toggleOutput('no_kernels')" style="cursor: pointer;">▼ output</span>
|
| 3793 |
<span id="uv-indicator-no_kernels" onclick="toggleUvLogsFromHeader('no_kernels')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3794 |
</span> |
|
| 3795 |
+
Cell: no_kernels | 107.24s
|
| 3796 |
| <button class="run-btn" onclick="runCell('no_kernels')">▶ run</button>
|
| 3797 |
<button class="copy-btn" onclick="copyCell('no_kernels')">Copy</button>
|
| 3798 |
<a href="cells/no_kernels.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4039 |
Downloading cpython-3.13.7-linux-x86_64-gnu (download)
|
| 4040 |
Updating https://github.com/huggingface/transformers.git (HEAD)
|
| 4041 |
Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
|
| 4042 |
+
Downloading jedi (1.5MiB)
|
| 4043 |
Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
|
| 4044 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4045 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
|
|
|
|
|
| 4046 |
Downloading hf-xet (3.0MiB)
|
| 4047 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4048 |
+
Downloading sympy (6.0MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4049 |
Downloading pillow (6.3MiB)
|
|
|
|
| 4050 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
|
|
|
|
|
|
| 4051 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4052 |
+
Downloading numpy (15.9MiB)
|
| 4053 |
+
Downloading fonttools (4.7MiB)
|
| 4054 |
+
Downloading networkx (1.9MiB)
|
| 4055 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4056 |
+
Downloading triton (148.4MiB)
|
| 4057 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4058 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4059 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4060 |
+
Downloading tokenizers (3.1MiB)
|
| 4061 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4062 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4063 |
+
Downloading pygments (1.2MiB)
|
| 4064 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4065 |
Downloading matplotlib (8.3MiB)
|
| 4066 |
+
Downloading kiwisolver (1.4MiB)
|
| 4067 |
+
Downloading torch (846.8MiB)
|
| 4068 |
Downloading nvidia-cufile-cu12
|
| 4069 |
Downloading kiwisolver
|
| 4070 |
Downloading pygments
|
|
|
|
| 4071 |
Downloading hf-xet
|
| 4072 |
+
Downloading tokenizers
|
| 4073 |
Downloading networkx
|
| 4074 |
Downloading fonttools
|
| 4075 |
Downloading pillow
|
|
|
|
| 4079 |
Downloading sympy
|
| 4080 |
Built transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
|
| 4081 |
Downloading nvidia-nvjitlink-cu12
|
|
|
|
| 4082 |
Downloading nvidia-curand-cu12
|
| 4083 |
+
Downloading jedi
|
| 4084 |
Downloading nvidia-cuda-nvrtc-cu12
|
| 4085 |
Downloading triton
|
| 4086 |
Downloading nvidia-cufft-cu12
|
|
|
|
| 4091 |
Downloading nvidia-cublas-cu12
|
| 4092 |
Downloading nvidia-cudnn-cu12
|
| 4093 |
Downloading torch
|
| 4094 |
+
Installed 69 packages in 565ms
|
| 4095 |
</div>
|
| 4096 |
</div>
|
| 4097 |
<div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4098 |
+
Fetching 3 files: 33%|███▎ | 1/3 [00:07<00:15, 7.69s/it]
|
| 4099 |
+
Fetching 3 files: 67%|██████▋ | 2/3 [00:09<00:03, 3.95s/it]
|
| 4100 |
+
Fetching 3 files: 100%|██████████| 3/3 [00:09<00:00, 3.00s/it]
|
| 4101 |
|
| 4102 |
Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4103 |
Loading checkpoint shards: 33%|███▎ | 1/3 [00:02<00:04, 2.35s/it]
|
|
|
|
| 4116 |
<span onclick="toggleOutput('forward_and_backward_no_kernel')" style="cursor: pointer;">▼ output</span>
|
| 4117 |
<span id="uv-indicator-forward_and_backward_no_kernel" onclick="toggleUvLogsFromHeader('forward_and_backward_no_kernel')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4118 |
</span> |
|
| 4119 |
+
Cell: forward_and_backward_no_kernel | 99.86s | FAILED
|
| 4120 |
| <button class="run-btn" onclick="runCell('forward_and_backward_no_kernel')">▶ run</button>
|
| 4121 |
<button class="copy-btn" onclick="copyCell('forward_and_backward_no_kernel')">Copy</button>
|
| 4122 |
<a href="cells/forward_and_backward_no_kernel.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4542 |
## 1. Why Tensor Parallelism?
|
| 4543 |
|
| 4544 |
- **Memory constraints**: Modern
|
| 4545 |
+
Generation took 13.15 seconds
|
| 4546 |
Post-generation memory: {'allocated_gb': 9.398670336, 'peak_gb': 9.514059776, 'reserved_gb': 17.188257792}
|
| 4547 |
Enabled gradient checkpointing
|
| 4548 |
Post-forward memory: {'allocated_gb': 9.487933952, 'peak_gb': 9.514059776, 'reserved_gb': 17.188257792}
|
| 4549 |
Loss: 1.9761
|
| 4550 |
Running backward pass...
|
| 4551 |
Pre-backward memory: {'allocated_gb': 9.405890048, 'peak_gb': 9.514059776, 'reserved_gb': 17.177772032}
|
| 4552 |
+
OOM during forward/backward pass: CUDA out of memory. Tried to allocate 508.00 MiB. GPU 2 has a total capacity of 22.30 GiB of which 118.69 MiB is free. Process 34932 has 22.18 GiB memory in use. Of the allocated memory 21.52 GiB is allocated by PyTorch, and 357.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
|
| 4553 |
Try reducing max_tokens or max_seq_len
|
| 4554 |
</div>
|
| 4555 |
<div class="uv-install-logs" id="uv-logs-forward_and_backward_no_kernel">
|
|
|
|
| 4559 |
Downloading cpython-3.13.7-linux-x86_64-gnu (download)
|
| 4560 |
Updating https://github.com/huggingface/transformers.git (HEAD)
|
| 4561 |
Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
|
| 4562 |
+
Downloading numpy (15.9MiB)
|
| 4563 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4564 |
Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
|
| 4565 |
Downloading pygments (1.2MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4566 |
Downloading jedi (1.5MiB)
|
| 4567 |
+
Downloading tokenizers (3.1MiB)
|
|
|
|
| 4568 |
Downloading hf-xet (3.0MiB)
|
| 4569 |
+
Downloading sympy (6.0MiB)
|
| 4570 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4571 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4572 |
+
Downloading fonttools (4.7MiB)
|
| 4573 |
+
Downloading matplotlib (8.3MiB)
|
| 4574 |
+
Downloading networkx (1.9MiB)
|
| 4575 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4576 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
|
|
|
| 4577 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4578 |
+
Downloading torch (846.8MiB)
|
| 4579 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4580 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4581 |
Downloading triton (148.4MiB)
|
| 4582 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4583 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4584 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4585 |
+
Downloading pillow (6.3MiB)
|
| 4586 |
Downloading kiwisolver (1.4MiB)
|
| 4587 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4588 |
Downloading nvidia-cufile-cu12
|
| 4589 |
Downloading kiwisolver
|
| 4590 |
Downloading pygments
|
|
|
|
| 4591 |
Downloading tokenizers
|
| 4592 |
+
Downloading hf-xet
|
| 4593 |
Downloading networkx
|
| 4594 |
Downloading fonttools
|
| 4595 |
Downloading pillow
|
|
|
|
| 4599 |
Downloading sympy
|
| 4600 |
Built transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
|
| 4601 |
Downloading nvidia-nvjitlink-cu12
|
|
|
|
| 4602 |
Downloading nvidia-curand-cu12
|
| 4603 |
+
Downloading jedi
|
| 4604 |
Downloading nvidia-cuda-nvrtc-cu12
|
| 4605 |
Downloading triton
|
| 4606 |
Downloading nvidia-cufft-cu12
|
| 4607 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 4608 |
Downloading nvidia-cusparse-cu12
|
| 4609 |
+
Downloading nvidia-cusparselt-cu12
|
| 4610 |
Downloading nvidia-nccl-cu12
|
| 4611 |
Downloading nvidia-cublas-cu12
|
| 4612 |
Downloading nvidia-cudnn-cu12
|
| 4613 |
Downloading torch
|
| 4614 |
+
Installed 69 packages in 592ms
|
| 4615 |
</div>
|
| 4616 |
</div>
|
| 4617 |
<div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4618 |
+
Fetching 3 files: 33%|███▎ | 1/3 [00:07<00:14, 7.40s/it]
|
| 4619 |
+
Fetching 3 files: 67%|██████▋ | 2/3 [00:08<00:03, 3.77s/it]
|
| 4620 |
+
Fetching 3 files: 100%|██████████| 3/3 [00:08<00:00, 2.88s/it]
|
| 4621 |
|
| 4622 |
Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4623 |
+
Loading checkpoint shards: 33%|███▎ | 1/3 [00:02<00:04, 2.34s/it]
|
| 4624 |
Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04<00:02, 2.25s/it]
|
| 4625 |
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.80s/it]
|
| 4626 |
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.93s/it]
|
|
|
|
| 4629 |
File "/repo/moe_benchmarks/megablocks/.uvnote/cells/forward_and_backward_no_kernel.py", line 154, in <module>
|
| 4630 |
loss.backward()
|
| 4631 |
~~~~~~~~~~~~~^^
|
| 4632 |
+
File "/tmp/uvnote-run-vo30j_xa/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/_tensor.py", line 647, in backward
|
| 4633 |
torch.autograd.backward(
|
| 4634 |
~~~~~~~~~~~~~~~~~~~~~~~^
|
| 4635 |
self, gradient, retain_graph, create_graph, inputs=inputs
|
| 4636 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 4637 |
)
|
| 4638 |
^
|
| 4639 |
+
File "/tmp/uvnote-run-vo30j_xa/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/autograd/__init__.py", line 354, in backward
|
| 4640 |
_engine_run_backward(
|
| 4641 |
~~~~~~~~~~~~~~~~~~~~^
|
| 4642 |
tensors,
|
|
|
|
| 4646 |
^^^^^^^^^^^^^^^^^^^^^
|
| 4647 |
)
|
| 4648 |
^
|
| 4649 |
+
File "/tmp/uvnote-run-vo30j_xa/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/autograd/graph.py", line 829, in _engine_run_backward
|
| 4650 |
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
|
| 4651 |
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 4652 |
t_outputs, *args, **kwargs
|
| 4653 |
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 4654 |
) # Calls into the C++ engine to run the backward pass
|
| 4655 |
^
|
| 4656 |
+
File "/tmp/uvnote-run-vo30j_xa/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/autograd/function.py", line 311, in apply
|
| 4657 |
return user_fn(self, *args)
|
| 4658 |
+
File "/tmp/uvnote-run-vo30j_xa/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/utils/checkpoint.py", line 319, in backward
|
| 4659 |
torch.autograd.backward(outputs_with_grad, args_with_grad)
|
| 4660 |
~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 4661 |
+
File "/tmp/uvnote-run-vo30j_xa/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/autograd/__init__.py", line 354, in backward
|
| 4662 |
_engine_run_backward(
|
| 4663 |
~~~~~~~~~~~~~~~~~~~~^
|
| 4664 |
tensors,
|
|
|
|
| 4668 |
^^^^^^^^^^^^^^^^^^^^^
|
| 4669 |
)
|
| 4670 |
^
|
| 4671 |
+
File "/tmp/uvnote-run-vo30j_xa/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/autograd/graph.py", line 829, in _engine_run_backward
|
| 4672 |
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
|
| 4673 |
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 4674 |
t_outputs, *args, **kwargs
|
| 4675 |
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 4676 |
) # Calls into the C++ engine to run the backward pass
|
| 4677 |
^
|
| 4678 |
+
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 508.00 MiB. GPU 2 has a total capacity of 22.30 GiB of which 118.69 MiB is free. Process 34932 has 22.18 GiB memory in use. Of the allocated memory 21.52 GiB is allocated by PyTorch, and 357.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)</div>
|
| 4679 |
</div>
|
| 4680 |
</div>
|
| 4681 |
|
|
|
|
| 4683 |
<p>Next we can run with Megablocks kernels enabled.</p>
|
| 4684 |
<h3>Forward</h3>
|
| 4685 |
<p>First, we run a forward pass with Megablocks kernels.</p>
|
| 4686 |
+
<div class="cell" id="cell-forward_only">
|
| 4687 |
+
<div class="cell-header">
|
| 4688 |
+
<span class="collapse-indicators">
|
| 4689 |
+
<span onclick="toggleCode('forward_only')" style="cursor: pointer;">▼ code</span>
|
| 4690 |
+
<span onclick="toggleOutput('forward_only')" style="cursor: pointer;">▼ output</span>
|
| 4691 |
+
<span id="uv-indicator-forward_only" onclick="toggleUvLogsFromHeader('forward_only')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4692 |
+
</span> |
|
| 4693 |
+
Cell: forward_only | 114.71s
|
| 4694 |
+
| <button class="run-btn" onclick="runCell('forward_only')">▶ run</button>
|
| 4695 |
+
<button class="copy-btn" onclick="copyCell('forward_only')">Copy</button>
|
| 4696 |
+
<a href="cells/forward_only.py" target="_blank" class="raw-btn">Raw</a>
|
| 4697 |
+
</div>
|
| 4698 |
+
<div id="code-forward_only" class="cell-code" data-lines="101">
|
| 4699 |
+
<div class="highlight-with-lines">
|
| 4700 |
+
<div class="line-numbers" id="lines-forward_only">
|
| 4701 |
+
<a class="line-number" data-cell="forward_only" data-line="1" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 1, true);">1</a>
|
| 4702 |
+
<a class="line-number" data-cell="forward_only" data-line="2" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 2, true);">2</a>
|
| 4703 |
+
<a class="line-number" data-cell="forward_only" data-line="3" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 3, true);">3</a>
|
| 4704 |
+
<a class="line-number" data-cell="forward_only" data-line="4" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 4, true);">4</a>
|
| 4705 |
+
<a class="line-number" data-cell="forward_only" data-line="5" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 5, true);">5</a>
|
| 4706 |
+
<a class="line-number" data-cell="forward_only" data-line="6" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 6, true);">6</a>
|
| 4707 |
+
<a class="line-number" data-cell="forward_only" data-line="7" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 7, true);">7</a>
|
| 4708 |
+
<a class="line-number" data-cell="forward_only" data-line="8" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 8, true);">8</a>
|
| 4709 |
+
<a class="line-number" data-cell="forward_only" data-line="9" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 9, true);">9</a>
|
| 4710 |
+
<a class="line-number" data-cell="forward_only" data-line="10" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 10, true);">10</a>
|
| 4711 |
+
<a class="line-number" data-cell="forward_only" data-line="11" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 11, true);">11</a>
|
| 4712 |
+
<a class="line-number" data-cell="forward_only" data-line="12" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 12, true);">12</a>
|
| 4713 |
+
<a class="line-number" data-cell="forward_only" data-line="13" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 13, true);">13</a>
|
| 4714 |
+
<a class="line-number" data-cell="forward_only" data-line="14" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 14, true);">14</a>
|
| 4715 |
+
<a class="line-number" data-cell="forward_only" data-line="15" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 15, true);">15</a>
|
| 4716 |
+
<a class="line-number" data-cell="forward_only" data-line="16" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 16, true);">16</a>
|
| 4717 |
+
<a class="line-number" data-cell="forward_only" data-line="17" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 17, true);">17</a>
|
| 4718 |
+
<a class="line-number" data-cell="forward_only" data-line="18" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 18, true);">18</a>
|
| 4719 |
+
<a class="line-number" data-cell="forward_only" data-line="19" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 19, true);">19</a>
|
| 4720 |
+
<a class="line-number" data-cell="forward_only" data-line="20" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 20, true);">20</a>
|
| 4721 |
+
<a class="line-number" data-cell="forward_only" data-line="21" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 21, true);">21</a>
|
| 4722 |
+
<a class="line-number" data-cell="forward_only" data-line="22" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 22, true);">22</a>
|
| 4723 |
+
<a class="line-number" data-cell="forward_only" data-line="23" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 23, true);">23</a>
|
| 4724 |
+
<a class="line-number" data-cell="forward_only" data-line="24" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 24, true);">24</a>
|
| 4725 |
+
<a class="line-number" data-cell="forward_only" data-line="25" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 25, true);">25</a>
|
| 4726 |
+
<a class="line-number" data-cell="forward_only" data-line="26" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 26, true);">26</a>
|
| 4727 |
+
<a class="line-number" data-cell="forward_only" data-line="27" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 27, true);">27</a>
|
| 4728 |
+
<a class="line-number" data-cell="forward_only" data-line="28" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 28, true);">28</a>
|
| 4729 |
+
<a class="line-number" data-cell="forward_only" data-line="29" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 29, true);">29</a>
|
| 4730 |
+
<a class="line-number" data-cell="forward_only" data-line="30" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 30, true);">30</a>
|
| 4731 |
+
<a class="line-number" data-cell="forward_only" data-line="31" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 31, true);">31</a>
|
| 4732 |
+
<a class="line-number" data-cell="forward_only" data-line="32" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 32, true);">32</a>
|
| 4733 |
+
<a class="line-number" data-cell="forward_only" data-line="33" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 33, true);">33</a>
|
| 4734 |
+
<a class="line-number" data-cell="forward_only" data-line="34" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 34, true);">34</a>
|
| 4735 |
+
<a class="line-number" data-cell="forward_only" data-line="35" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 35, true);">35</a>
|
| 4736 |
+
<a class="line-number" data-cell="forward_only" data-line="36" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 36, true);">36</a>
|
| 4737 |
+
<a class="line-number" data-cell="forward_only" data-line="37" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 37, true);">37</a>
|
| 4738 |
+
<a class="line-number" data-cell="forward_only" data-line="38" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 38, true);">38</a>
|
| 4739 |
+
<a class="line-number" data-cell="forward_only" data-line="39" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 39, true);">39</a>
|
| 4740 |
+
<a class="line-number" data-cell="forward_only" data-line="40" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 40, true);">40</a>
|
| 4741 |
+
<a class="line-number" data-cell="forward_only" data-line="41" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 41, true);">41</a>
|
| 4742 |
+
<a class="line-number" data-cell="forward_only" data-line="42" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 42, true);">42</a>
|
| 4743 |
+
<a class="line-number" data-cell="forward_only" data-line="43" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 43, true);">43</a>
|
| 4744 |
+
<a class="line-number" data-cell="forward_only" data-line="44" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 44, true);">44</a>
|
| 4745 |
+
<a class="line-number" data-cell="forward_only" data-line="45" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 45, true);">45</a>
|
| 4746 |
+
<a class="line-number" data-cell="forward_only" data-line="46" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 46, true);">46</a>
|
| 4747 |
+
<a class="line-number" data-cell="forward_only" data-line="47" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 47, true);">47</a>
|
| 4748 |
+
<a class="line-number" data-cell="forward_only" data-line="48" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 48, true);">48</a>
|
| 4749 |
+
<a class="line-number" data-cell="forward_only" data-line="49" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 49, true);">49</a>
|
| 4750 |
+
<a class="line-number" data-cell="forward_only" data-line="50" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 50, true);">50</a>
|
| 4751 |
+
<a class="line-number" data-cell="forward_only" data-line="51" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 51, true);">51</a>
|
| 4752 |
+
<a class="line-number" data-cell="forward_only" data-line="52" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 52, true);">52</a>
|
| 4753 |
+
<a class="line-number" data-cell="forward_only" data-line="53" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 53, true);">53</a>
|
| 4754 |
+
<a class="line-number" data-cell="forward_only" data-line="54" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 54, true);">54</a>
|
| 4755 |
+
<a class="line-number" data-cell="forward_only" data-line="55" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 55, true);">55</a>
|
| 4756 |
+
<a class="line-number" data-cell="forward_only" data-line="56" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 56, true);">56</a>
|
| 4757 |
+
<a class="line-number" data-cell="forward_only" data-line="57" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 57, true);">57</a>
|
| 4758 |
+
<a class="line-number" data-cell="forward_only" data-line="58" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 58, true);">58</a>
|
| 4759 |
+
<a class="line-number" data-cell="forward_only" data-line="59" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 59, true);">59</a>
|
| 4760 |
+
<a class="line-number" data-cell="forward_only" data-line="60" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 60, true);">60</a>
|
| 4761 |
+
<a class="line-number" data-cell="forward_only" data-line="61" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 61, true);">61</a>
|
| 4762 |
+
<a class="line-number" data-cell="forward_only" data-line="62" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 62, true);">62</a>
|
| 4763 |
+
<a class="line-number" data-cell="forward_only" data-line="63" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 63, true);">63</a>
|
| 4764 |
+
<a class="line-number" data-cell="forward_only" data-line="64" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 64, true);">64</a>
|
| 4765 |
+
<a class="line-number" data-cell="forward_only" data-line="65" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 65, true);">65</a>
|
| 4766 |
+
<a class="line-number" data-cell="forward_only" data-line="66" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 66, true);">66</a>
|
| 4767 |
+
<a class="line-number" data-cell="forward_only" data-line="67" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 67, true);">67</a>
|
| 4768 |
+
<a class="line-number" data-cell="forward_only" data-line="68" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 68, true);">68</a>
|
| 4769 |
+
<a class="line-number" data-cell="forward_only" data-line="69" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 69, true);">69</a>
|
| 4770 |
+
<a class="line-number" data-cell="forward_only" data-line="70" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 70, true);">70</a>
|
| 4771 |
+
<a class="line-number" data-cell="forward_only" data-line="71" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 71, true);">71</a>
|
| 4772 |
+
<a class="line-number" data-cell="forward_only" data-line="72" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 72, true);">72</a>
|
| 4773 |
+
<a class="line-number" data-cell="forward_only" data-line="73" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 73, true);">73</a>
|
| 4774 |
+
<a class="line-number" data-cell="forward_only" data-line="74" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 74, true);">74</a>
|
| 4775 |
+
<a class="line-number" data-cell="forward_only" data-line="75" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 75, true);">75</a>
|
| 4776 |
+
<a class="line-number" data-cell="forward_only" data-line="76" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 76, true);">76</a>
|
| 4777 |
+
<a class="line-number" data-cell="forward_only" data-line="77" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 77, true);">77</a>
|
| 4778 |
+
<a class="line-number" data-cell="forward_only" data-line="78" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 78, true);">78</a>
|
| 4779 |
+
<a class="line-number" data-cell="forward_only" data-line="79" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 79, true);">79</a>
|
| 4780 |
+
<a class="line-number" data-cell="forward_only" data-line="80" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 80, true);">80</a>
|
| 4781 |
+
<a class="line-number" data-cell="forward_only" data-line="81" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 81, true);">81</a>
|
| 4782 |
+
<a class="line-number" data-cell="forward_only" data-line="82" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 82, true);">82</a>
|
| 4783 |
+
<a class="line-number" data-cell="forward_only" data-line="83" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 83, true);">83</a>
|
| 4784 |
+
<a class="line-number" data-cell="forward_only" data-line="84" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 84, true);">84</a>
|
| 4785 |
+
<a class="line-number" data-cell="forward_only" data-line="85" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 85, true);">85</a>
|
| 4786 |
+
<a class="line-number" data-cell="forward_only" data-line="86" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 86, true);">86</a>
|
| 4787 |
+
<a class="line-number" data-cell="forward_only" data-line="87" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 87, true);">87</a>
|
| 4788 |
+
<a class="line-number" data-cell="forward_only" data-line="88" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 88, true);">88</a>
|
| 4789 |
+
<a class="line-number" data-cell="forward_only" data-line="89" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 89, true);">89</a>
|
| 4790 |
+
<a class="line-number" data-cell="forward_only" data-line="90" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 90, true);">90</a>
|
| 4791 |
+
<a class="line-number" data-cell="forward_only" data-line="91" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 91, true);">91</a>
|
| 4792 |
+
<a class="line-number" data-cell="forward_only" data-line="92" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 92, true);">92</a>
|
| 4793 |
+
<a class="line-number" data-cell="forward_only" data-line="93" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 93, true);">93</a>
|
| 4794 |
+
<a class="line-number" data-cell="forward_only" data-line="94" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 94, true);">94</a>
|
| 4795 |
+
<a class="line-number" data-cell="forward_only" data-line="95" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 95, true);">95</a>
|
| 4796 |
+
<a class="line-number" data-cell="forward_only" data-line="96" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 96, true);">96</a>
|
| 4797 |
+
<a class="line-number" data-cell="forward_only" data-line="97" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 97, true);">97</a>
|
| 4798 |
+
<a class="line-number" data-cell="forward_only" data-line="98" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 98, true);">98</a>
|
| 4799 |
+
<a class="line-number" data-cell="forward_only" data-line="99" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 99, true);">99</a>
|
| 4800 |
+
<a class="line-number" data-cell="forward_only" data-line="100" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 100, true);">100</a>
|
| 4801 |
+
<a class="line-number" data-cell="forward_only" data-line="101" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 101, true);">101</a>
|
| 4802 |
+
</div>
|
| 4803 |
+
<div class="code-wrap">
|
| 4804 |
+
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
| 4805 |
+
<span class="c1"># requires-python = ">=3.12"</span>
|
| 4806 |
+
<span class="c1"># dependencies = [</span>
|
| 4807 |
+
<span class="c1"># "accelerate>=1.10.1",</span>
|
| 4808 |
+
<span class="c1"># "torch>=2.7.0",</span>
|
| 4809 |
+
<span class="c1"># "kernels==0.10.0",</span>
|
| 4810 |
+
<span class="c1"># "transformers@https://github.com/huggingface/transformers.git",</span>
|
| 4811 |
+
<span class="c1"># "ipdb>=0.13.13",</span>
|
| 4812 |
+
<span class="c1"># "matplotlib>=3.7.2",</span>
|
| 4813 |
+
<span class="c1"># "numpy>=1.24.3",</span>
|
| 4814 |
+
<span class="c1"># ]</span>
|
| 4815 |
+
<span class="c1"># ///</span>
|
| 4816 |
+
|
| 4817 |
+
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
|
| 4818 |
+
<span class="kn">from</span><span class="w"> </span><span class="nn">transformers</span><span class="w"> </span><span class="kn">import</span> <span class="n">GptOssForCausalLM</span><span class="p">,</span> <span class="n">PreTrainedTokenizerFast</span><span class="p">,</span> <span class="n">Mxfp4Config</span>
|
| 4819 |
+
<span class="kn">import</span><span class="w"> </span><span class="nn">time</span>
|
| 4820 |
+
<span class="kn">import</span><span class="w"> </span><span class="nn">torch.nn</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">nn</span>
|
| 4821 |
+
<span class="kn">from</span><span class="w"> </span><span class="nn">kernels</span><span class="w"> </span><span class="kn">import</span> <span class="n">register_kernel_mapping</span><span class="p">,</span> <span class="n">Mode</span><span class="p">,</span> <span class="n">LayerRepository</span><span class="p">,</span> <span class="n">replace_kernel_forward_from_hub</span>
|
| 4822 |
+
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
|
| 4823 |
+
<span class="kn">import</span><span class="w"> </span><span class="nn">torch.profiler</span>
|
| 4824 |
+
<span class="kn">import</span><span class="w"> </span><span class="nn">gc</span>
|
| 4825 |
+
<span class="kn">import</span><span class="w"> </span><span class="nn">logging</span>
|
| 4826 |
+
<span class="kn">from</span><span class="w"> </span><span class="nn">transformers.models.gpt_oss.modeling_gpt_oss</span><span class="w"> </span><span class="kn">import</span> <span class="n">GptOssRMSNorm</span>
|
| 4827 |
+
|
| 4828 |
+
|
| 4829 |
+
<span class="n">replace_kernel_forward_from_hub</span><span class="p">(</span><span class="n">GptOssRMSNorm</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
|
| 4830 |
+
|
| 4831 |
+
<span class="c1"># set to debug logging</span>
|
| 4832 |
+
<span class="n">logging</span><span class="o">.</span><span class="n">basicConfig</span><span class="p">(</span><span class="n">level</span><span class="o">=</span><span class="n">logging</span><span class="o">.</span><span class="n">INFO</span><span class="p">)</span>
|
| 4833 |
+
|
| 4834 |
+
<span class="k">def</span><span class="w"> </span><span class="nf">reset_peak_memory_stats</span><span class="p">():</span>
|
| 4835 |
+
<span class="w"> </span><span class="sd">"""Clear CUDA cache and reset memory allocation counters."""</span>
|
| 4836 |
+
<span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">empty_cache</span><span class="p">()</span>
|
| 4837 |
+
<span class="k">if</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">is_available</span><span class="p">():</span>
|
| 4838 |
+
<span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">reset_peak_memory_stats</span><span class="p">()</span>
|
| 4839 |
+
<span class="n">gc</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
|
| 4840 |
+
|
| 4841 |
+
<span class="k">def</span><span class="w"> </span><span class="nf">get_memory_stats</span><span class="p">():</span>
|
| 4842 |
+
<span class="w"> </span><span class="sd">"""Get current and peak CUDA memory usage."""</span>
|
| 4843 |
+
<span class="k">if</span> <span class="ow">not</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">is_available</span><span class="p">():</span>
|
| 4844 |
+
<span class="k">return</span> <span class="p">{</span><span class="s2">"allocated_gb"</span><span class="p">:</span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"peak_gb"</span><span class="p">:</span> <span class="mi">0</span><span class="p">,</span> <span class="s2">"reserved_gb"</span><span class="p">:</span> <span class="mi">0</span><span class="p">}</span>
|
| 4845 |
+
<span class="k">return</span> <span class="p">{</span>
|
| 4846 |
+
<span class="s2">"allocated_gb"</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">memory_allocated</span><span class="p">()</span> <span class="o">/</span> <span class="mf">1e9</span><span class="p">,</span>
|
| 4847 |
+
<span class="s2">"peak_gb"</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">max_memory_allocated</span><span class="p">()</span> <span class="o">/</span> <span class="mf">1e9</span><span class="p">,</span>
|
| 4848 |
+
<span class="s2">"reserved_gb"</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">memory_reserved</span><span class="p">()</span> <span class="o">/</span> <span class="mf">1e9</span><span class="p">,</span>
|
| 4849 |
+
<span class="p">}</span>
|
| 4850 |
+
|
| 4851 |
+
<span class="k">def</span><span class="w"> </span><span class="nf">override_kernel_layer_name</span><span class="p">(</span><span class="n">cls_name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span>
|
| 4852 |
+
<span class="w"> </span><span class="sd">"""Helper to dynamically override the kernel_layer_name in a model class."""</span>
|
| 4853 |
+
<span class="k">for</span> <span class="n">mod</span> <span class="ow">in</span> <span class="n">sys</span><span class="o">.</span><span class="n">modules</span><span class="o">.</span><span class="n">values</span><span class="p">():</span>
|
| 4854 |
+
<span class="k">if</span> <span class="n">mod</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
|
| 4855 |
+
<span class="k">continue</span>
|
| 4856 |
+
<span class="n">obj</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">mod</span><span class="p">,</span> <span class="n">cls_name</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
|
| 4857 |
+
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="nb">type</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">issubclass</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span>
|
| 4858 |
+
<span class="nb">setattr</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="s2">"kernel_layer_name"</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span>
|
| 4859 |
+
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Overrode </span><span class="si">{</span><span class="n">cls_name</span><span class="si">}</span><span class="s2">.kernel_layer_name to </span><span class="si">{</span><span class="n">value</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
|
| 4860 |
+
<span class="k">return</span> <span class="kc">True</span>
|
| 4861 |
+
<span class="k">return</span> <span class="kc">False</span>
|
| 4862 |
+
|
| 4863 |
+
|
| 4864 |
+
<span class="c1"># Init the model the normal way</span>
|
| 4865 |
+
<span class="n">model_id</span> <span class="o">=</span> <span class="s2">"openai/gpt-oss-20b"</span>
|
| 4866 |
+
<span class="n">tokenizer</span> <span class="o">=</span> <span class="n">PreTrainedTokenizerFast</span><span class="o">.</span><span class="n">from_pretrained</span><span class="p">(</span><span class="n">model_id</span><span class="p">)</span>
|
| 4867 |
+
<span class="n">quantization_config</span> <span class="o">=</span> <span class="n">Mxfp4Config</span><span class="p">(</span><span class="n">dequantize</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
|
| 4868 |
+
|
| 4869 |
+
|
| 4870 |
+
|
| 4871 |
+
<span class="n">model</span> <span class="o">=</span> <span class="n">GptOssForCausalLM</span><span class="o">.</span><span class="n">from_pretrained</span><span class="p">(</span>
|
| 4872 |
+
<span class="n">model_id</span><span class="p">,</span>
|
| 4873 |
+
<span class="n">dtype</span><span class="o">=</span><span class="s2">"bfloat16"</span><span class="p">,</span>
|
| 4874 |
+
<span class="n">device_map</span><span class="o">=</span><span class="s2">"auto"</span><span class="p">,</span>
|
| 4875 |
+
<span class="n">use_kernels</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
|
| 4876 |
+
<span class="n">quantization_config</span><span class="o">=</span><span class="n">quantization_config</span><span class="p">,</span>
|
| 4877 |
+
<span class="p">)</span><span class="o">.</span><span class="n">eval</span><span class="p">()</span>
|
| 4878 |
+
|
| 4879 |
+
<span class="n">messages</span> <span class="o">=</span> <span class="p">[</span>
|
| 4880 |
+
<span class="p">{</span><span class="s2">"role"</span><span class="p">:</span> <span class="s2">"system"</span><span class="p">,</span> <span class="s2">"content"</span><span class="p">:</span> <span class="s2">"What is Tensor Parallelism?"</span><span class="p">},</span>
|
| 4881 |
+
<span class="p">]</span>
|
| 4882 |
+
|
| 4883 |
+
<span class="n">inputs</span> <span class="o">=</span> <span class="n">tokenizer</span><span class="o">.</span><span class="n">apply_chat_template</span><span class="p">(</span>
|
| 4884 |
+
<span class="n">messages</span><span class="p">,</span>
|
| 4885 |
+
<span class="n">add_generation_prompt</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
|
| 4886 |
+
<span class="n">return_tensors</span><span class="o">=</span><span class="s2">"pt"</span><span class="p">,</span>
|
| 4887 |
+
<span class="n">return_dict</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
|
| 4888 |
+
<span class="n">reasoning_effort</span><span class="o">=</span><span class="s2">"low"</span><span class="p">,</span>
|
| 4889 |
+
<span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="s2">"cuda"</span><span class="p">)</span>
|
| 4890 |
+
|
| 4891 |
+
<span class="n">max_tokens</span> <span class="o">=</span> <span class="mi">256</span>
|
| 4892 |
+
|
| 4893 |
+
<span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">inference_mode</span><span class="p">():</span>
|
| 4894 |
+
<span class="n">start_time</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">perf_counter</span><span class="p">()</span>
|
| 4895 |
+
<span class="n">generated</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">generate</span><span class="p">(</span>
|
| 4896 |
+
<span class="o">**</span><span class="n">inputs</span><span class="p">,</span>
|
| 4897 |
+
<span class="n">max_new_tokens</span><span class="o">=</span><span class="n">max_tokens</span><span class="p">,</span>
|
| 4898 |
+
<span class="n">do_sample</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
|
| 4899 |
+
<span class="n">temperature</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
|
| 4900 |
+
<span class="p">)</span>
|
| 4901 |
+
<span class="n">end_time</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">perf_counter</span><span class="p">()</span>
|
| 4902 |
+
|
| 4903 |
+
<span class="nb">print</span><span class="p">(</span><span class="n">tokenizer</span><span class="o">.</span><span class="n">decode</span><span class="p">(</span><span class="n">generated</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">skip_special_tokens</span><span class="o">=</span><span class="kc">False</span><span class="p">))</span>
|
| 4904 |
+
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Generation took </span><span class="si">{</span><span class="n">end_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">start_time</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2"> seconds"</span><span class="p">)</span>
|
| 4905 |
+
</pre></div>
|
| 4906 |
+
|
| 4907 |
+
<div class="code-line-highlight" id="line-highlight-forward_only"></div>
|
| 4908 |
+
</div>
|
| 4909 |
+
</div>
|
| 4910 |
+
</div>
|
| 4911 |
+
<div id="output-forward_only" class="cell-output">
|
| 4912 |
+
<div class="cell-stdout"><|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
|
| 4913 |
+
Knowledge cutoff: 2024-06
|
| 4914 |
+
Current date: 2025-09-24
|
| 4915 |
+
|
| 4916 |
+
Reasoning: low
|
| 4917 |
+
|
| 4918 |
+
# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>developer<|message|># Instructions
|
| 4919 |
+
|
| 4920 |
+
What is Tensor Parallelism?
|
| 4921 |
+
|
| 4922 |
+
<|end|><|start|>assistant<|channel|>analysis<|message|>We need to explain what Tensor Parallelism is. It's a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices. Provide details: how it works, benefits, challenges, typical frameworks, etc. Also mention difference from data parallelism, pipeline parallelism. Provide example: splitting a weight matrix across GPUs, each GPU holds a slice, compute partial results, then gather. Provide mention of communication overhead, scaling, etc. Also mention that it's used in large models like GPT-3, Megatron-LM, DeepSpeed. Provide references. Also mention that it's also called model parallelism. Provide explanation of how it works in practice: e.g., for a linear layer, weight matrix W of shape (out_features, in_features). In tensor parallelism, split W along out_features dimension across GPUs. Each GPU computes partial output. Then gather outputs. Provide details on how to handle bias, etc. Provide mention of "tensor model parallelism" vs "tensor parallelism" synonyms. Provide mention of "tensor parallelism" in Megatron-LM: splitting weight matrices across GPUs. Provide mention of "tensor parallelism" in DeepSpeed: "ZeRO-Offload" etc. Provide mention
|
| 4923 |
+
Generation took 31.31 seconds
|
| 4924 |
+
</div>
|
| 4925 |
+
<div class="uv-install-logs" id="uv-logs-forward_only">
|
| 4926 |
+
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4927 |
+
<div class="uv-logs-content" style="display: none;">
|
| 4928 |
+
Downloading cpython-3.13.7-linux-x86_64-gnu (download) (32.0MiB)
|
| 4929 |
+
Downloading cpython-3.13.7-linux-x86_64-gnu (download)
|
| 4930 |
+
Updating https://github.com/huggingface/transformers.git (HEAD)
|
| 4931 |
+
Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
|
| 4932 |
+
Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
|
| 4933 |
+
Downloading pygments (1.2MiB)
|
| 4934 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4935 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4936 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4937 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4938 |
+
Downloading hf-xet (3.0MiB)
|
| 4939 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4940 |
+
Downloading numpy (15.9MiB)
|
| 4941 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4942 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4943 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4944 |
+
Downloading pillow (6.3MiB)
|
| 4945 |
+
Downloading networkx (1.9MiB)
|
| 4946 |
+
Downloading sympy (6.0MiB)
|
| 4947 |
+
Downloading tokenizers (3.1MiB)
|
| 4948 |
+
Downloading jedi (1.5MiB)
|
| 4949 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4950 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4951 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4952 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4953 |
+
Downloading fonttools (4.7MiB)
|
| 4954 |
+
Downloading torch (846.8MiB)
|
| 4955 |
+
Downloading matplotlib (8.3MiB)
|
| 4956 |
+
Downloading kiwisolver (1.4MiB)
|
| 4957 |
+
Downloading triton (148.4MiB)
|
| 4958 |
+
Downloading nvidia-cufile-cu12
|
| 4959 |
+
Downloading kiwisolver
|
| 4960 |
+
Downloading pygments
|
| 4961 |
+
Downloading hf-xet
|
| 4962 |
+
Downloading tokenizers
|
| 4963 |
+
Downloading networkx
|
| 4964 |
+
Downloading fonttools
|
| 4965 |
+
Downloading pillow
|
| 4966 |
+
Downloading matplotlib
|
| 4967 |
+
Downloading nvidia-cuda-cupti-cu12
|
| 4968 |
+
Downloading numpy
|
| 4969 |
+
Downloading sympy
|
| 4970 |
+
Built transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
|
| 4971 |
+
Downloading nvidia-nvjitlink-cu12
|
| 4972 |
+
Downloading jedi
|
| 4973 |
+
Downloading nvidia-curand-cu12
|
| 4974 |
+
Downloading nvidia-cuda-nvrtc-cu12
|
| 4975 |
+
Downloading triton
|
| 4976 |
+
Downloading nvidia-cufft-cu12
|
| 4977 |
+
Downloading nvidia-cusolver-cu12
|
| 4978 |
+
Downloading nvidia-cusparse-cu12
|
| 4979 |
+
Downloading nvidia-cusparselt-cu12
|
| 4980 |
+
Downloading nvidia-nccl-cu12
|
| 4981 |
+
Downloading nvidia-cublas-cu12
|
| 4982 |
+
Downloading nvidia-cudnn-cu12
|
| 4983 |
+
Downloading torch
|
| 4984 |
+
Installed 69 packages in 454ms
|
| 4985 |
+
</div>
|
| 4986 |
+
</div>
|
| 4987 |
+
<div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4988 |
+
Fetching 3 files: 33%|███▎ | 1/3 [00:07<00:14, 7.39s/it]
|
| 4989 |
+
Fetching 3 files: 67%|██████▋ | 2/3 [00:08<00:03, 3.78s/it]
|
| 4990 |
+
Fetching 3 files: 100%|██████████| 3/3 [00:08<00:00, 2.88s/it]
|
| 4991 |
+
You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
|
| 4992 |
+
|
| 4993 |
+
Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4994 |
+
Loading checkpoint shards: 33%|███▎ | 1/3 [00:02<00:04, 2.34s/it]
|
| 4995 |
+
Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04<00:02, 2.25s/it]
|
| 4996 |
+
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.80s/it]
|
| 4997 |
+
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.93s/it]
|
| 4998 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 4999 |
+
|
| 5000 |
+
Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s]
|
| 5001 |
+
Fetching 66 files: 2%|▏ | 1/66 [00:00<00:10, 6.01it/s]
|
| 5002 |
+
Fetching 66 files: 14%|█▎ | 9/66 [00:00<00:01, 31.85it/s]
|
| 5003 |
+
Fetching 66 files: 20%|█▉ | 13/66 [00:00<00:02, 24.06it/s]
|
| 5004 |
+
Fetching 66 files: 26%|██▌ | 17/66 [00:01<00:03, 12.48it/s]
|
| 5005 |
+
Fetching 66 files: 74%|███████▍ | 49/66 [00:01<00:00, 53.80it/s]
|
| 5006 |
+
Fetching 66 files: 91%|█████████ | 60/66 [00:01<00:00, 57.68it/s]
|
| 5007 |
+
Fetching 66 files: 100%|██████████| 66/66 [00:01<00:00, 40.58it/s]
|
| 5008 |
+
/tmp/uvnote-run-_tyh_wp6/home/.cache/uv/environments-v2/forward-only-504a4941eac030a5/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
|
| 5009 |
+
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 5010 |
+
warnings.warn(
|
| 5011 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5012 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5013 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5014 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5015 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5016 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5017 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5018 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5019 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5020 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5021 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5022 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5023 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5024 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5025 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5026 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5027 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5028 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5029 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5030 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5031 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5032 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5033 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5034 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5035 |
+
/tmp/uvnote-run-_tyh_wp6/home/.cache/uv/environments-v2/forward-only-504a4941eac030a5/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
|
| 5036 |
+
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 5037 |
+
warnings.warn(
|
| 5038 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5039 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5040 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5041 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5042 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5043 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5044 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5045 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5046 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5047 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5048 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5049 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5050 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5051 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5052 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5053 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5054 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5055 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5056 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5057 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5058 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5059 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5060 |
+
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`</div>
|
| 5061 |
+
</div>
|
| 5062 |
+
</div>
|
| 5063 |
+
|
| 5064 |
<h2>Forward and Backward</h2>
|
| 5065 |
<p>Next, we run a forward and backward pass with Megablocks kernels enabled. This should be more memory efficient and allow us to complete the backward pass without running out of memory.</p>
|
| 5066 |
<div class="cell" id="cell-forward_and_backward">
|
|
|
|
| 5070 |
<span onclick="toggleOutput('forward_and_backward')" style="cursor: pointer;">▼ output</span>
|
| 5071 |
<span id="uv-indicator-forward_and_backward" onclick="toggleUvLogsFromHeader('forward_and_backward')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5072 |
</span> |
|
| 5073 |
+
Cell: forward_and_backward | 104.79s
|
| 5074 |
| <button class="run-btn" onclick="runCell('forward_and_backward')">▶ run</button>
|
| 5075 |
<button class="copy-btn" onclick="copyCell('forward_and_backward')">Copy</button>
|
| 5076 |
<a href="cells/forward_and_backward.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 5490 |
What is Tensor Parallelism?
|
| 5491 |
|
| 5492 |
<|end|><|start|>assistant<|channel|>analysis<|message|>We need to explain what Tensor Parallelism is. It's a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices. Provide details: how it works, benefits, challenges, typical frameworks, etc. Also mention difference from data parallelism, pipeline parallelism. Provide example: splitting a weight matrix across GPUs, each GPU holds a slice, compute partial results, then gather. Provide mention of communication overhead, scaling, etc. Also mention that it's used in large models like GPT-3, Megatron-LM, DeepSpeed. Provide references. Also mention that it's
|
| 5493 |
+
Generation took 17.98 seconds
|
| 5494 |
Post-generation memory: {'allocated_gb': 9.398670336, 'peak_gb': 9.67278848, 'reserved_gb': 17.188257792}
|
| 5495 |
Enabled gradient checkpointing
|
| 5496 |
Post-forward memory: {'allocated_gb': 9.487933952, 'peak_gb': 9.67278848, 'reserved_gb': 17.188257792}
|
|
|
|
| 5521 |
Downloading cpython-3.13.7-linux-x86_64-gnu (download)
|
| 5522 |
Updating https://github.com/huggingface/transformers.git (HEAD)
|
| 5523 |
Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
|
| 5524 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 5525 |
+
Downloading numpy (15.9MiB)
|
| 5526 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 5527 |
+
Downloading hf-xet (3.0MiB)
|
| 5528 |
+
Downloading sympy (6.0MiB)
|
| 5529 |
+
Downloading jedi (1.5MiB)
|
| 5530 |
+
Downloading pygments (1.2MiB)
|
| 5531 |
Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
|
| 5532 |
+
Downloading fonttools (4.7MiB)
|
| 5533 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 5534 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 5535 |
Downloading networkx (1.9MiB)
|
|
|
|
|
|
|
| 5536 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
| 5537 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5538 |
Downloading pillow (6.3MiB)
|
| 5539 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 5540 |
+
Downloading matplotlib (8.3MiB)
|
| 5541 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 5542 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
|
|
|
|
|
|
| 5543 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5544 |
Downloading kiwisolver (1.4MiB)
|
| 5545 |
+
Downloading torch (846.8MiB)
|
| 5546 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 5547 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 5548 |
+
Downloading tokenizers (3.1MiB)
|
| 5549 |
Downloading triton (148.4MiB)
|
| 5550 |
Downloading nvidia-cufile-cu12
|
| 5551 |
Downloading kiwisolver
|
| 5552 |
Downloading pygments
|
|
|
|
| 5553 |
Downloading hf-xet
|
| 5554 |
Downloading tokenizers
|
| 5555 |
+
Downloading networkx
|
| 5556 |
Downloading fonttools
|
|
|
|
| 5557 |
Downloading pillow
|
| 5558 |
Downloading matplotlib
|
| 5559 |
Downloading nvidia-cuda-cupti-cu12
|
| 5560 |
Downloading numpy
|
| 5561 |
+
Downloading sympy
|
| 5562 |
Downloading nvidia-nvjitlink-cu12
|
| 5563 |
+
Built transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
|
| 5564 |
+
Downloading jedi
|
| 5565 |
Downloading nvidia-curand-cu12
|
| 5566 |
Downloading nvidia-cuda-nvrtc-cu12
|
| 5567 |
Downloading triton
|
| 5568 |
Downloading nvidia-cufft-cu12
|
| 5569 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 5570 |
Downloading nvidia-cusparse-cu12
|
| 5571 |
+
Downloading nvidia-cusparselt-cu12
|
| 5572 |
Downloading nvidia-nccl-cu12
|
| 5573 |
Downloading nvidia-cublas-cu12
|
| 5574 |
Downloading nvidia-cudnn-cu12
|
| 5575 |
Downloading torch
|
| 5576 |
+
Installed 69 packages in 506ms
|
| 5577 |
</div>
|
| 5578 |
</div>
|
| 5579 |
<div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s]
|
| 5580 |
+
Fetching 3 files: 33%|███▎ | 1/3 [00:07<00:15, 7.79s/it]
|
| 5581 |
+
Fetching 3 files: 67%|██████▋ | 2/3 [00:09<00:04, 4.50s/it]
|
| 5582 |
+
Fetching 3 files: 100%|██████████| 3/3 [00:09<00:00, 3.33s/it]
|
| 5583 |
You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
|
| 5584 |
|
| 5585 |
Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
|
| 5586 |
+
Loading checkpoint shards: 33%|███▎ | 1/3 [00:02<00:04, 2.36s/it]
|
| 5587 |
+
Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04<00:02, 2.25s/it]
|
| 5588 |
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.80s/it]
|
| 5589 |
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.93s/it]
|
| 5590 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5591 |
|
| 5592 |
Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s]
|
| 5593 |
+
Fetching 66 files: 2%|▏ | 1/66 [00:00<00:13, 4.68it/s]
|
| 5594 |
+
Fetching 66 files: 14%|█▎ | 9/66 [00:00<00:02, 26.64it/s]
|
| 5595 |
+
Fetching 66 files: 21%|██ | 14/66 [00:00<00:01, 33.33it/s]
|
| 5596 |
+
Fetching 66 files: 27%|██▋ | 18/66 [00:00<00:02, 17.59it/s]
|
| 5597 |
+
Fetching 66 files: 53%|█████▎ | 35/66 [00:01<00:00, 43.25it/s]
|
| 5598 |
+
Fetching 66 files: 64%|██████▎ | 42/66 [00:01<00:00, 43.72it/s]
|
| 5599 |
+
Fetching 66 files: 74%|███████▍ | 49/66 [00:01<00:00, 40.60it/s]
|
| 5600 |
+
Fetching 66 files: 85%|████████▍ | 56/66 [00:01<00:00, 42.33it/s]
|
| 5601 |
+
Fetching 66 files: 95%|█████████▌| 63/66 [00:01<00:00, 38.03it/s]
|
| 5602 |
+
Fetching 66 files: 100%|██████████| 66/66 [00:01<00:00, 36.73it/s]
|
| 5603 |
+
/tmp/uvnote-run-n1rg0p87/home/.cache/uv/environments-v2/forward-and-backward-422cb4863433d14c/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
|
| 5604 |
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 5605 |
warnings.warn(
|
| 5606 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
|
|
|
| 5627 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5628 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5629 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5630 |
+
/tmp/uvnote-run-n1rg0p87/home/.cache/uv/environments-v2/forward-and-backward-422cb4863433d14c/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
|
| 5631 |
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 5632 |
warnings.warn(
|
| 5633 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
|
|
|
| 5654 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5655 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5656 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5657 |
+
/tmp/uvnote-run-n1rg0p87/home/.cache/uv/environments-v2/forward-and-backward-422cb4863433d14c/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
|
| 5658 |
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 5659 |
warnings.warn(
|
| 5660 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
|
|
|
| 5682 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5683 |
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
|
| 5684 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 5685 |
+
/tmp/uvnote-run-n1rg0p87/home/.cache/uv/environments-v2/forward-and-backward-422cb4863433d14c/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
|
| 5686 |
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 5687 |
warnings.warn(
|
| 5688 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
megablocks_yamoe/artifacts/binned_run/binned_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms":
|
| 13 |
-
"min_ms":
|
| 14 |
-
"max_ms":
|
| 15 |
-
"std_ms": 1.
|
| 16 |
-
"p50_ms": 36.
|
| 17 |
-
"p95_ms":
|
| 18 |
-
"p99_ms": 39.
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
"output_sum": 3.97190523147583
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 35.832872119995045,
|
| 13 |
+
"min_ms": 32.58174399991276,
|
| 14 |
+
"max_ms": 40.50060700001268,
|
| 15 |
+
"std_ms": 1.694341573523051,
|
| 16 |
+
"p50_ms": 36.17695449997882,
|
| 17 |
+
"p95_ms": 38.67062735003515,
|
| 18 |
+
"p99_ms": 39.92923416996405,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 2790.733594145783,
|
| 21 |
+
"throughput_variance": 131.29596945634063
|
| 22 |
},
|
| 23 |
"output_sum": 3.97190523147583
|
| 24 |
}
|
megablocks_yamoe/artifacts/gptoss_run/gptoss_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms":
|
| 13 |
-
"min_ms": 39.
|
| 14 |
-
"max_ms":
|
| 15 |
-
"std_ms":
|
| 16 |
-
"p50_ms":
|
| 17 |
-
"p95_ms":
|
| 18 |
-
"p99_ms":
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
"output_sum": 11.53223705291748
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 46.790802699997585,
|
| 13 |
+
"min_ms": 39.03555299996242,
|
| 14 |
+
"max_ms": 50.85692799991648,
|
| 15 |
+
"std_ms": 3.250858562771192,
|
| 16 |
+
"p50_ms": 47.475618500016026,
|
| 17 |
+
"p95_ms": 50.805645549957035,
|
| 18 |
+
"p99_ms": 50.83896361993766,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 2137.172141310693,
|
| 21 |
+
"throughput_variance": 155.17201487457513
|
| 22 |
},
|
| 23 |
"output_sum": 11.53223705291748
|
| 24 |
}
|
megablocks_yamoe/artifacts/gptoss_training_run/gptoss_training_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms":
|
| 13 |
-
"min_ms": 38.
|
| 14 |
-
"max_ms": 49.
|
| 15 |
-
"std_ms": 2.
|
| 16 |
-
"p50_ms": 45.
|
| 17 |
-
"p95_ms": 48.
|
| 18 |
-
"p99_ms": 48.
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
"output_sum": 11.53223705291748
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 45.006849599990346,
|
| 13 |
+
"min_ms": 38.83674200005771,
|
| 14 |
+
"max_ms": 49.30821800007834,
|
| 15 |
+
"std_ms": 2.893955494967115,
|
| 16 |
+
"p50_ms": 45.57549300000119,
|
| 17 |
+
"p95_ms": 48.57250854988706,
|
| 18 |
+
"p99_ms": 48.963614720073565,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 2221.8840218494533,
|
| 21 |
+
"throughput_variance": 147.8630259637854
|
| 22 |
},
|
| 23 |
"output_sum": 11.53223705291748
|
| 24 |
}
|
megablocks_yamoe/artifacts/yamoe_run/yamoe_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms": 4.
|
| 13 |
-
"min_ms": 4.
|
| 14 |
-
"max_ms": 4.
|
| 15 |
-
"std_ms": 0.
|
| 16 |
-
"p50_ms": 4.
|
| 17 |
-
"p95_ms": 4.
|
| 18 |
-
"p99_ms": 4.
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
-
"output_sum": 3.
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 4.2496077999999216,
|
| 13 |
+
"min_ms": 4.143714000065302,
|
| 14 |
+
"max_ms": 4.276272000083736,
|
| 15 |
+
"std_ms": 0.02026809704303406,
|
| 16 |
+
"p50_ms": 4.251974999931463,
|
| 17 |
+
"p95_ms": 4.269103000035557,
|
| 18 |
+
"p99_ms": 4.276041210073345,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 23531.58331458302,
|
| 21 |
+
"throughput_variance": 113.86151920477748
|
| 22 |
},
|
| 23 |
+
"output_sum": 3.97190523147583
|
| 24 |
}
|
megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc
CHANGED
|
Binary files a/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc and b/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc differ
|
|
|
megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc
CHANGED
|
Binary files a/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc and b/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc differ
|
|
|
megablocks_yamoe/cells/megablocks_run.py
CHANGED
|
@@ -56,7 +56,7 @@ def build_megablocks_model(device: torch.device):
|
|
| 56 |
# Attach loaded expert weights to the experts container
|
| 57 |
e = model.experts
|
| 58 |
e.alpha = 1.702
|
| 59 |
-
e.capacity_factor =
|
| 60 |
e.gate_up_proj = torch.nn.Parameter(gate_up_proj.clone().to(device))
|
| 61 |
e.gate_up_proj_bias = torch.nn.Parameter(gate_up_proj_bias.clone().to(device))
|
| 62 |
e.down_proj = torch.nn.Parameter(down_proj.clone().to(device))
|
|
|
|
| 56 |
# Attach loaded expert weights to the experts container
|
| 57 |
e = model.experts
|
| 58 |
e.alpha = 1.702
|
| 59 |
+
e.capacity_factor = 64
|
| 60 |
e.gate_up_proj = torch.nn.Parameter(gate_up_proj.clone().to(device))
|
| 61 |
e.gate_up_proj_bias = torch.nn.Parameter(gate_up_proj_bias.clone().to(device))
|
| 62 |
e.down_proj = torch.nn.Parameter(down_proj.clone().to(device))
|
megablocks_yamoe/megablocks_yamoe.html
CHANGED
|
@@ -3722,7 +3722,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3722 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3723 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3724 |
</span> |
|
| 3725 |
-
Cell: nv | 0.
|
| 3726 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3727 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3728 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3745,7 +3745,7 @@ Cell: nv | 0.54s
|
|
| 3745 |
</div>
|
| 3746 |
</div>
|
| 3747 |
<div id="output-nv" class="cell-output">
|
| 3748 |
-
<div class="cell-stdout">Wed Sep 24
|
| 3749 |
+-----------------------------------------------------------------------------------------+
|
| 3750 |
| NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
|
| 3751 |
|-----------------------------------------+------------------------+----------------------+
|
|
@@ -3754,19 +3754,19 @@ Cell: nv | 0.54s
|
|
| 3754 |
| | | MIG M. |
|
| 3755 |
|=========================================+========================+======================|
|
| 3756 |
| 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
|
| 3757 |
-
| 0%
|
| 3758 |
| | | N/A |
|
| 3759 |
+-----------------------------------------+------------------------+----------------------+
|
| 3760 |
| 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
|
| 3761 |
-
| 0%
|
| 3762 |
| | | N/A |
|
| 3763 |
+-----------------------------------------+------------------------+----------------------+
|
| 3764 |
| 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
|
| 3765 |
-
| 0%
|
| 3766 |
| | | N/A |
|
| 3767 |
+-----------------------------------------+------------------------+----------------------+
|
| 3768 |
| 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
|
| 3769 |
-
| 0%
|
| 3770 |
| | | N/A |
|
| 3771 |
+-----------------------------------------+------------------------+----------------------+
|
| 3772 |
|
|
@@ -3792,7 +3792,7 @@ Cell: nv | 0.54s
|
|
| 3792 |
<span onclick="toggleOutput('setup2')" style="cursor: pointer;">▼ output</span>
|
| 3793 |
<span id="uv-indicator-setup2" onclick="toggleUvLogsFromHeader('setup2')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3794 |
</span> |
|
| 3795 |
-
Cell: setup2 | 113.
|
| 3796 |
| <button class="run-btn" onclick="runCell('setup2')">▶ run</button>
|
| 3797 |
<button class="copy-btn" onclick="copyCell('setup2')">Copy</button>
|
| 3798 |
<a href="cells/setup2.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4050,7 +4050,7 @@ Reasoning: low
|
|
| 4050 |
What is Tensor Parallelism?
|
| 4051 |
|
| 4052 |
<|end|><|start|>assistant<|channel|>analysis<|message|>We need to explain what Tensor Parallelism is. It's a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices. Provide details: how it works, benefits, challenges, typical frameworks, etc. Also mention difference from data parallelism, pipeline parallelism. Provide example: splitting a weight matrix across GPUs, each GPU holds a slice, compute partial results, then gather. Provide mention of communication overhead, scaling, etc. Also mention that it's used in large models like GPT-3, Megatron-LM, DeepSpeed. Provide references. Also mention that it's also called model parallelism. Provide explanation of how it works in practice: e.g., for a linear layer, weight matrix W of shape (out_features, in_features). In tensor parallelism, split W along out_features dimension across GPUs. Each GPU computes partial output. Then gather outputs. Provide details on how to handle bias, etc. Provide mention of "tensor model parallelism" vs "tensor parallelism" synonyms. Provide mention of "tensor parallelism" in Megatron-LM: splitting weight matrices across GPUs. Provide mention of "tensor parallelism" in DeepSpeed: "ZeRO-Offload" etc. Provide mention
|
| 4053 |
-
Generation took 31.
|
| 4054 |
</div>
|
| 4055 |
<div class="uv-install-logs" id="uv-logs-setup2">
|
| 4056 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
|
@@ -4059,32 +4059,32 @@ Downloading cpython-3.13.7-linux-x86_64-gnu (download) (32.0MiB)
|
|
| 4059 |
Downloading cpython-3.13.7-linux-x86_64-gnu (download)
|
| 4060 |
Updating https://github.com/huggingface/transformers.git (HEAD)
|
| 4061 |
Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
|
| 4062 |
-
Downloading
|
| 4063 |
-
Downloading jedi (1.5MiB)
|
| 4064 |
Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
|
| 4065 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4066 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4067 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4068 |
Downloading hf-xet (3.0MiB)
|
| 4069 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4070 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4071 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4072 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
|
|
|
|
|
|
| 4073 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4074 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4075 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4076 |
Downloading matplotlib (8.3MiB)
|
| 4077 |
-
Downloading pygments (1.2MiB)
|
| 4078 |
-
Downloading networkx (1.9MiB)
|
| 4079 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4080 |
-
Downloading tokenizers (3.1MiB)
|
| 4081 |
-
Downloading pillow (6.3MiB)
|
| 4082 |
-
Downloading torch (846.8MiB)
|
| 4083 |
-
Downloading sympy (6.0MiB)
|
| 4084 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4085 |
Downloading fonttools (4.7MiB)
|
|
|
|
| 4086 |
Downloading kiwisolver (1.4MiB)
|
| 4087 |
-
Downloading
|
| 4088 |
Downloading nvidia-cufile-cu12
|
| 4089 |
Downloading kiwisolver
|
| 4090 |
Downloading pygments
|
|
@@ -4105,38 +4105,38 @@ Downloading triton (148.4MiB)
|
|
| 4105 |
Downloading triton
|
| 4106 |
Downloading nvidia-cufft-cu12
|
| 4107 |
Downloading nvidia-cusolver-cu12
|
| 4108 |
-
Downloading nvidia-cusparselt-cu12
|
| 4109 |
Downloading nvidia-cusparse-cu12
|
|
|
|
| 4110 |
Downloading nvidia-nccl-cu12
|
| 4111 |
Downloading nvidia-cublas-cu12
|
| 4112 |
Downloading nvidia-cudnn-cu12
|
| 4113 |
Downloading torch
|
| 4114 |
-
Installed 69 packages in
|
| 4115 |
</div>
|
| 4116 |
</div>
|
| 4117 |
<div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4118 |
-
Fetching 3 files: 33%|███▎ | 1/3 [00:06<00:
|
| 4119 |
-
Fetching 3 files: 67%|██████▋ | 2/3 [00:
|
| 4120 |
-
Fetching 3 files: 100%|██████████| 3/3 [00:
|
| 4121 |
You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
|
| 4122 |
|
| 4123 |
Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4124 |
-
Loading checkpoint shards: 33%|███▎ | 1/3 [00:02<00:04, 2.
|
| 4125 |
Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04<00:02, 2.25s/it]
|
| 4126 |
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.80s/it]
|
| 4127 |
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.93s/it]
|
| 4128 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 4129 |
|
| 4130 |
Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s]
|
| 4131 |
-
Fetching 66 files: 2%|▏ | 1/66 [00:00<00:
|
| 4132 |
-
Fetching 66 files:
|
| 4133 |
-
Fetching 66 files:
|
| 4134 |
-
Fetching 66 files:
|
| 4135 |
-
Fetching 66 files:
|
| 4136 |
-
Fetching 66 files:
|
| 4137 |
-
Fetching 66 files:
|
| 4138 |
-
Fetching 66 files: 100%|██████████| 66/66 [00:01<00:00,
|
| 4139 |
-
/tmp/uvnote-run-
|
| 4140 |
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 4141 |
warnings.warn(
|
| 4142 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
|
@@ -4163,7 +4163,7 @@ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks
|
|
| 4163 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 4164 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 4165 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 4166 |
-
/tmp/uvnote-run-
|
| 4167 |
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 4168 |
warnings.warn(
|
| 4169 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
|
@@ -4200,7 +4200,7 @@ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks
|
|
| 4200 |
<span onclick="toggleOutput('setup')" style="cursor: pointer;">▼ output</span>
|
| 4201 |
<span id="uv-indicator-setup" onclick="toggleUvLogsFromHeader('setup')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4202 |
</span> |
|
| 4203 |
-
Cell: setup |
|
| 4204 |
| <button class="run-btn" onclick="runCell('setup')">▶ run</button>
|
| 4205 |
<button class="copy-btn" onclick="copyCell('setup')">Copy</button>
|
| 4206 |
<a href="cells/setup.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4459,8 +4459,12 @@ Reasoning: low
|
|
| 4459 |
|
| 4460 |
What is Tensor Parallelism?
|
| 4461 |
|
| 4462 |
-
<|end|><|start|>assistant<|channel|>analysis<|message|>We need to explain what Tensor Parallelism is. It's a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices
|
| 4463 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4464 |
</div>
|
| 4465 |
<div class="uv-install-logs" id="uv-logs-setup">
|
| 4466 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
|
@@ -4469,37 +4473,37 @@ Downloading cpython-3.13.7-linux-x86_64-gnu (download) (32.0MiB)
|
|
| 4469 |
Downloading cpython-3.13.7-linux-x86_64-gnu (download)
|
| 4470 |
Updating https://github.com/huggingface/transformers.git (HEAD)
|
| 4471 |
Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
|
| 4472 |
-
Downloading
|
|
|
|
| 4473 |
Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
|
| 4474 |
-
Downloading
|
| 4475 |
-
Downloading nvidia-
|
|
|
|
|
|
|
| 4476 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4477 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4478 |
Downloading pillow (6.3MiB)
|
| 4479 |
-
Downloading nvidia-
|
| 4480 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4481 |
-
Downloading numpy (15.9MiB)
|
| 4482 |
Downloading fonttools (4.7MiB)
|
| 4483 |
-
Downloading tokenizers (3.1MiB)
|
| 4484 |
-
Downloading torch (846.8MiB)
|
| 4485 |
Downloading hf-xet (3.0MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4486 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4487 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4488 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4489 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4490 |
Downloading triton (148.4MiB)
|
| 4491 |
-
Downloading
|
| 4492 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4493 |
-
Downloading pygments (1.2MiB)
|
| 4494 |
-
Downloading matplotlib (8.3MiB)
|
| 4495 |
-
Downloading sympy (6.0MiB)
|
| 4496 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4497 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4498 |
Downloading nvidia-cufile-cu12
|
| 4499 |
Downloading kiwisolver
|
| 4500 |
Downloading pygments
|
| 4501 |
-
Downloading hf-xet
|
| 4502 |
Downloading tokenizers
|
|
|
|
| 4503 |
Downloading networkx
|
| 4504 |
Downloading fonttools
|
| 4505 |
Downloading pillow
|
|
@@ -4515,33 +4519,33 @@ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
| 4515 |
Downloading triton
|
| 4516 |
Downloading nvidia-cufft-cu12
|
| 4517 |
Downloading nvidia-cusolver-cu12
|
| 4518 |
-
Downloading nvidia-cusparselt-cu12
|
| 4519 |
Downloading nvidia-cusparse-cu12
|
|
|
|
| 4520 |
Downloading nvidia-nccl-cu12
|
| 4521 |
Downloading nvidia-cublas-cu12
|
| 4522 |
Downloading nvidia-cudnn-cu12
|
| 4523 |
Downloading torch
|
| 4524 |
-
Installed 69 packages in
|
| 4525 |
</div>
|
| 4526 |
</div>
|
| 4527 |
<div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4528 |
-
Fetching 3 files: 33%|███▎ | 1/3 [00:
|
| 4529 |
-
Fetching 3 files: 67%|██████▋ | 2/3 [00:08<00:03, 3.
|
| 4530 |
-
Fetching 3 files: 100%|██████████| 3/3 [00:08<00:00, 2.
|
| 4531 |
You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
|
| 4532 |
|
| 4533 |
Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4534 |
-
Loading checkpoint shards: 33%|███▎ | 1/3 [00:02<00:04, 2.
|
| 4535 |
-
Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04<00:02, 2.
|
| 4536 |
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.80s/it]
|
| 4537 |
-
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.
|
| 4538 |
INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
|
| 4539 |
|
| 4540 |
Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s]
|
| 4541 |
-
Fetching 6 files: 17%|█▋ | 1/6 [00:00<00:01,
|
| 4542 |
-
Fetching 6 files:
|
| 4543 |
-
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00,
|
| 4544 |
-
/tmp/uvnote-run-
|
| 4545 |
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 4546 |
warnings.warn(
|
| 4547 |
INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
|
|
@@ -4568,7 +4572,7 @@ INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for laye
|
|
| 4568 |
INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
|
| 4569 |
INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
|
| 4570 |
INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
|
| 4571 |
-
/tmp/uvnote-run-
|
| 4572 |
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 4573 |
warnings.warn(
|
| 4574 |
INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
|
|
|
|
| 3722 |
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3723 |
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3724 |
</span> |
|
| 3725 |
+
Cell: nv | 0.53s
|
| 3726 |
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3727 |
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3728 |
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3745 |
</div>
|
| 3746 |
</div>
|
| 3747 |
<div id="output-nv" class="cell-output">
|
| 3748 |
+
<div class="cell-stdout">Wed Sep 24 21:05:30 2025
|
| 3749 |
+-----------------------------------------------------------------------------------------+
|
| 3750 |
| NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
|
| 3751 |
|-----------------------------------------+------------------------+----------------------+
|
|
|
|
| 3754 |
| | | MIG M. |
|
| 3755 |
|=========================================+========================+======================|
|
| 3756 |
| 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
|
| 3757 |
+
| 0% 38C P0 46W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3758 |
| | | N/A |
|
| 3759 |
+-----------------------------------------+------------------------+----------------------+
|
| 3760 |
| 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
|
| 3761 |
+
| 0% 37C P0 45W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3762 |
| | | N/A |
|
| 3763 |
+-----------------------------------------+------------------------+----------------------+
|
| 3764 |
| 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
|
| 3765 |
+
| 0% 39C P0 47W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3766 |
| | | N/A |
|
| 3767 |
+-----------------------------------------+------------------------+----------------------+
|
| 3768 |
| 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
|
| 3769 |
+
| 0% 38C P0 46W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3770 |
| | | N/A |
|
| 3771 |
+-----------------------------------------+------------------------+----------------------+
|
| 3772 |
|
|
|
|
| 3792 |
<span onclick="toggleOutput('setup2')" style="cursor: pointer;">▼ output</span>
|
| 3793 |
<span id="uv-indicator-setup2" onclick="toggleUvLogsFromHeader('setup2')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3794 |
</span> |
|
| 3795 |
+
Cell: setup2 | 113.64s
|
| 3796 |
| <button class="run-btn" onclick="runCell('setup2')">▶ run</button>
|
| 3797 |
<button class="copy-btn" onclick="copyCell('setup2')">Copy</button>
|
| 3798 |
<a href="cells/setup2.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4050 |
What is Tensor Parallelism?
|
| 4051 |
|
| 4052 |
<|end|><|start|>assistant<|channel|>analysis<|message|>We need to explain what Tensor Parallelism is. It's a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices. Provide details: how it works, benefits, challenges, typical frameworks, etc. Also mention difference from data parallelism, pipeline parallelism. Provide example: splitting a weight matrix across GPUs, each GPU holds a slice, compute partial results, then gather. Provide mention of communication overhead, scaling, etc. Also mention that it's used in large models like GPT-3, Megatron-LM, DeepSpeed. Provide references. Also mention that it's also called model parallelism. Provide explanation of how it works in practice: e.g., for a linear layer, weight matrix W of shape (out_features, in_features). In tensor parallelism, split W along out_features dimension across GPUs. Each GPU computes partial output. Then gather outputs. Provide details on how to handle bias, etc. Provide mention of "tensor model parallelism" vs "tensor parallelism" synonyms. Provide mention of "tensor parallelism" in Megatron-LM: splitting weight matrices across GPUs. Provide mention of "tensor parallelism" in DeepSpeed: "ZeRO-Offload" etc. Provide mention
|
| 4053 |
+
Generation took 31.35 seconds
|
| 4054 |
</div>
|
| 4055 |
<div class="uv-install-logs" id="uv-logs-setup2">
|
| 4056 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
|
|
|
| 4059 |
Downloading cpython-3.13.7-linux-x86_64-gnu (download)
|
| 4060 |
Updating https://github.com/huggingface/transformers.git (HEAD)
|
| 4061 |
Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
|
| 4062 |
+
Downloading sympy (6.0MiB)
|
|
|
|
| 4063 |
Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
|
|
|
|
|
|
|
|
|
|
| 4064 |
Downloading hf-xet (3.0MiB)
|
| 4065 |
+
Downloading pillow (6.3MiB)
|
| 4066 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4067 |
+
Downloading networkx (1.9MiB)
|
| 4068 |
+
Downloading pygments (1.2MiB)
|
| 4069 |
+
Downloading tokenizers (3.1MiB)
|
| 4070 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4071 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4072 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4073 |
+
Downloading jedi (1.5MiB)
|
| 4074 |
+
Downloading numpy (15.9MiB)
|
| 4075 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4076 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4077 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4078 |
+
Downloading triton (148.4MiB)
|
| 4079 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4080 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4081 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4082 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
|
|
|
| 4083 |
Downloading matplotlib (8.3MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4084 |
Downloading fonttools (4.7MiB)
|
| 4085 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4086 |
Downloading kiwisolver (1.4MiB)
|
| 4087 |
+
Downloading torch (846.8MiB)
|
| 4088 |
Downloading nvidia-cufile-cu12
|
| 4089 |
Downloading kiwisolver
|
| 4090 |
Downloading pygments
|
|
|
|
| 4105 |
Downloading triton
|
| 4106 |
Downloading nvidia-cufft-cu12
|
| 4107 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 4108 |
Downloading nvidia-cusparse-cu12
|
| 4109 |
+
Downloading nvidia-cusparselt-cu12
|
| 4110 |
Downloading nvidia-nccl-cu12
|
| 4111 |
Downloading nvidia-cublas-cu12
|
| 4112 |
Downloading nvidia-cudnn-cu12
|
| 4113 |
Downloading torch
|
| 4114 |
+
Installed 69 packages in 550ms
|
| 4115 |
</div>
|
| 4116 |
</div>
|
| 4117 |
<div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4118 |
+
Fetching 3 files: 33%|███▎ | 1/3 [00:06<00:12, 6.47s/it]
|
| 4119 |
+
Fetching 3 files: 67%|██████▋ | 2/3 [00:07<00:03, 3.37s/it]
|
| 4120 |
+
Fetching 3 files: 100%|██████████| 3/3 [00:07<00:00, 2.56s/it]
|
| 4121 |
You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
|
| 4122 |
|
| 4123 |
Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4124 |
+
Loading checkpoint shards: 33%|███▎ | 1/3 [00:02<00:04, 2.34s/it]
|
| 4125 |
Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04<00:02, 2.25s/it]
|
| 4126 |
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.80s/it]
|
| 4127 |
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.93s/it]
|
| 4128 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 4129 |
|
| 4130 |
Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s]
|
| 4131 |
+
Fetching 66 files: 2%|▏ | 1/66 [00:00<00:16, 3.87it/s]
|
| 4132 |
+
Fetching 66 files: 14%|█▎ | 9/66 [00:00<00:03, 18.15it/s]
|
| 4133 |
+
Fetching 66 files: 26%|██▌ | 17/66 [00:00<00:02, 24.03it/s]
|
| 4134 |
+
Fetching 66 files: 56%|█████▌ | 37/66 [00:00<00:00, 58.06it/s]
|
| 4135 |
+
Fetching 66 files: 71%|███████ | 47/66 [00:01<00:00, 37.14it/s]
|
| 4136 |
+
Fetching 66 files: 85%|████████▍ | 56/66 [00:01<00:00, 39.66it/s]
|
| 4137 |
+
Fetching 66 files: 98%|█████████▊| 65/66 [00:01<00:00, 42.21it/s]
|
| 4138 |
+
Fetching 66 files: 100%|██████████| 66/66 [00:01<00:00, 37.62it/s]
|
| 4139 |
+
/tmp/uvnote-run-e6cle3et/home/.cache/uv/environments-v2/setup2-adf2810b697d7b08/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
|
| 4140 |
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 4141 |
warnings.warn(
|
| 4142 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
|
|
|
| 4163 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 4164 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 4165 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
| 4166 |
+
/tmp/uvnote-run-e6cle3et/home/.cache/uv/environments-v2/setup2-adf2810b697d7b08/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
|
| 4167 |
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 4168 |
warnings.warn(
|
| 4169 |
INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
|
|
|
|
| 4200 |
<span onclick="toggleOutput('setup')" style="cursor: pointer;">▼ output</span>
|
| 4201 |
<span id="uv-indicator-setup" onclick="toggleUvLogsFromHeader('setup')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4202 |
</span> |
|
| 4203 |
+
Cell: setup | 108.22s
|
| 4204 |
| <button class="run-btn" onclick="runCell('setup')">▶ run</button>
|
| 4205 |
<button class="copy-btn" onclick="copyCell('setup')">Copy</button>
|
| 4206 |
<a href="cells/setup.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4459 |
|
| 4460 |
What is Tensor Parallelism?
|
| 4461 |
|
| 4462 |
+
<|end|><|start|>assistant<|channel|>analysis<|message|>We need to explain what Tensor Parallelism is. It's a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices. Provide details: how it works, benefits, challenges, typical frameworks, etc. Also mention difference from data parallelism, pipeline parallelism. Provide example: splitting a weight matrix across GPUs, each GPU holds a slice, compute partial results, then gather. Provide mention of Megatron-LM, DeepSpeed, etc. Also mention that it's used for very large models that don't fit in a single GPU. Provide explanation of how it reduces memory usage and increases throughput. Also mention that it can be combined with other parallelism strategies. Provide mention of communication overhead, scaling, etc. Provide references. Provide code snippet maybe. Provide final answer.<|end|><|start|>assistant<|channel|>final<|message|>## Tensor Parallelism – A Quick Overview
|
| 4463 |
+
|
| 4464 |
+
| **Aspect** | **What it is** | **Why it matters** |
|
| 4465 |
+
|------------|----------------|--------------------|
|
| 4466 |
+
| **Definition** | Splitting a *single* weight tensor (e.g., a large matrix in a transformer layer) across multiple devices so that each device holds only a *portion* of the tensor. | Allows training of models that are
|
| 4467 |
+
Generation took 26.28 seconds
|
| 4468 |
</div>
|
| 4469 |
<div class="uv-install-logs" id="uv-logs-setup">
|
| 4470 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
|
|
|
| 4473 |
Downloading cpython-3.13.7-linux-x86_64-gnu (download)
|
| 4474 |
Updating https://github.com/huggingface/transformers.git (HEAD)
|
| 4475 |
Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
|
| 4476 |
+
Downloading tokenizers (3.1MiB)
|
| 4477 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4478 |
Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
|
| 4479 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4480 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4481 |
+
Downloading pygments (1.2MiB)
|
| 4482 |
+
Downloading sympy (6.0MiB)
|
| 4483 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4484 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4485 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4486 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4487 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4488 |
+
Downloading networkx (1.9MiB)
|
| 4489 |
+
Downloading kiwisolver (1.4MiB)
|
| 4490 |
Downloading pillow (6.3MiB)
|
| 4491 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
|
|
|
|
|
|
| 4492 |
Downloading fonttools (4.7MiB)
|
|
|
|
|
|
|
| 4493 |
Downloading hf-xet (3.0MiB)
|
| 4494 |
+
Downloading numpy (15.9MiB)
|
| 4495 |
+
Downloading matplotlib (8.3MiB)
|
| 4496 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4497 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4498 |
+
Downloading jedi (1.5MiB)
|
| 4499 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
|
|
|
|
|
|
|
|
|
| 4500 |
Downloading triton (148.4MiB)
|
| 4501 |
+
Downloading torch (846.8MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4502 |
Downloading nvidia-cufile-cu12
|
| 4503 |
Downloading kiwisolver
|
| 4504 |
Downloading pygments
|
|
|
|
| 4505 |
Downloading tokenizers
|
| 4506 |
+
Downloading hf-xet
|
| 4507 |
Downloading networkx
|
| 4508 |
Downloading fonttools
|
| 4509 |
Downloading pillow
|
|
|
|
| 4519 |
Downloading triton
|
| 4520 |
Downloading nvidia-cufft-cu12
|
| 4521 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 4522 |
Downloading nvidia-cusparse-cu12
|
| 4523 |
+
Downloading nvidia-cusparselt-cu12
|
| 4524 |
Downloading nvidia-nccl-cu12
|
| 4525 |
Downloading nvidia-cublas-cu12
|
| 4526 |
Downloading nvidia-cudnn-cu12
|
| 4527 |
Downloading torch
|
| 4528 |
+
Installed 69 packages in 462ms
|
| 4529 |
</div>
|
| 4530 |
</div>
|
| 4531 |
<div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4532 |
+
Fetching 3 files: 33%|███▎ | 1/3 [00:07<00:14, 7.36s/it]
|
| 4533 |
+
Fetching 3 files: 67%|██████▋ | 2/3 [00:08<00:03, 3.69s/it]
|
| 4534 |
+
Fetching 3 files: 100%|██████████| 3/3 [00:08<00:00, 2.83s/it]
|
| 4535 |
You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
|
| 4536 |
|
| 4537 |
Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
|
| 4538 |
+
Loading checkpoint shards: 33%|███▎ | 1/3 [00:02<00:04, 2.36s/it]
|
| 4539 |
+
Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04<00:02, 2.26s/it]
|
| 4540 |
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.80s/it]
|
| 4541 |
+
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00, 1.94s/it]
|
| 4542 |
INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
|
| 4543 |
|
| 4544 |
Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s]
|
| 4545 |
+
Fetching 6 files: 17%|█▋ | 1/6 [00:00<00:01, 2.82it/s]
|
| 4546 |
+
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 11.61it/s]
|
| 4547 |
+
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 10.04it/s]
|
| 4548 |
+
/tmp/uvnote-run-ga2bg_po/home/.cache/uv/environments-v2/setup-1400c3ff0fc01263/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
|
| 4549 |
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 4550 |
warnings.warn(
|
| 4551 |
INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
|
|
|
|
| 4572 |
INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
|
| 4573 |
INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
|
| 4574 |
INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
|
| 4575 |
+
/tmp/uvnote-run-ga2bg_po/home/.cache/uv/environments-v2/setup-1400c3ff0fc01263/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
|
| 4576 |
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
|
| 4577 |
warnings.warn(
|
| 4578 |
INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
|
megablocks_yamoe/torch_profile.html
CHANGED
|
@@ -3720,7 +3720,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3720 |
<span onclick="toggleOutput('utils')" style="cursor: pointer;">▼ output</span>
|
| 3721 |
<span id="uv-indicator-utils" onclick="toggleUvLogsFromHeader('utils')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3722 |
</span> |
|
| 3723 |
-
Cell: utils | deps: torch, numpy | 34.
|
| 3724 |
| <button class="run-btn" onclick="runCell('utils')">▶ run</button>
|
| 3725 |
<button class="copy-btn" onclick="copyCell('utils')">Copy</button>
|
| 3726 |
<a href="cells/utils.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3794,24 +3794,24 @@ Cell: utils | deps: torch, numpy | 34.17s
|
|
| 3794 |
<div class="uv-install-logs" id="uv-logs-utils">
|
| 3795 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3796 |
<div class="uv-logs-content" style="display: none;">
|
| 3797 |
-
Downloading sympy (6.0MiB)
|
| 3798 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3799 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3800 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3801 |
Downloading setuptools (1.1MiB)
|
| 3802 |
-
Downloading nvidia-
|
| 3803 |
-
Downloading
|
| 3804 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
| 3805 |
Downloading numpy (16.2MiB)
|
| 3806 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3807 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3808 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3809 |
-
Downloading nvidia-
|
| 3810 |
-
Downloading torch (846.9MiB)
|
| 3811 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3812 |
-
Downloading nvidia-
|
|
|
|
|
|
|
| 3813 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
| 3814 |
Downloading triton (148.3MiB)
|
|
|
|
|
|
|
| 3815 |
Downloading nvidia-cufile-cu12
|
| 3816 |
Downloading setuptools
|
| 3817 |
Downloading networkx
|
|
@@ -3824,13 +3824,13 @@ Downloading triton (148.3MiB)
|
|
| 3824 |
Downloading triton
|
| 3825 |
Downloading nvidia-cufft-cu12
|
| 3826 |
Downloading nvidia-cusolver-cu12
|
| 3827 |
-
Downloading nvidia-cusparse-cu12
|
| 3828 |
Downloading nvidia-cusparselt-cu12
|
|
|
|
| 3829 |
Downloading nvidia-nccl-cu12
|
| 3830 |
Downloading nvidia-cublas-cu12
|
| 3831 |
Downloading nvidia-cudnn-cu12
|
| 3832 |
Downloading torch
|
| 3833 |
-
Installed 26 packages in
|
| 3834 |
</div>
|
| 3835 |
</div>
|
| 3836 |
</div>
|
|
@@ -3843,7 +3843,7 @@ Installed 26 packages in 465ms
|
|
| 3843 |
<span onclick="toggleOutput('bench_utils')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
<span id="uv-indicator-bench_utils" onclick="toggleUvLogsFromHeader('bench_utils')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3845 |
</span> |
|
| 3846 |
-
Cell: bench_utils | deps: torch, numpy |
|
| 3847 |
| <button class="run-btn" onclick="runCell('bench_utils')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('bench_utils')">Copy</button>
|
| 3849 |
<a href="cells/bench_utils.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4331,24 +4331,24 @@ Cell: bench_utils | deps: torch, numpy | 34.13s
|
|
| 4331 |
<div class="uv-install-logs" id="uv-logs-bench_utils">
|
| 4332 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4333 |
<div class="uv-logs-content" style="display: none;">
|
| 4334 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4335 |
Downloading setuptools (1.1MiB)
|
| 4336 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4337 |
-
Downloading triton (148.3MiB)
|
| 4338 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4339 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4340 |
-
Downloading numpy (16.2MiB)
|
| 4341 |
-
Downloading networkx (1.9MiB)
|
| 4342 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4343 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4344 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4345 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4346 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4347 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4348 |
-
Downloading torch (846.9MiB)
|
| 4349 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4350 |
-
Downloading nvidia-
|
| 4351 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
|
|
| 4352 |
Downloading nvidia-cufile-cu12
|
| 4353 |
Downloading setuptools
|
| 4354 |
Downloading networkx
|
|
@@ -4361,8 +4361,8 @@ Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
| 4361 |
Downloading triton
|
| 4362 |
Downloading nvidia-cufft-cu12
|
| 4363 |
Downloading nvidia-cusolver-cu12
|
| 4364 |
-
Downloading nvidia-cusparse-cu12
|
| 4365 |
Downloading nvidia-cusparselt-cu12
|
|
|
|
| 4366 |
Downloading nvidia-nccl-cu12
|
| 4367 |
Downloading nvidia-cublas-cu12
|
| 4368 |
Downloading nvidia-cudnn-cu12
|
|
@@ -4381,7 +4381,7 @@ Installed 26 packages in 445ms
|
|
| 4381 |
<span onclick="toggleOutput('config')" style="cursor: pointer;">▼ output</span>
|
| 4382 |
<span id="uv-indicator-config" onclick="toggleUvLogsFromHeader('config')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4383 |
</span> |
|
| 4384 |
-
Cell: config | deps: torch, numpy |
|
| 4385 |
| <button class="run-btn" onclick="runCell('config')">▶ run</button>
|
| 4386 |
<button class="copy-btn" onclick="copyCell('config')">Copy</button>
|
| 4387 |
<a href="cells/config.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4442,23 +4442,23 @@ Cell: config | deps: torch, numpy | 35.83s
|
|
| 4442 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4443 |
<div class="uv-logs-content" style="display: none;">
|
| 4444 |
Downloading sympy (6.0MiB)
|
| 4445 |
-
Downloading networkx (1.9MiB)
|
| 4446 |
-
Downloading setuptools (1.1MiB)
|
| 4447 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4448 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4449 |
-
Downloading torch (846.9MiB)
|
| 4450 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4451 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4452 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
|
|
|
| 4453 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4454 |
-
Downloading
|
| 4455 |
-
Downloading
|
| 4456 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4457 |
-
Downloading numpy (16.2MiB)
|
| 4458 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
|
|
|
|
|
|
|
|
|
| 4459 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
|
|
|
|
|
|
| 4460 |
Downloading triton (148.3MiB)
|
| 4461 |
-
Downloading nvidia-
|
| 4462 |
Downloading nvidia-cufile-cu12
|
| 4463 |
Downloading setuptools
|
| 4464 |
Downloading networkx
|
|
@@ -4471,13 +4471,13 @@ Downloading nvidia-curand-cu12 (60.7MiB)
|
|
| 4471 |
Downloading triton
|
| 4472 |
Downloading nvidia-cufft-cu12
|
| 4473 |
Downloading nvidia-cusolver-cu12
|
| 4474 |
-
Downloading nvidia-cusparselt-cu12
|
| 4475 |
Downloading nvidia-cusparse-cu12
|
|
|
|
| 4476 |
Downloading nvidia-nccl-cu12
|
| 4477 |
Downloading nvidia-cublas-cu12
|
| 4478 |
Downloading nvidia-cudnn-cu12
|
| 4479 |
Downloading torch
|
| 4480 |
-
Installed 26 packages in
|
| 4481 |
</div>
|
| 4482 |
</div>
|
| 4483 |
</div>
|
|
@@ -4490,7 +4490,7 @@ Installed 26 packages in 564ms
|
|
| 4490 |
<span onclick="toggleOutput('save_data')" style="cursor: pointer;">▼ output</span>
|
| 4491 |
<span id="uv-indicator-save_data" onclick="toggleUvLogsFromHeader('save_data')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4492 |
</span> |
|
| 4493 |
-
Cell: save_data | deps: torch, numpy | 39.
|
| 4494 |
| <button class="run-btn" onclick="runCell('save_data')">▶ run</button>
|
| 4495 |
<button class="copy-btn" onclick="copyCell('save_data')">Copy</button>
|
| 4496 |
<a href="cells/save_data.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4585,24 +4585,24 @@ Down sum: 206.729263
|
|
| 4585 |
<div class="uv-install-logs" id="uv-logs-save_data">
|
| 4586 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4587 |
<div class="uv-logs-content" style="display: none;">
|
| 4588 |
-
Downloading
|
| 4589 |
-
Downloading setuptools (1.1MiB)
|
| 4590 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4591 |
-
Downloading sympy (6.0MiB)
|
| 4592 |
-
Downloading numpy (16.2MiB)
|
| 4593 |
-
Downloading torch (846.9MiB)
|
| 4594 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4595 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4596 |
-
Downloading
|
|
|
|
| 4597 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4598 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4599 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4600 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4601 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4602 |
-
Downloading triton (148.3MiB)
|
| 4603 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4604 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4605 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4606 |
Downloading nvidia-cufile-cu12
|
| 4607 |
Downloading setuptools
|
| 4608 |
Downloading networkx
|
|
@@ -4621,16 +4621,16 @@ Downloading nvidia-cusparse-cu12 (274.9MiB)
|
|
| 4621 |
Downloading nvidia-cublas-cu12
|
| 4622 |
Downloading nvidia-cudnn-cu12
|
| 4623 |
Downloading torch
|
| 4624 |
-
Installed 26 packages in
|
| 4625 |
</div>
|
| 4626 |
</div>
|
| 4627 |
<div class="cell-artifacts">
|
| 4628 |
<h4>Artifacts:</h4>
|
| 4629 |
-
<a href="artifacts/save_data/router_bias.pt" class="artifact" target="_blank">router_bias.pt</a>
|
| 4630 |
-
<a href="artifacts/save_data/router_weight.pt" class="artifact" target="_blank">router_weight.pt</a>
|
| 4631 |
-
<a href="artifacts/save_data/down_proj_bias.pt" class="artifact" target="_blank">down_proj_bias.pt</a>
|
| 4632 |
<a href="artifacts/save_data/gate_up_proj_bias.pt" class="artifact" target="_blank">gate_up_proj_bias.pt</a>
|
|
|
|
| 4633 |
<a href="artifacts/save_data/down_proj.pt" class="artifact" target="_blank">down_proj.pt</a>
|
|
|
|
|
|
|
| 4634 |
<a href="artifacts/save_data/gate_up_proj.pt" class="artifact" target="_blank">gate_up_proj.pt</a>
|
| 4635 |
</div>
|
| 4636 |
</div>
|
|
@@ -4645,7 +4645,7 @@ Installed 26 packages in 447ms
|
|
| 4645 |
<span onclick="toggleOutput('yamoe_run')" style="cursor: pointer;">▼ output</span>
|
| 4646 |
<span id="uv-indicator-yamoe_run" onclick="toggleUvLogsFromHeader('yamoe_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4647 |
</span> |
|
| 4648 |
-
Cell: yamoe_run | deps: torch, kernels, numpy |
|
| 4649 |
| <button class="run-btn" onclick="runCell('yamoe_run')">▶ run</button>
|
| 4650 |
<button class="copy-btn" onclick="copyCell('yamoe_run')">Copy</button>
|
| 4651 |
<a href="cells/yamoe_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4938,9 +4938,9 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 4938 |
|
| 4939 |
Warming up (10 iterations)...
|
| 4940 |
Benchmarking (50 iterations)...
|
| 4941 |
-
Progress: 20% complete (avg: 4.
|
| 4942 |
-
Progress: 40% complete (avg: 4.
|
| 4943 |
-
Progress: 60% complete (avg: 4.
|
| 4944 |
Progress: 80% complete (avg: 4.249 ms)
|
| 4945 |
|
| 4946 |
Output tensors:
|
|
@@ -4951,19 +4951,19 @@ Output tensors:
|
|
| 4951 |
Iterations: 50
|
| 4952 |
|
| 4953 |
Latency Statistics:
|
| 4954 |
-
Average: 4.
|
| 4955 |
-
Min: 4.
|
| 4956 |
-
Max: 4.
|
| 4957 |
-
Std Dev: 0.
|
| 4958 |
|
| 4959 |
Percentiles:
|
| 4960 |
-
P50 (median): 4.
|
| 4961 |
-
P95: 4.
|
| 4962 |
-
P99: 4.
|
| 4963 |
|
| 4964 |
Throughput:
|
| 4965 |
-
Tokens/sec:
|
| 4966 |
-
Std Dev:
|
| 4967 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 4968 |
|
| 4969 |
Saved benchmark results to yamoe_results.json
|
|
@@ -4973,25 +4973,25 @@ Output sum: 3.971905
|
|
| 4973 |
<div class="uv-install-logs" id="uv-logs-yamoe_run">
|
| 4974 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4975 |
<div class="uv-logs-content" style="display: none;">
|
| 4976 |
-
Downloading
|
|
|
|
| 4977 |
Downloading networkx (1.9MiB)
|
| 4978 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4979 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
|
|
| 4980 |
Downloading torch (846.9MiB)
|
|
|
|
|
|
|
| 4981 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4982 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4983 |
-
Downloading setuptools (1.1MiB)
|
| 4984 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4985 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4986 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4987 |
-
Downloading hf-xet (3.0MiB)
|
| 4988 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4989 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4990 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
|
|
|
| 4991 |
Downloading triton (148.3MiB)
|
| 4992 |
-
Downloading
|
| 4993 |
-
Downloading nvidia-
|
| 4994 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4995 |
Downloading nvidia-cufile-cu12
|
| 4996 |
Downloading hf-xet
|
| 4997 |
Downloading setuptools
|
|
@@ -5011,14 +5011,13 @@ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
|
| 5011 |
Downloading nvidia-cublas-cu12
|
| 5012 |
Downloading nvidia-cudnn-cu12
|
| 5013 |
Downloading torch
|
| 5014 |
-
Installed 37 packages in
|
| 5015 |
</div>
|
| 5016 |
</div>
|
| 5017 |
<div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s]
|
| 5018 |
-
Fetching 6 files: 17%|█▋ | 1/6 [00:00<00:
|
| 5019 |
-
Fetching 6 files:
|
| 5020 |
-
Fetching 6 files:
|
| 5021 |
-
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 8.45it/s]</div>
|
| 5022 |
<div class="cell-artifacts">
|
| 5023 |
<h4>Artifacts:</h4>
|
| 5024 |
<a href="artifacts/yamoe_run/yamoe_results.json" class="artifact" target="_blank">yamoe_results.json</a>
|
|
@@ -5035,7 +5034,7 @@ Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 8.4
|
|
| 5035 |
<span onclick="toggleOutput('binned_run')" style="cursor: pointer;">▼ output</span>
|
| 5036 |
<span id="uv-indicator-binned_run" onclick="toggleUvLogsFromHeader('binned_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5037 |
</span> |
|
| 5038 |
-
Cell: binned_run | deps: torch, numpy | 39.
|
| 5039 |
| <button class="run-btn" onclick="runCell('binned_run')">▶ run</button>
|
| 5040 |
<button class="copy-btn" onclick="copyCell('binned_run')">Copy</button>
|
| 5041 |
<a href="cells/binned_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -5449,10 +5448,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 5449 |
|
| 5450 |
Warming up (10 iterations)...
|
| 5451 |
Benchmarking (50 iterations)...
|
| 5452 |
-
Progress: 20% complete (avg:
|
| 5453 |
-
Progress: 40% complete (avg:
|
| 5454 |
-
Progress: 60% complete (avg:
|
| 5455 |
-
Progress: 80% complete (avg: 36.
|
| 5456 |
|
| 5457 |
Output tensors:
|
| 5458 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
|
|
@@ -5462,19 +5461,19 @@ Output tensors:
|
|
| 5462 |
Iterations: 50
|
| 5463 |
|
| 5464 |
Latency Statistics:
|
| 5465 |
-
Average:
|
| 5466 |
-
Min:
|
| 5467 |
-
Max:
|
| 5468 |
-
Std Dev: 1.
|
| 5469 |
|
| 5470 |
Percentiles:
|
| 5471 |
-
P50 (median): 36.
|
| 5472 |
-
P95:
|
| 5473 |
-
P99: 39.
|
| 5474 |
|
| 5475 |
Throughput:
|
| 5476 |
-
Tokens/sec:
|
| 5477 |
-
Std Dev:
|
| 5478 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 5479 |
|
| 5480 |
Saved benchmark results to binned_results.json
|
|
@@ -5484,24 +5483,24 @@ Output sum: 3.971905
|
|
| 5484 |
<div class="uv-install-logs" id="uv-logs-binned_run">
|
| 5485 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 5486 |
<div class="uv-logs-content" style="display: none;">
|
| 5487 |
-
Downloading setuptools (1.1MiB)
|
| 5488 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 5489 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 5490 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 5491 |
Downloading sympy (6.0MiB)
|
| 5492 |
-
Downloading nvidia-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5493 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 5494 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 5495 |
-
Downloading
|
| 5496 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 5497 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 5498 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 5499 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 5500 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 5501 |
-
Downloading
|
| 5502 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
| 5503 |
Downloading torch (846.9MiB)
|
| 5504 |
-
Downloading
|
| 5505 |
Downloading nvidia-cufile-cu12
|
| 5506 |
Downloading setuptools
|
| 5507 |
Downloading networkx
|
|
@@ -5520,7 +5519,7 @@ Downloading numpy (16.2MiB)
|
|
| 5520 |
Downloading nvidia-cublas-cu12
|
| 5521 |
Downloading nvidia-cudnn-cu12
|
| 5522 |
Downloading torch
|
| 5523 |
-
Installed 26 packages in
|
| 5524 |
</div>
|
| 5525 |
</div>
|
| 5526 |
<div class="cell-artifacts">
|
|
@@ -5539,7 +5538,7 @@ Installed 26 packages in 453ms
|
|
| 5539 |
<span onclick="toggleOutput('gptoss_run')" style="cursor: pointer;">▼ output</span>
|
| 5540 |
<span id="uv-indicator-gptoss_run" onclick="toggleUvLogsFromHeader('gptoss_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5541 |
</span> |
|
| 5542 |
-
Cell: gptoss_run | deps: torch, numpy |
|
| 5543 |
| <button class="run-btn" onclick="runCell('gptoss_run')">▶ run</button>
|
| 5544 |
<button class="copy-btn" onclick="copyCell('gptoss_run')">Copy</button>
|
| 5545 |
<a href="cells/gptoss_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -5857,10 +5856,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 5857 |
|
| 5858 |
Warming up (10 iterations)...
|
| 5859 |
Benchmarking (50 iterations)...
|
| 5860 |
-
Progress: 20% complete (avg:
|
| 5861 |
-
Progress: 40% complete (avg:
|
| 5862 |
-
Progress: 60% complete (avg:
|
| 5863 |
-
Progress: 80% complete (avg:
|
| 5864 |
|
| 5865 |
Output tensors:
|
| 5866 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
|
|
@@ -5870,19 +5869,19 @@ Output tensors:
|
|
| 5870 |
Iterations: 50
|
| 5871 |
|
| 5872 |
Latency Statistics:
|
| 5873 |
-
Average:
|
| 5874 |
-
Min: 39.
|
| 5875 |
-
Max:
|
| 5876 |
-
Std Dev:
|
| 5877 |
|
| 5878 |
Percentiles:
|
| 5879 |
-
P50 (median):
|
| 5880 |
-
P95:
|
| 5881 |
-
P99:
|
| 5882 |
|
| 5883 |
Throughput:
|
| 5884 |
-
Tokens/sec:
|
| 5885 |
-
Std Dev:
|
| 5886 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 5887 |
|
| 5888 |
Saved benchmark results to gptoss_results.json
|
|
@@ -5892,24 +5891,24 @@ Output sum: 11.532237
|
|
| 5892 |
<div class="uv-install-logs" id="uv-logs-gptoss_run">
|
| 5893 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 5894 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5895 |
Downloading sympy (6.0MiB)
|
| 5896 |
-
Downloading
|
| 5897 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 5898 |
-
Downloading nvidia-
|
| 5899 |
-
Downloading nvidia-
|
| 5900 |
-
Downloading networkx (1.9MiB)
|
| 5901 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 5902 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 5903 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
|
|
|
|
|
|
| 5904 |
Downloading triton (148.3MiB)
|
| 5905 |
-
Downloading
|
| 5906 |
-
Downloading nvidia-
|
| 5907 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 5908 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 5909 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 5910 |
-
Downloading setuptools (1.1MiB)
|
| 5911 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 5912 |
-
Downloading numpy (16.2MiB)
|
| 5913 |
Downloading nvidia-cufile-cu12
|
| 5914 |
Downloading setuptools
|
| 5915 |
Downloading networkx
|
|
@@ -5928,7 +5927,7 @@ Downloading numpy (16.2MiB)
|
|
| 5928 |
Downloading nvidia-cublas-cu12
|
| 5929 |
Downloading nvidia-cudnn-cu12
|
| 5930 |
Downloading torch
|
| 5931 |
-
Installed 26 packages in
|
| 5932 |
</div>
|
| 5933 |
</div>
|
| 5934 |
<div class="cell-artifacts">
|
|
@@ -5947,7 +5946,7 @@ Installed 26 packages in 451ms
|
|
| 5947 |
<span onclick="toggleOutput('gptoss_training_run')" style="cursor: pointer;">▼ output</span>
|
| 5948 |
<span id="uv-indicator-gptoss_training_run" onclick="toggleUvLogsFromHeader('gptoss_training_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5949 |
</span> |
|
| 5950 |
-
Cell: gptoss_training_run | deps: torch, numpy | 39.
|
| 5951 |
| <button class="run-btn" onclick="runCell('gptoss_training_run')">▶ run</button>
|
| 5952 |
<button class="copy-btn" onclick="copyCell('gptoss_training_run')">Copy</button>
|
| 5953 |
<a href="cells/gptoss_training_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -6248,10 +6247,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 6248 |
|
| 6249 |
Warming up (10 iterations)...
|
| 6250 |
Benchmarking (50 iterations)...
|
| 6251 |
-
Progress: 20% complete (avg: 48.
|
| 6252 |
-
Progress: 40% complete (avg: 47.
|
| 6253 |
-
Progress: 60% complete (avg:
|
| 6254 |
-
Progress: 80% complete (avg:
|
| 6255 |
|
| 6256 |
Output tensors:
|
| 6257 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
|
|
@@ -6261,19 +6260,19 @@ Output tensors:
|
|
| 6261 |
Iterations: 50
|
| 6262 |
|
| 6263 |
Latency Statistics:
|
| 6264 |
-
Average:
|
| 6265 |
-
Min: 38.
|
| 6266 |
-
Max: 49.
|
| 6267 |
-
Std Dev: 2.
|
| 6268 |
|
| 6269 |
Percentiles:
|
| 6270 |
-
P50 (median): 45.
|
| 6271 |
-
P95: 48.
|
| 6272 |
-
P99: 48.
|
| 6273 |
|
| 6274 |
Throughput:
|
| 6275 |
-
Tokens/sec:
|
| 6276 |
-
Std Dev:
|
| 6277 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 6278 |
|
| 6279 |
Saved benchmark results to gptoss_training_results.json
|
|
@@ -6283,24 +6282,24 @@ Output sum: 11.532237
|
|
| 6283 |
<div class="uv-install-logs" id="uv-logs-gptoss_training_run">
|
| 6284 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6285 |
<div class="uv-logs-content" style="display: none;">
|
| 6286 |
-
Downloading nvidia-
|
|
|
|
|
|
|
| 6287 |
Downloading numpy (16.2MiB)
|
| 6288 |
-
Downloading nvidia-
|
|
|
|
| 6289 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 6290 |
-
Downloading
|
| 6291 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 6292 |
-
Downloading
|
| 6293 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 6294 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 6295 |
-
Downloading
|
| 6296 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 6297 |
-
Downloading setuptools (1.1MiB)
|
| 6298 |
-
Downloading sympy (6.0MiB)
|
| 6299 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 6300 |
-
Downloading nvidia-
|
| 6301 |
-
Downloading nvidia-
|
|
|
|
|
|
|
| 6302 |
Downloading torch (846.9MiB)
|
| 6303 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 6304 |
Downloading nvidia-cufile-cu12
|
| 6305 |
Downloading setuptools
|
| 6306 |
Downloading networkx
|
|
@@ -6319,7 +6318,7 @@ Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
| 6319 |
Downloading nvidia-cublas-cu12
|
| 6320 |
Downloading nvidia-cudnn-cu12
|
| 6321 |
Downloading torch
|
| 6322 |
-
Installed 26 packages in
|
| 6323 |
</div>
|
| 6324 |
</div>
|
| 6325 |
<div class="cell-artifacts">
|
|
@@ -6338,7 +6337,7 @@ Installed 26 packages in 449ms
|
|
| 6338 |
<span onclick="toggleOutput('megablocks_run')" style="cursor: pointer;">▼ output</span>
|
| 6339 |
<span id="uv-indicator-megablocks_run" onclick="toggleUvLogsFromHeader('megablocks_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 6340 |
</span> |
|
| 6341 |
-
Cell: megablocks_run | deps: torch, numpy, kernels |
|
| 6342 |
| <button class="run-btn" onclick="runCell('megablocks_run')">▶ run</button>
|
| 6343 |
<button class="copy-btn" onclick="copyCell('megablocks_run')">Copy</button>
|
| 6344 |
<a href="cells/megablocks_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -6493,7 +6492,7 @@ Cell: megablocks_run | deps: torch, numpy, kernels | 40.94s | FAILED
|
|
| 6493 |
<span class="c1"># Attach loaded expert weights to the experts container</span>
|
| 6494 |
<span class="n">e</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">experts</span>
|
| 6495 |
<span class="n">e</span><span class="o">.</span><span class="n">alpha</span> <span class="o">=</span> <span class="mf">1.702</span>
|
| 6496 |
-
<span class="n">e</span><span class="o">.</span><span class="n">capacity_factor</span> <span class="o">=</span> <span class="mi">
|
| 6497 |
<span class="n">e</span><span class="o">.</span><span class="n">gate_up_proj</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">gate_up_proj</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">))</span>
|
| 6498 |
<span class="n">e</span><span class="o">.</span><span class="n">gate_up_proj_bias</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">gate_up_proj_bias</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">))</span>
|
| 6499 |
<span class="n">e</span><span class="o">.</span><span class="n">down_proj</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">down_proj</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">))</span>
|
|
@@ -6570,25 +6569,25 @@ Warming up (10 iterations)...
|
|
| 6570 |
<div class="uv-install-logs" id="uv-logs-megablocks_run">
|
| 6571 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6572 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
|
|
|
| 6573 |
Downloading networkx (1.9MiB)
|
| 6574 |
-
Downloading nvidia-
|
| 6575 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 6576 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 6577 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 6578 |
-
Downloading triton (148.3MiB)
|
| 6579 |
-
Downloading numpy (16.2MiB)
|
| 6580 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 6581 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 6582 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 6583 |
-
Downloading nvidia-
|
| 6584 |
-
Downloading
|
| 6585 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 6586 |
-
Downloading sympy (6.0MiB)
|
| 6587 |
-
Downloading setuptools (1.1MiB)
|
| 6588 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 6589 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
| 6590 |
Downloading hf-xet (3.0MiB)
|
| 6591 |
-
Downloading
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6592 |
Downloading nvidia-cufile-cu12
|
| 6593 |
Downloading hf-xet
|
| 6594 |
Downloading setuptools
|
|
@@ -6608,22 +6607,20 @@ Downloading torch (846.9MiB)
|
|
| 6608 |
Downloading nvidia-cublas-cu12
|
| 6609 |
Downloading nvidia-cudnn-cu12
|
| 6610 |
Downloading torch
|
| 6611 |
-
Installed 37 packages in
|
| 6612 |
</div>
|
| 6613 |
</div>
|
| 6614 |
<div class="cell-stderr">Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s]
|
| 6615 |
-
Fetching 66 files: 2%|▏ | 1/66 [00:00<00:
|
| 6616 |
-
Fetching 66 files: 6%|▌ | 4/66 [00:00<00:
|
| 6617 |
-
Fetching 66 files:
|
| 6618 |
-
Fetching 66 files:
|
| 6619 |
-
Fetching 66 files:
|
| 6620 |
-
Fetching 66 files:
|
| 6621 |
-
Fetching 66 files:
|
| 6622 |
-
Fetching 66 files:
|
| 6623 |
-
Fetching 66 files:
|
| 6624 |
-
|
| 6625 |
-
Fetching 66 files: 100%|██████████| 66/66 [00:02<00:00, 27.20it/s]
|
| 6626 |
-
/tmp/tmps8crtj9h/cuda_utils.c:5:10: fatal error: Python.h: No such file or directory
|
| 6627 |
5 | #include <Python.h>
|
| 6628 |
| ^~~~~~~~~~
|
| 6629 |
compilation terminated.
|
|
@@ -6640,87 +6637,87 @@ Traceback (most recent call last):
|
|
| 6640 |
File "/repo/moe_benchmarks/megablocks_yamoe/.uvnote/cells/bench_utils.py", line 177, in <lambda>
|
| 6641 |
call = lambda x: fn(x, *args[1:], **kwargs)
|
| 6642 |
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6643 |
-
File "/tmp/uvnote-run-
|
| 6644 |
return self._call_impl(*args, **kwargs)
|
| 6645 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6646 |
-
File "/tmp/uvnote-run-
|
| 6647 |
return forward_call(*args, **kwargs)
|
| 6648 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6649 |
File "/repo/moe_benchmarks/megablocks_yamoe/.uvnote/cells/megablocks_run.py", line 81, in forward
|
| 6650 |
output, dummy_routing_weights = self.model(hidden_states)
|
| 6651 |
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6652 |
-
File "/tmp/uvnote-run-
|
| 6653 |
return self._call_impl(*args, **kwargs)
|
| 6654 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6655 |
-
File "/tmp/uvnote-run-
|
| 6656 |
return forward_call(*args, **kwargs)
|
| 6657 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6658 |
-
File "/tmp/uvnote-run-
|
| 6659 |
output, expert_weights_out, *_ = moe_forward(
|
| 6660 |
^^^^^^^^^^^^
|
| 6661 |
-
File "/tmp/uvnote-run-
|
| 6662 |
x, tokens_per_expert = forward_fn(**forward_args)
|
| 6663 |
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6664 |
-
File "/tmp/uvnote-run-
|
| 6665 |
x = permute_and_compute(
|
| 6666 |
^^^^^^^^^^^^^^^^^^^^
|
| 6667 |
-
File "/tmp/uvnote-run-
|
| 6668 |
x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
|
| 6669 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6670 |
-
File "/tmp/uvnote-run-
|
| 6671 |
return super().apply(*args, **kwargs) # type: ignore[misc]
|
| 6672 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6673 |
-
File "/tmp/uvnote-run-
|
| 6674 |
return fwd(*args, **kwargs)
|
| 6675 |
^^^^^^^^^^^^^^^^^^^^
|
| 6676 |
-
File "/tmp/uvnote-run-
|
| 6677 |
return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
|
| 6678 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6679 |
-
File "/tmp/uvnote-run-
|
| 6680 |
_binned_copy[(num_experts, expert_capacity)](
|
| 6681 |
-
File "/tmp/uvnote-run-
|
| 6682 |
return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
|
| 6683 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6684 |
-
File "/tmp/uvnote-run-
|
| 6685 |
benchmark()
|
| 6686 |
-
File "/tmp/uvnote-run-
|
| 6687 |
timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
|
| 6688 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6689 |
-
File "/tmp/uvnote-run-
|
| 6690 |
timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
|
| 6691 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6692 |
-
File "/tmp/uvnote-run-
|
| 6693 |
return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))
|
| 6694 |
^^^^^^^^^^^^^
|
| 6695 |
File "/usr/lib/python3.11/functools.py", line 1001, in __get__
|
| 6696 |
val = self.func(instance)
|
| 6697 |
^^^^^^^^^^^^^^^^^^^
|
| 6698 |
-
File "/tmp/uvnote-run-
|
| 6699 |
return driver.active.get_benchmarker()
|
| 6700 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6701 |
-
File "/tmp/uvnote-run-
|
| 6702 |
return getattr(self._initialize_obj(), name)
|
| 6703 |
^^^^^^^^^^^^^^^^^^^^^^
|
| 6704 |
-
File "/tmp/uvnote-run-
|
| 6705 |
self._obj = self._init_fn()
|
| 6706 |
^^^^^^^^^^^^^^^
|
| 6707 |
-
File "/tmp/uvnote-run-
|
| 6708 |
return active_drivers[0]()
|
| 6709 |
^^^^^^^^^^^^^^^^^^^
|
| 6710 |
-
File "/tmp/uvnote-run-
|
| 6711 |
self.utils = CudaUtils() # TODO: make static
|
| 6712 |
^^^^^^^^^^^
|
| 6713 |
-
File "/tmp/uvnote-run-
|
| 6714 |
mod = compile_module_from_src(
|
| 6715 |
^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6716 |
-
File "/tmp/uvnote-run-
|
| 6717 |
so = _build(name, src_path, tmpdir, library_dirs or [], include_dirs or [], libraries or [])
|
| 6718 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6719 |
-
File "/tmp/uvnote-run-
|
| 6720 |
subprocess.check_call(cc_cmd, stdout=subprocess.DEVNULL)
|
| 6721 |
File "/usr/lib/python3.11/subprocess.py", line 413, in check_call
|
| 6722 |
raise CalledProcessError(retcode, cmd)
|
| 6723 |
-
subprocess.CalledProcessError: Command '['/usr/bin/gcc', '/tmp/
|
| 6724 |
</div>
|
| 6725 |
</div>
|
| 6726 |
|
|
|
|
| 3720 |
<span onclick="toggleOutput('utils')" style="cursor: pointer;">▼ output</span>
|
| 3721 |
<span id="uv-indicator-utils" onclick="toggleUvLogsFromHeader('utils')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3722 |
</span> |
|
| 3723 |
+
Cell: utils | deps: torch, numpy | 34.25s
|
| 3724 |
| <button class="run-btn" onclick="runCell('utils')">▶ run</button>
|
| 3725 |
<button class="copy-btn" onclick="copyCell('utils')">Copy</button>
|
| 3726 |
<a href="cells/utils.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3794 |
<div class="uv-install-logs" id="uv-logs-utils">
|
| 3795 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3796 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
| 3797 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
|
|
|
| 3798 |
Downloading setuptools (1.1MiB)
|
| 3799 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3800 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3801 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3802 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3803 |
Downloading numpy (16.2MiB)
|
|
|
|
|
|
|
| 3804 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3805 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
|
|
|
| 3806 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3807 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3808 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3809 |
+
Downloading sympy (6.0MiB)
|
| 3810 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3811 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3812 |
Downloading triton (148.3MiB)
|
| 3813 |
+
Downloading torch (846.9MiB)
|
| 3814 |
+
Downloading networkx (1.9MiB)
|
| 3815 |
Downloading nvidia-cufile-cu12
|
| 3816 |
Downloading setuptools
|
| 3817 |
Downloading networkx
|
|
|
|
| 3824 |
Downloading triton
|
| 3825 |
Downloading nvidia-cufft-cu12
|
| 3826 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 3827 |
Downloading nvidia-cusparselt-cu12
|
| 3828 |
+
Downloading nvidia-cusparse-cu12
|
| 3829 |
Downloading nvidia-nccl-cu12
|
| 3830 |
Downloading nvidia-cublas-cu12
|
| 3831 |
Downloading nvidia-cudnn-cu12
|
| 3832 |
Downloading torch
|
| 3833 |
+
Installed 26 packages in 446ms
|
| 3834 |
</div>
|
| 3835 |
</div>
|
| 3836 |
</div>
|
|
|
|
| 3843 |
<span onclick="toggleOutput('bench_utils')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
<span id="uv-indicator-bench_utils" onclick="toggleUvLogsFromHeader('bench_utils')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3845 |
</span> |
|
| 3846 |
+
Cell: bench_utils | deps: torch, numpy | 35.45s
|
| 3847 |
| <button class="run-btn" onclick="runCell('bench_utils')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('bench_utils')">Copy</button>
|
| 3849 |
<a href="cells/bench_utils.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4331 |
<div class="uv-install-logs" id="uv-logs-bench_utils">
|
| 4332 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4333 |
<div class="uv-logs-content" style="display: none;">
|
| 4334 |
+
Downloading numpy (16.2MiB)
|
| 4335 |
+
Downloading torch (846.9MiB)
|
| 4336 |
+
Downloading triton (148.3MiB)
|
| 4337 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4338 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4339 |
Downloading setuptools (1.1MiB)
|
| 4340 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4341 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4342 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4343 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4344 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4345 |
+
Downloading sympy (6.0MiB)
|
| 4346 |
+
Downloading networkx (1.9MiB)
|
| 4347 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
|
|
|
|
|
|
| 4348 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4349 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4350 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4351 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4352 |
Downloading nvidia-cufile-cu12
|
| 4353 |
Downloading setuptools
|
| 4354 |
Downloading networkx
|
|
|
|
| 4361 |
Downloading triton
|
| 4362 |
Downloading nvidia-cufft-cu12
|
| 4363 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 4364 |
Downloading nvidia-cusparselt-cu12
|
| 4365 |
+
Downloading nvidia-cusparse-cu12
|
| 4366 |
Downloading nvidia-nccl-cu12
|
| 4367 |
Downloading nvidia-cublas-cu12
|
| 4368 |
Downloading nvidia-cudnn-cu12
|
|
|
|
| 4381 |
<span onclick="toggleOutput('config')" style="cursor: pointer;">▼ output</span>
|
| 4382 |
<span id="uv-indicator-config" onclick="toggleUvLogsFromHeader('config')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4383 |
</span> |
|
| 4384 |
+
Cell: config | deps: torch, numpy | 34.31s
|
| 4385 |
| <button class="run-btn" onclick="runCell('config')">▶ run</button>
|
| 4386 |
<button class="copy-btn" onclick="copyCell('config')">Copy</button>
|
| 4387 |
<a href="cells/config.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4442 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4443 |
<div class="uv-logs-content" style="display: none;">
|
| 4444 |
Downloading sympy (6.0MiB)
|
|
|
|
|
|
|
| 4445 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
|
|
|
| 4446 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
| 4447 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4448 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4449 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4450 |
+
Downloading torch (846.9MiB)
|
| 4451 |
+
Downloading networkx (1.9MiB)
|
|
|
|
|
|
|
| 4452 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4453 |
+
Downloading setuptools (1.1MiB)
|
| 4454 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4455 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4456 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4457 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4458 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4459 |
+
Downloading numpy (16.2MiB)
|
| 4460 |
Downloading triton (148.3MiB)
|
| 4461 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4462 |
Downloading nvidia-cufile-cu12
|
| 4463 |
Downloading setuptools
|
| 4464 |
Downloading networkx
|
|
|
|
| 4471 |
Downloading triton
|
| 4472 |
Downloading nvidia-cufft-cu12
|
| 4473 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 4474 |
Downloading nvidia-cusparse-cu12
|
| 4475 |
+
Downloading nvidia-cusparselt-cu12
|
| 4476 |
Downloading nvidia-nccl-cu12
|
| 4477 |
Downloading nvidia-cublas-cu12
|
| 4478 |
Downloading nvidia-cudnn-cu12
|
| 4479 |
Downloading torch
|
| 4480 |
+
Installed 26 packages in 450ms
|
| 4481 |
</div>
|
| 4482 |
</div>
|
| 4483 |
</div>
|
|
|
|
| 4490 |
<span onclick="toggleOutput('save_data')" style="cursor: pointer;">▼ output</span>
|
| 4491 |
<span id="uv-indicator-save_data" onclick="toggleUvLogsFromHeader('save_data')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4492 |
</span> |
|
| 4493 |
+
Cell: save_data | deps: torch, numpy | 39.54s
|
| 4494 |
| <button class="run-btn" onclick="runCell('save_data')">▶ run</button>
|
| 4495 |
<button class="copy-btn" onclick="copyCell('save_data')">Copy</button>
|
| 4496 |
<a href="cells/save_data.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4585 |
<div class="uv-install-logs" id="uv-logs-save_data">
|
| 4586 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4587 |
<div class="uv-logs-content" style="display: none;">
|
| 4588 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4589 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4590 |
+
Downloading numpy (16.2MiB)
|
| 4591 |
+
Downloading setuptools (1.1MiB)
|
| 4592 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
|
|
|
|
|
|
| 4593 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
|
|
|
|
|
|
|
|
|
| 4594 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4595 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4596 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4597 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4598 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4599 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4600 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4601 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4602 |
+
Downloading sympy (6.0MiB)
|
| 4603 |
+
Downloading torch (846.9MiB)
|
| 4604 |
+
Downloading networkx (1.9MiB)
|
| 4605 |
+
Downloading triton (148.3MiB)
|
| 4606 |
Downloading nvidia-cufile-cu12
|
| 4607 |
Downloading setuptools
|
| 4608 |
Downloading networkx
|
|
|
|
| 4621 |
Downloading nvidia-cublas-cu12
|
| 4622 |
Downloading nvidia-cudnn-cu12
|
| 4623 |
Downloading torch
|
| 4624 |
+
Installed 26 packages in 446ms
|
| 4625 |
</div>
|
| 4626 |
</div>
|
| 4627 |
<div class="cell-artifacts">
|
| 4628 |
<h4>Artifacts:</h4>
|
|
|
|
|
|
|
|
|
|
| 4629 |
<a href="artifacts/save_data/gate_up_proj_bias.pt" class="artifact" target="_blank">gate_up_proj_bias.pt</a>
|
| 4630 |
+
<a href="artifacts/save_data/down_proj_bias.pt" class="artifact" target="_blank">down_proj_bias.pt</a>
|
| 4631 |
<a href="artifacts/save_data/down_proj.pt" class="artifact" target="_blank">down_proj.pt</a>
|
| 4632 |
+
<a href="artifacts/save_data/router_weight.pt" class="artifact" target="_blank">router_weight.pt</a>
|
| 4633 |
+
<a href="artifacts/save_data/router_bias.pt" class="artifact" target="_blank">router_bias.pt</a>
|
| 4634 |
<a href="artifacts/save_data/gate_up_proj.pt" class="artifact" target="_blank">gate_up_proj.pt</a>
|
| 4635 |
</div>
|
| 4636 |
</div>
|
|
|
|
| 4645 |
<span onclick="toggleOutput('yamoe_run')" style="cursor: pointer;">▼ output</span>
|
| 4646 |
<span id="uv-indicator-yamoe_run" onclick="toggleUvLogsFromHeader('yamoe_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4647 |
</span> |
|
| 4648 |
+
Cell: yamoe_run | deps: torch, kernels, numpy | 39.10s
|
| 4649 |
| <button class="run-btn" onclick="runCell('yamoe_run')">▶ run</button>
|
| 4650 |
<button class="copy-btn" onclick="copyCell('yamoe_run')">Copy</button>
|
| 4651 |
<a href="cells/yamoe_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4938 |
|
| 4939 |
Warming up (10 iterations)...
|
| 4940 |
Benchmarking (50 iterations)...
|
| 4941 |
+
Progress: 20% complete (avg: 4.251 ms)
|
| 4942 |
+
Progress: 40% complete (avg: 4.248 ms)
|
| 4943 |
+
Progress: 60% complete (avg: 4.248 ms)
|
| 4944 |
Progress: 80% complete (avg: 4.249 ms)
|
| 4945 |
|
| 4946 |
Output tensors:
|
|
|
|
| 4951 |
Iterations: 50
|
| 4952 |
|
| 4953 |
Latency Statistics:
|
| 4954 |
+
Average: 4.250 ms
|
| 4955 |
+
Min: 4.144 ms
|
| 4956 |
+
Max: 4.276 ms
|
| 4957 |
+
Std Dev: 0.020 ms
|
| 4958 |
|
| 4959 |
Percentiles:
|
| 4960 |
+
P50 (median): 4.252 ms
|
| 4961 |
+
P95: 4.269 ms
|
| 4962 |
+
P99: 4.276 ms
|
| 4963 |
|
| 4964 |
Throughput:
|
| 4965 |
+
Tokens/sec: 23531.6
|
| 4966 |
+
Std Dev: 113.9
|
| 4967 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 4968 |
|
| 4969 |
Saved benchmark results to yamoe_results.json
|
|
|
|
| 4973 |
<div class="uv-install-logs" id="uv-logs-yamoe_run">
|
| 4974 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4975 |
<div class="uv-logs-content" style="display: none;">
|
| 4976 |
+
Downloading hf-xet (3.0MiB)
|
| 4977 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4978 |
Downloading networkx (1.9MiB)
|
| 4979 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4980 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4981 |
+
Downloading setuptools (1.1MiB)
|
| 4982 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4983 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4984 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4985 |
+
Downloading numpy (16.2MiB)
|
| 4986 |
Downloading torch (846.9MiB)
|
| 4987 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4988 |
+
Downloading sympy (6.0MiB)
|
| 4989 |
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4990 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4991 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4992 |
Downloading triton (148.3MiB)
|
| 4993 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4994 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
|
|
|
| 4995 |
Downloading nvidia-cufile-cu12
|
| 4996 |
Downloading hf-xet
|
| 4997 |
Downloading setuptools
|
|
|
|
| 5011 |
Downloading nvidia-cublas-cu12
|
| 5012 |
Downloading nvidia-cudnn-cu12
|
| 5013 |
Downloading torch
|
| 5014 |
+
Installed 37 packages in 454ms
|
| 5015 |
</div>
|
| 5016 |
</div>
|
| 5017 |
<div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s]
|
| 5018 |
+
Fetching 6 files: 17%|█▋ | 1/6 [00:00<00:01, 3.47it/s]
|
| 5019 |
+
Fetching 6 files: 50%|█████ | 3/6 [00:00<00:00, 4.22it/s]
|
| 5020 |
+
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 8.26it/s]</div>
|
|
|
|
| 5021 |
<div class="cell-artifacts">
|
| 5022 |
<h4>Artifacts:</h4>
|
| 5023 |
<a href="artifacts/yamoe_run/yamoe_results.json" class="artifact" target="_blank">yamoe_results.json</a>
|
|
|
|
| 5034 |
<span onclick="toggleOutput('binned_run')" style="cursor: pointer;">▼ output</span>
|
| 5035 |
<span id="uv-indicator-binned_run" onclick="toggleUvLogsFromHeader('binned_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5036 |
</span> |
|
| 5037 |
+
Cell: binned_run | deps: torch, numpy | 39.44s
|
| 5038 |
| <button class="run-btn" onclick="runCell('binned_run')">▶ run</button>
|
| 5039 |
<button class="copy-btn" onclick="copyCell('binned_run')">Copy</button>
|
| 5040 |
<a href="cells/binned_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 5448 |
|
| 5449 |
Warming up (10 iterations)...
|
| 5450 |
Benchmarking (50 iterations)...
|
| 5451 |
+
Progress: 20% complete (avg: 37.889 ms)
|
| 5452 |
+
Progress: 40% complete (avg: 37.238 ms)
|
| 5453 |
+
Progress: 60% complete (avg: 36.997 ms)
|
| 5454 |
+
Progress: 80% complete (avg: 36.387 ms)
|
| 5455 |
|
| 5456 |
Output tensors:
|
| 5457 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
|
|
|
|
| 5461 |
Iterations: 50
|
| 5462 |
|
| 5463 |
Latency Statistics:
|
| 5464 |
+
Average: 35.833 ms
|
| 5465 |
+
Min: 32.582 ms
|
| 5466 |
+
Max: 40.501 ms
|
| 5467 |
+
Std Dev: 1.694 ms
|
| 5468 |
|
| 5469 |
Percentiles:
|
| 5470 |
+
P50 (median): 36.177 ms
|
| 5471 |
+
P95: 38.671 ms
|
| 5472 |
+
P99: 39.929 ms
|
| 5473 |
|
| 5474 |
Throughput:
|
| 5475 |
+
Tokens/sec: 2790.7
|
| 5476 |
+
Std Dev: 131.3
|
| 5477 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 5478 |
|
| 5479 |
Saved benchmark results to binned_results.json
|
|
|
|
| 5483 |
<div class="uv-install-logs" id="uv-logs-binned_run">
|
| 5484 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 5485 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
| 5486 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
| 5487 |
Downloading sympy (6.0MiB)
|
| 5488 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 5489 |
+
Downloading setuptools (1.1MiB)
|
| 5490 |
+
Downloading numpy (16.2MiB)
|
| 5491 |
+
Downloading networkx (1.9MiB)
|
| 5492 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 5493 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 5494 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 5495 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
|
|
|
| 5496 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 5497 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
|
|
|
| 5498 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 5499 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 5500 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 5501 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 5502 |
Downloading torch (846.9MiB)
|
| 5503 |
+
Downloading triton (148.3MiB)
|
| 5504 |
Downloading nvidia-cufile-cu12
|
| 5505 |
Downloading setuptools
|
| 5506 |
Downloading networkx
|
|
|
|
| 5519 |
Downloading nvidia-cublas-cu12
|
| 5520 |
Downloading nvidia-cudnn-cu12
|
| 5521 |
Downloading torch
|
| 5522 |
+
Installed 26 packages in 446ms
|
| 5523 |
</div>
|
| 5524 |
</div>
|
| 5525 |
<div class="cell-artifacts">
|
|
|
|
| 5538 |
<span onclick="toggleOutput('gptoss_run')" style="cursor: pointer;">▼ output</span>
|
| 5539 |
<span id="uv-indicator-gptoss_run" onclick="toggleUvLogsFromHeader('gptoss_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5540 |
</span> |
|
| 5541 |
+
Cell: gptoss_run | deps: torch, numpy | 40.46s
|
| 5542 |
| <button class="run-btn" onclick="runCell('gptoss_run')">▶ run</button>
|
| 5543 |
<button class="copy-btn" onclick="copyCell('gptoss_run')">Copy</button>
|
| 5544 |
<a href="cells/gptoss_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 5856 |
|
| 5857 |
Warming up (10 iterations)...
|
| 5858 |
Benchmarking (50 iterations)...
|
| 5859 |
+
Progress: 20% complete (avg: 50.504 ms)
|
| 5860 |
+
Progress: 40% complete (avg: 50.045 ms)
|
| 5861 |
+
Progress: 60% complete (avg: 49.107 ms)
|
| 5862 |
+
Progress: 80% complete (avg: 48.012 ms)
|
| 5863 |
|
| 5864 |
Output tensors:
|
| 5865 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
|
|
|
|
| 5869 |
Iterations: 50
|
| 5870 |
|
| 5871 |
Latency Statistics:
|
| 5872 |
+
Average: 46.791 ms
|
| 5873 |
+
Min: 39.036 ms
|
| 5874 |
+
Max: 50.857 ms
|
| 5875 |
+
Std Dev: 3.251 ms
|
| 5876 |
|
| 5877 |
Percentiles:
|
| 5878 |
+
P50 (median): 47.476 ms
|
| 5879 |
+
P95: 50.806 ms
|
| 5880 |
+
P99: 50.839 ms
|
| 5881 |
|
| 5882 |
Throughput:
|
| 5883 |
+
Tokens/sec: 2137.2
|
| 5884 |
+
Std Dev: 155.2
|
| 5885 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 5886 |
|
| 5887 |
Saved benchmark results to gptoss_results.json
|
|
|
|
| 5891 |
<div class="uv-install-logs" id="uv-logs-gptoss_run">
|
| 5892 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 5893 |
<div class="uv-logs-content" style="display: none;">
|
| 5894 |
+
Downloading setuptools (1.1MiB)
|
| 5895 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 5896 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 5897 |
+
Downloading numpy (16.2MiB)
|
| 5898 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 5899 |
Downloading sympy (6.0MiB)
|
| 5900 |
+
Downloading torch (846.9MiB)
|
| 5901 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 5902 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 5903 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
|
|
|
|
|
| 5904 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 5905 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 5906 |
+
Downloading networkx (1.9MiB)
|
| 5907 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 5908 |
Downloading triton (148.3MiB)
|
| 5909 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 5910 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
|
|
|
| 5911 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5912 |
Downloading nvidia-cufile-cu12
|
| 5913 |
Downloading setuptools
|
| 5914 |
Downloading networkx
|
|
|
|
| 5927 |
Downloading nvidia-cublas-cu12
|
| 5928 |
Downloading nvidia-cudnn-cu12
|
| 5929 |
Downloading torch
|
| 5930 |
+
Installed 26 packages in 442ms
|
| 5931 |
</div>
|
| 5932 |
</div>
|
| 5933 |
<div class="cell-artifacts">
|
|
|
|
| 5946 |
<span onclick="toggleOutput('gptoss_training_run')" style="cursor: pointer;">▼ output</span>
|
| 5947 |
<span id="uv-indicator-gptoss_training_run" onclick="toggleUvLogsFromHeader('gptoss_training_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5948 |
</span> |
|
| 5949 |
+
Cell: gptoss_training_run | deps: torch, numpy | 39.65s
|
| 5950 |
| <button class="run-btn" onclick="runCell('gptoss_training_run')">▶ run</button>
|
| 5951 |
<button class="copy-btn" onclick="copyCell('gptoss_training_run')">Copy</button>
|
| 5952 |
<a href="cells/gptoss_training_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 6247 |
|
| 6248 |
Warming up (10 iterations)...
|
| 6249 |
Benchmarking (50 iterations)...
|
| 6250 |
+
Progress: 20% complete (avg: 48.334 ms)
|
| 6251 |
+
Progress: 40% complete (avg: 47.917 ms)
|
| 6252 |
+
Progress: 60% complete (avg: 47.077 ms)
|
| 6253 |
+
Progress: 80% complete (avg: 46.038 ms)
|
| 6254 |
|
| 6255 |
Output tensors:
|
| 6256 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
|
|
|
|
| 6260 |
Iterations: 50
|
| 6261 |
|
| 6262 |
Latency Statistics:
|
| 6263 |
+
Average: 45.007 ms
|
| 6264 |
+
Min: 38.837 ms
|
| 6265 |
+
Max: 49.308 ms
|
| 6266 |
+
Std Dev: 2.894 ms
|
| 6267 |
|
| 6268 |
Percentiles:
|
| 6269 |
+
P50 (median): 45.575 ms
|
| 6270 |
+
P95: 48.573 ms
|
| 6271 |
+
P99: 48.964 ms
|
| 6272 |
|
| 6273 |
Throughput:
|
| 6274 |
+
Tokens/sec: 2221.9
|
| 6275 |
+
Std Dev: 147.9
|
| 6276 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 6277 |
|
| 6278 |
Saved benchmark results to gptoss_training_results.json
|
|
|
|
| 6282 |
<div class="uv-install-logs" id="uv-logs-gptoss_training_run">
|
| 6283 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6284 |
<div class="uv-logs-content" style="display: none;">
|
| 6285 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 6286 |
+
Downloading networkx (1.9MiB)
|
| 6287 |
+
Downloading setuptools (1.1MiB)
|
| 6288 |
Downloading numpy (16.2MiB)
|
| 6289 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 6290 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 6291 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 6292 |
+
Downloading sympy (6.0MiB)
|
| 6293 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 6294 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
| 6295 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 6296 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
|
|
|
|
|
|
| 6297 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 6298 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 6299 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 6300 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 6301 |
+
Downloading triton (148.3MiB)
|
| 6302 |
Downloading torch (846.9MiB)
|
|
|
|
| 6303 |
Downloading nvidia-cufile-cu12
|
| 6304 |
Downloading setuptools
|
| 6305 |
Downloading networkx
|
|
|
|
| 6318 |
Downloading nvidia-cublas-cu12
|
| 6319 |
Downloading nvidia-cudnn-cu12
|
| 6320 |
Downloading torch
|
| 6321 |
+
Installed 26 packages in 448ms
|
| 6322 |
</div>
|
| 6323 |
</div>
|
| 6324 |
<div class="cell-artifacts">
|
|
|
|
| 6337 |
<span onclick="toggleOutput('megablocks_run')" style="cursor: pointer;">▼ output</span>
|
| 6338 |
<span id="uv-indicator-megablocks_run" onclick="toggleUvLogsFromHeader('megablocks_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 6339 |
</span> |
|
| 6340 |
+
Cell: megablocks_run | deps: torch, numpy, kernels | 41.38s | FAILED
|
| 6341 |
| <button class="run-btn" onclick="runCell('megablocks_run')">▶ run</button>
|
| 6342 |
<button class="copy-btn" onclick="copyCell('megablocks_run')">Copy</button>
|
| 6343 |
<a href="cells/megablocks_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 6492 |
<span class="c1"># Attach loaded expert weights to the experts container</span>
|
| 6493 |
<span class="n">e</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">experts</span>
|
| 6494 |
<span class="n">e</span><span class="o">.</span><span class="n">alpha</span> <span class="o">=</span> <span class="mf">1.702</span>
|
| 6495 |
+
<span class="n">e</span><span class="o">.</span><span class="n">capacity_factor</span> <span class="o">=</span> <span class="mi">64</span>
|
| 6496 |
<span class="n">e</span><span class="o">.</span><span class="n">gate_up_proj</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">gate_up_proj</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">))</span>
|
| 6497 |
<span class="n">e</span><span class="o">.</span><span class="n">gate_up_proj_bias</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">gate_up_proj_bias</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">))</span>
|
| 6498 |
<span class="n">e</span><span class="o">.</span><span class="n">down_proj</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">down_proj</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">))</span>
|
|
|
|
| 6569 |
<div class="uv-install-logs" id="uv-logs-megablocks_run">
|
| 6570 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6571 |
<div class="uv-logs-content" style="display: none;">
|
| 6572 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 6573 |
+
Downloading setuptools (1.1MiB)
|
| 6574 |
+
Downloading numpy (16.2MiB)
|
| 6575 |
Downloading networkx (1.9MiB)
|
| 6576 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
|
|
|
| 6577 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6578 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 6579 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 6580 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 6581 |
+
Downloading torch (846.9MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6582 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 6583 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 6584 |
Downloading hf-xet (3.0MiB)
|
| 6585 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 6586 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 6587 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 6588 |
+
Downloading triton (148.3MiB)
|
| 6589 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 6590 |
+
Downloading sympy (6.0MiB)
|
| 6591 |
Downloading nvidia-cufile-cu12
|
| 6592 |
Downloading hf-xet
|
| 6593 |
Downloading setuptools
|
|
|
|
| 6607 |
Downloading nvidia-cublas-cu12
|
| 6608 |
Downloading nvidia-cudnn-cu12
|
| 6609 |
Downloading torch
|
| 6610 |
+
Installed 37 packages in 543ms
|
| 6611 |
</div>
|
| 6612 |
</div>
|
| 6613 |
<div class="cell-stderr">Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s]
|
| 6614 |
+
Fetching 66 files: 2%|▏ | 1/66 [00:00<00:27, 2.39it/s]
|
| 6615 |
+
Fetching 66 files: 6%|▌ | 4/66 [00:00<00:07, 8.04it/s]
|
| 6616 |
+
Fetching 66 files: 17%|█▋ | 11/66 [00:00<00:02, 21.45it/s]
|
| 6617 |
+
Fetching 66 files: 26%|██▌ | 17/66 [00:01<00:02, 17.15it/s]
|
| 6618 |
+
Fetching 66 files: 48%|████▊ | 32/66 [00:01<00:01, 30.72it/s]
|
| 6619 |
+
Fetching 66 files: 62%|██████▏ | 41/66 [00:01<00:01, 23.83it/s]
|
| 6620 |
+
Fetching 66 files: 71%|███████ | 47/66 [00:02<00:00, 25.88it/s]
|
| 6621 |
+
Fetching 66 files: 100%|██████████| 66/66 [00:02<00:00, 45.13it/s]
|
| 6622 |
+
Fetching 66 files: 100%|██████████| 66/66 [00:02<00:00, 29.34it/s]
|
| 6623 |
+
/tmp/tmpq5pei8xr/cuda_utils.c:5:10: fatal error: Python.h: No such file or directory
|
|
|
|
|
|
|
| 6624 |
5 | #include <Python.h>
|
| 6625 |
| ^~~~~~~~~~
|
| 6626 |
compilation terminated.
|
|
|
|
| 6637 |
File "/repo/moe_benchmarks/megablocks_yamoe/.uvnote/cells/bench_utils.py", line 177, in <lambda>
|
| 6638 |
call = lambda x: fn(x, *args[1:], **kwargs)
|
| 6639 |
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6640 |
+
File "/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
|
| 6641 |
return self._call_impl(*args, **kwargs)
|
| 6642 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6643 |
+
File "/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
|
| 6644 |
return forward_call(*args, **kwargs)
|
| 6645 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6646 |
File "/repo/moe_benchmarks/megablocks_yamoe/.uvnote/cells/megablocks_run.py", line 81, in forward
|
| 6647 |
output, dummy_routing_weights = self.model(hidden_states)
|
| 6648 |
^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6649 |
+
File "/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
|
| 6650 |
return self._call_impl(*args, **kwargs)
|
| 6651 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6652 |
+
File "/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
|
| 6653 |
return forward_call(*args, **kwargs)
|
| 6654 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6655 |
+
File "/tmp/uvnote-run-ab5uowvg/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py", line 896, in forward
|
| 6656 |
output, expert_weights_out, *_ = moe_forward(
|
| 6657 |
^^^^^^^^^^^^
|
| 6658 |
+
File "/tmp/uvnote-run-ab5uowvg/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py", line 730, in moe_forward
|
| 6659 |
x, tokens_per_expert = forward_fn(**forward_args)
|
| 6660 |
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6661 |
+
File "/tmp/uvnote-run-ab5uowvg/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py", line 457, in forward_once
|
| 6662 |
x = permute_and_compute(
|
| 6663 |
^^^^^^^^^^^^^^^^^^^^
|
| 6664 |
+
File "/tmp/uvnote-run-ab5uowvg/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py", line 401, in permute_and_compute
|
| 6665 |
x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
|
| 6666 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6667 |
+
File "/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/autograd/function.py", line 576, in apply
|
| 6668 |
return super().apply(*args, **kwargs) # type: ignore[misc]
|
| 6669 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6670 |
+
File "/tmp/uvnote-run-ab5uowvg/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/ops/stk_autocast.py", line 30, in decorate_fwd
|
| 6671 |
return fwd(*args, **kwargs)
|
| 6672 |
^^^^^^^^^^^^^^^^^^^^
|
| 6673 |
+
File "/tmp/uvnote-run-ab5uowvg/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/ops/binned_gather.py", line 26, in forward
|
| 6674 |
return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
|
| 6675 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6676 |
+
File "/tmp/uvnote-run-ab5uowvg/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/backend/kernels.py", line 419, in binned_gather
|
| 6677 |
_binned_copy[(num_experts, expert_capacity)](
|
| 6678 |
+
File "/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/jit.py", line 390, in <lambda>
|
| 6679 |
return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
|
| 6680 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6681 |
+
File "/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py", line 239, in run
|
| 6682 |
benchmark()
|
| 6683 |
+
File "/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py", line 228, in benchmark
|
| 6684 |
timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
|
| 6685 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6686 |
+
File "/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py", line 228, in <dictcomp>
|
| 6687 |
timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
|
| 6688 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6689 |
+
File "/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py", line 160, in _bench
|
| 6690 |
return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))
|
| 6691 |
^^^^^^^^^^^^^
|
| 6692 |
File "/usr/lib/python3.11/functools.py", line 1001, in __get__
|
| 6693 |
val = self.func(instance)
|
| 6694 |
^^^^^^^^^^^^^^^^^^^
|
| 6695 |
+
File "/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py", line 121, in do_bench
|
| 6696 |
return driver.active.get_benchmarker()
|
| 6697 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6698 |
+
File "/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/driver.py", line 30, in __getattr__
|
| 6699 |
return getattr(self._initialize_obj(), name)
|
| 6700 |
^^^^^^^^^^^^^^^^^^^^^^
|
| 6701 |
+
File "/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/driver.py", line 26, in _initialize_obj
|
| 6702 |
self._obj = self._init_fn()
|
| 6703 |
^^^^^^^^^^^^^^^
|
| 6704 |
+
File "/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/driver.py", line 12, in _create_driver
|
| 6705 |
return active_drivers[0]()
|
| 6706 |
^^^^^^^^^^^^^^^^^^^
|
| 6707 |
+
File "/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/driver.py", line 715, in __init__
|
| 6708 |
self.utils = CudaUtils() # TODO: make static
|
| 6709 |
^^^^^^^^^^^
|
| 6710 |
+
File "/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/driver.py", line 62, in __init__
|
| 6711 |
mod = compile_module_from_src(
|
| 6712 |
^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6713 |
+
File "/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/build.py", line 88, in compile_module_from_src
|
| 6714 |
so = _build(name, src_path, tmpdir, library_dirs or [], include_dirs or [], libraries or [])
|
| 6715 |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 6716 |
+
File "/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/build.py", line 51, in _build
|
| 6717 |
subprocess.check_call(cc_cmd, stdout=subprocess.DEVNULL)
|
| 6718 |
File "/usr/lib/python3.11/subprocess.py", line 413, in check_call
|
| 6719 |
raise CalledProcessError(retcode, cmd)
|
| 6720 |
+
subprocess.CalledProcessError: Command '['/usr/bin/gcc', '/tmp/tmpq5pei8xr/cuda_utils.c', '-O3', '-shared', '-fPIC', '-Wno-psabi', '-o', '/tmp/tmpq5pei8xr/cuda_utils.cpython-311-x86_64-linux-gnu.so', '-lcuda', '-L/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/lib', '-L/usr/lib/x86_64-linux-gnu', '-I/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/include', '-I/tmp/tmpq5pei8xr', '-I/usr/include/python3.11']' returned non-zero exit status 1.</div>
|
| 6721 |
</div>
|
| 6722 |
</div>
|
| 6723 |
|