drbh HF Staff commited on
Commit
ed9a6af
·
verified ·
1 Parent(s): 73f8595

Upload folder using huggingface_hub

Browse files
megablocks/cells/forward_only.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.12"
3
+ # dependencies = [
4
+ # "accelerate>=1.10.1",
5
+ # "torch>=2.7.0",
6
+ # "kernels==0.10.0",
7
+ # "transformers@https://github.com/huggingface/transformers.git",
8
+ # "ipdb>=0.13.13",
9
+ # "matplotlib>=3.7.2",
10
+ # "numpy>=1.24.3",
11
+ # ]
12
+ # ///
13
+
14
+ import torch
15
+ from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
16
+ import time
17
+ import torch.nn as nn
18
+ from kernels import register_kernel_mapping, Mode, LayerRepository, replace_kernel_forward_from_hub
19
+ import sys
20
+ import torch.profiler
21
+ import gc
22
+ import logging
23
+ from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm
24
+
25
+
26
# Map the RMSNorm layer to "no kernel" so the stock GptOssRMSNorm.forward is
# used instead of a hub-provided kernel implementation.
replace_kernel_forward_from_hub(GptOssRMSNorm, None)

# Enable INFO-level logging so kernel-selection messages from `kernels`
# (e.g. which MegaBlocks layer is picked) are visible in the cell output.
logging.basicConfig(level=logging.INFO)
30
+
31
def reset_peak_memory_stats():
    """Clear the CUDA cache, reset peak-memory counters, and run GC.

    Safe on CPU-only machines: every CUDA call is guarded by
    ``torch.cuda.is_available()``.
    """
    # Bug fix: the original called torch.cuda.empty_cache() unconditionally,
    # outside the availability guard — all CUDA calls now sit under it.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
    # Collect Python garbage so dropped tensors actually release their memory.
    gc.collect()
37
+
38
def get_memory_stats():
    """Get current and peak CUDA memory usage."""
    if not torch.cuda.is_available():
        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
    # Raw byte readings from the CUDA allocator, converted to GB below.
    bytes_per_gb = 1e9
    readings = {
        "allocated_gb": torch.cuda.memory_allocated(),
        "peak_gb": torch.cuda.max_memory_allocated(),
        "reserved_gb": torch.cuda.memory_reserved(),
    }
    return {name: count / bytes_per_gb for name, count in readings.items()}
47
+
48
def override_kernel_layer_name(cls_name: str, value) -> bool:
    """Dynamically override ``kernel_layer_name`` on a model class.

    Scans every imported module for an ``nn.Module`` subclass named
    *cls_name* and sets its ``kernel_layer_name`` class attribute; stops at
    the first match.

    Args:
        cls_name: Name of the class to look up in imported modules.
        value: New value for the ``kernel_layer_name`` attribute.

    Returns:
        True if a matching class was found and patched, False otherwise.
    """
    # Snapshot sys.modules first: getattr() on a module can trigger lazy
    # imports (transformers uses module-level __getattr__), which mutates
    # sys.modules mid-iteration and raises
    # "RuntimeError: dictionary changed size during iteration".
    for mod in list(sys.modules.values()):
        if mod is None:
            continue
        obj = getattr(mod, cls_name, None)
        if isinstance(obj, type) and issubclass(obj, nn.Module):
            setattr(obj, "kernel_layer_name", value)
            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
            return True
    return False
59
+
60
+
61
# Init the model the normal way
model_id = "openai/gpt-oss-20b"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
# Dequantize the MXFP4 checkpoint at load time; the full-precision hub
# kernels require a bf16 model (see the load-time warning in the logs).
quantization_config = Mxfp4Config(dequantize=True)


model = GptOssForCausalLM.from_pretrained(
    model_id,
    dtype="bfloat16",
    device_map="auto",  # shard/place across available GPUs automatically
    use_kernels=True,  # opt in to hub kernels (MegaBlocks MoE MLP mapping)
    quantization_config=quantization_config,
).eval()

# NOTE(review): the prompt is sent as a "system" message rather than "user" —
# presumably intentional for this benchmark cell; confirm.
messages = [
    {"role": "system", "content": "What is Tensor Parallelism?"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="low",  # gpt-oss chat-template knob for shorter analysis
).to("cuda")

max_tokens = 256

# Greedy decode: do_sample=False; temperature=None suppresses the unused
# sampling-parameter warning. inference_mode() skips autograd bookkeeping.
with torch.inference_mode():
    start_time = time.perf_counter()
    generated = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=False,
        temperature=None,
    )
    end_time = time.perf_counter()

# Keep special tokens so the gpt-oss channel markers (analysis/final) are visible.
print(tokenizer.decode(generated[0], skip_special_tokens=False))
print(f"Generation took {end_time - start_time:.2f} seconds")
megablocks/cells/nv.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
import subprocess

# Diagnostic cell: show GPU status for the notebook environment.
# Robustness fix: degrade gracefully instead of crashing with
# FileNotFoundError when nvidia-smi is not on PATH (CPU-only machines).
try:
    print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
except FileNotFoundError:
    print("nvidia-smi not found; no NVIDIA GPU tooling available")
megablocks/megablocks_only.html CHANGED
@@ -3715,7 +3715,74 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3715
  </div>
3716
 
3717
  <div class="main-content">
3718
- <h1>No Kernels</h1>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3719
  <p>First, we run the model without any custom kernels to get a reference point.</p>
3720
  <h2>Forward</h2>
3721
  <div class="cell" id="cell-no_kernels">
@@ -3725,7 +3792,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3725
  <span onclick="toggleOutput('no_kernels')" style="cursor: pointer;">▼ output</span>
3726
  <span id="uv-indicator-no_kernels" onclick="toggleUvLogsFromHeader('no_kernels')" style="cursor: pointer;">▶ uv-logs</span>
3727
  </span> |
3728
- Cell: no_kernels | 106.70s
3729
  | <button class="run-btn" onclick="runCell('no_kernels')">▶ run</button>
3730
  <button class="copy-btn" onclick="copyCell('no_kernels')">Copy</button>
3731
  <a href="cells/no_kernels.py" target="_blank" class="raw-btn">Raw</a>
@@ -3972,37 +4039,37 @@ Downloading cpython-3.13.7-linux-x86_64-gnu (download) (32.0MiB)
3972
  Downloading cpython-3.13.7-linux-x86_64-gnu (download)
3973
  Updating https://github.com/huggingface/transformers.git (HEAD)
3974
  Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
 
3975
  Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
3976
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3977
- Downloading numpy (15.9MiB)
3978
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3979
- Downloading fonttools (4.7MiB)
3980
  Downloading hf-xet (3.0MiB)
3981
- Downloading networkx (1.9MiB)
3982
- Downloading tokenizers (3.1MiB)
3983
- Downloading nvidia-cudnn-cu12 (674.0MiB)
3984
- Downloading triton (148.4MiB)
3985
- Downloading pygments (1.2MiB)
3986
- Downloading kiwisolver (1.4MiB)
3987
- Downloading nvidia-curand-cu12 (60.7MiB)
3988
  Downloading pillow (6.3MiB)
3989
- Downloading nvidia-cusparse-cu12 (274.9MiB)
3990
  Downloading nvidia-cufile-cu12 (1.1MiB)
3991
- Downloading sympy (6.0MiB)
3992
- Downloading jedi (1.5MiB)
3993
- Downloading nvidia-cusolver-cu12 (255.1MiB)
3994
  Downloading nvidia-nccl-cu12 (307.4MiB)
3995
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
 
 
 
 
3996
  Downloading nvidia-cufft-cu12 (184.2MiB)
 
 
 
3997
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3998
- Downloading nvidia-cublas-cu12 (566.8MiB)
3999
- Downloading torch (846.8MiB)
 
4000
  Downloading matplotlib (8.3MiB)
 
 
4001
  Downloading nvidia-cufile-cu12
4002
  Downloading kiwisolver
4003
  Downloading pygments
4004
- Downloading tokenizers
4005
  Downloading hf-xet
 
4006
  Downloading networkx
4007
  Downloading fonttools
4008
  Downloading pillow
@@ -4012,8 +4079,8 @@ Downloading matplotlib (8.3MiB)
4012
  Downloading sympy
4013
  Built transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
4014
  Downloading nvidia-nvjitlink-cu12
4015
- Downloading jedi
4016
  Downloading nvidia-curand-cu12
 
4017
  Downloading nvidia-cuda-nvrtc-cu12
4018
  Downloading triton
4019
  Downloading nvidia-cufft-cu12
@@ -4024,13 +4091,13 @@ Downloading matplotlib (8.3MiB)
4024
  Downloading nvidia-cublas-cu12
4025
  Downloading nvidia-cudnn-cu12
4026
  Downloading torch
4027
- Installed 69 packages in 466ms
4028
  </div>
4029
  </div>
4030
  <div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00&lt;?, ?it/s]
4031
- Fetching 3 files: 33%|███▎ | 1/3 [00:07&lt;00:14, 7.16s/it]
4032
- Fetching 3 files: 67%|██████▋ | 2/3 [00:08&lt;00:03, 3.83s/it]
4033
- Fetching 3 files: 100%|██████████| 3/3 [00:08&lt;00:00, 2.89s/it]
4034
 
4035
  Loading checkpoint shards: 0%| | 0/3 [00:00&lt;?, ?it/s]
4036
  Loading checkpoint shards: 33%|███▎ | 1/3 [00:02&lt;00:04, 2.35s/it]
@@ -4049,7 +4116,7 @@ Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00
4049
  <span onclick="toggleOutput('forward_and_backward_no_kernel')" style="cursor: pointer;">▼ output</span>
4050
  <span id="uv-indicator-forward_and_backward_no_kernel" onclick="toggleUvLogsFromHeader('forward_and_backward_no_kernel')" style="cursor: pointer;">▶ uv-logs</span>
4051
  </span> |
4052
- Cell: forward_and_backward_no_kernel | 98.96s | FAILED
4053
  | <button class="run-btn" onclick="runCell('forward_and_backward_no_kernel')">▶ run</button>
4054
  <button class="copy-btn" onclick="copyCell('forward_and_backward_no_kernel')">Copy</button>
4055
  <a href="cells/forward_and_backward_no_kernel.py" target="_blank" class="raw-btn">Raw</a>
@@ -4475,14 +4542,14 @@ What is Tensor Parallelism?
4475
  ## 1. Why Tensor Parallelism?
4476
 
4477
  - **Memory constraints**: Modern
4478
- Generation took 13.16 seconds
4479
  Post-generation memory: {&#x27;allocated_gb&#x27;: 9.398670336, &#x27;peak_gb&#x27;: 9.514059776, &#x27;reserved_gb&#x27;: 17.188257792}
4480
  Enabled gradient checkpointing
4481
  Post-forward memory: {&#x27;allocated_gb&#x27;: 9.487933952, &#x27;peak_gb&#x27;: 9.514059776, &#x27;reserved_gb&#x27;: 17.188257792}
4482
  Loss: 1.9761
4483
  Running backward pass...
4484
  Pre-backward memory: {&#x27;allocated_gb&#x27;: 9.405890048, &#x27;peak_gb&#x27;: 9.514059776, &#x27;reserved_gb&#x27;: 17.177772032}
4485
- OOM during forward/backward pass: CUDA out of memory. Tried to allocate 508.00 MiB. GPU 2 has a total capacity of 22.30 GiB of which 118.69 MiB is free. Process 68744 has 22.18 GiB memory in use. Of the allocated memory 21.52 GiB is allocated by PyTorch, and 357.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
4486
  Try reducing max_tokens or max_seq_len
4487
  </div>
4488
  <div class="uv-install-logs" id="uv-logs-forward_and_backward_no_kernel">
@@ -4492,37 +4559,37 @@ Downloading cpython-3.13.7-linux-x86_64-gnu (download) (32.0MiB)
4492
  Downloading cpython-3.13.7-linux-x86_64-gnu (download)
4493
  Updating https://github.com/huggingface/transformers.git (HEAD)
4494
  Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
4495
- Downloading networkx (1.9MiB)
 
4496
  Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
4497
  Downloading pygments (1.2MiB)
4498
- Downloading nvidia-cufile-cu12 (1.1MiB)
4499
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4500
- Downloading nvidia-cusolver-cu12 (255.1MiB)
4501
- Downloading nvidia-nccl-cu12 (307.4MiB)
4502
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4503
  Downloading jedi (1.5MiB)
4504
- Downloading sympy (6.0MiB)
4505
- Downloading nvidia-curand-cu12 (60.7MiB)
4506
  Downloading hf-xet (3.0MiB)
 
 
4507
  Downloading nvidia-cufft-cu12 (184.2MiB)
 
 
 
 
4508
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4509
- Downloading numpy (15.9MiB)
4510
- Downloading pillow (6.3MiB)
4511
  Downloading nvidia-cublas-cu12 (566.8MiB)
4512
- Downloading nvidia-cudnn-cu12 (674.0MiB)
4513
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
 
4514
  Downloading triton (148.4MiB)
 
 
 
 
4515
  Downloading kiwisolver (1.4MiB)
4516
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4517
- Downloading tokenizers (3.1MiB)
4518
- Downloading matplotlib (8.3MiB)
4519
- Downloading fonttools (4.7MiB)
4520
- Downloading torch (846.8MiB)
4521
  Downloading nvidia-cufile-cu12
4522
  Downloading kiwisolver
4523
  Downloading pygments
4524
- Downloading hf-xet
4525
  Downloading tokenizers
 
4526
  Downloading networkx
4527
  Downloading fonttools
4528
  Downloading pillow
@@ -4532,28 +4599,28 @@ Downloading torch (846.8MiB)
4532
  Downloading sympy
4533
  Built transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
4534
  Downloading nvidia-nvjitlink-cu12
4535
- Downloading jedi
4536
  Downloading nvidia-curand-cu12
 
4537
  Downloading nvidia-cuda-nvrtc-cu12
4538
  Downloading triton
4539
  Downloading nvidia-cufft-cu12
4540
  Downloading nvidia-cusolver-cu12
4541
- Downloading nvidia-cusparselt-cu12
4542
  Downloading nvidia-cusparse-cu12
 
4543
  Downloading nvidia-nccl-cu12
4544
  Downloading nvidia-cublas-cu12
4545
  Downloading nvidia-cudnn-cu12
4546
  Downloading torch
4547
- Installed 69 packages in 462ms
4548
  </div>
4549
  </div>
4550
  <div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00&lt;?, ?it/s]
4551
- Fetching 3 files: 33%|███▎ | 1/3 [00:06&lt;00:13, 6.93s/it]
4552
- Fetching 3 files: 67%|██████▋ | 2/3 [00:08&lt;00:03, 3.58s/it]
4553
- Fetching 3 files: 100%|██████████| 3/3 [00:08&lt;00:00, 2.72s/it]
4554
 
4555
  Loading checkpoint shards: 0%| | 0/3 [00:00&lt;?, ?it/s]
4556
- Loading checkpoint shards: 33%|███▎ | 1/3 [00:02&lt;00:04, 2.35s/it]
4557
  Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04&lt;00:02, 2.25s/it]
4558
  Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.80s/it]
4559
  Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.93s/it]
@@ -4562,14 +4629,14 @@ Traceback (most recent call last):
4562
  File &quot;/repo/moe_benchmarks/megablocks/.uvnote/cells/forward_and_backward_no_kernel.py&quot;, line 154, in &lt;module&gt;
4563
  loss.backward()
4564
  ~~~~~~~~~~~~~^^
4565
- File &quot;/tmp/uvnote-run-87d0yuj5/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/_tensor.py&quot;, line 647, in backward
4566
  torch.autograd.backward(
4567
  ~~~~~~~~~~~~~~~~~~~~~~~^
4568
  self, gradient, retain_graph, create_graph, inputs=inputs
4569
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
4570
  )
4571
  ^
4572
- File &quot;/tmp/uvnote-run-87d0yuj5/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/autograd/__init__.py&quot;, line 354, in backward
4573
  _engine_run_backward(
4574
  ~~~~~~~~~~~~~~~~~~~~^
4575
  tensors,
@@ -4579,19 +4646,19 @@ Traceback (most recent call last):
4579
  ^^^^^^^^^^^^^^^^^^^^^
4580
  )
4581
  ^
4582
- File &quot;/tmp/uvnote-run-87d0yuj5/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/autograd/graph.py&quot;, line 829, in _engine_run_backward
4583
  return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
4584
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
4585
  t_outputs, *args, **kwargs
4586
  ^^^^^^^^^^^^^^^^^^^^^^^^^^
4587
  ) # Calls into the C++ engine to run the backward pass
4588
  ^
4589
- File &quot;/tmp/uvnote-run-87d0yuj5/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/autograd/function.py&quot;, line 311, in apply
4590
  return user_fn(self, *args)
4591
- File &quot;/tmp/uvnote-run-87d0yuj5/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/utils/checkpoint.py&quot;, line 319, in backward
4592
  torch.autograd.backward(outputs_with_grad, args_with_grad)
4593
  ~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
4594
- File &quot;/tmp/uvnote-run-87d0yuj5/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/autograd/__init__.py&quot;, line 354, in backward
4595
  _engine_run_backward(
4596
  ~~~~~~~~~~~~~~~~~~~~^
4597
  tensors,
@@ -4601,14 +4668,14 @@ Traceback (most recent call last):
4601
  ^^^^^^^^^^^^^^^^^^^^^
4602
  )
4603
  ^
4604
- File &quot;/tmp/uvnote-run-87d0yuj5/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/autograd/graph.py&quot;, line 829, in _engine_run_backward
4605
  return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
4606
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
4607
  t_outputs, *args, **kwargs
4608
  ^^^^^^^^^^^^^^^^^^^^^^^^^^
4609
  ) # Calls into the C++ engine to run the backward pass
4610
  ^
4611
- torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 508.00 MiB. GPU 2 has a total capacity of 22.30 GiB of which 118.69 MiB is free. Process 68744 has 22.18 GiB memory in use. Of the allocated memory 21.52 GiB is allocated by PyTorch, and 357.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)</div>
4612
  </div>
4613
  </div>
4614
 
@@ -4616,6 +4683,384 @@ torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 508.00 MiB. GPU 2
4616
  <p>Next we can run with Megablocks kernels enabled.</p>
4617
  <h3>Forward</h3>
4618
  <p>First, we run a forward pass with Megablocks kernels.</p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4619
  <h2>Forward and Backward</h2>
4620
  <p>Next, we run a forward and backward pass with Megablocks kernels enabled. This should be more memory efficient and allow us to complete the backward pass without running out of memory.</p>
4621
  <div class="cell" id="cell-forward_and_backward">
@@ -4625,7 +5070,7 @@ torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 508.00 MiB. GPU 2
4625
  <span onclick="toggleOutput('forward_and_backward')" style="cursor: pointer;">▼ output</span>
4626
  <span id="uv-indicator-forward_and_backward" onclick="toggleUvLogsFromHeader('forward_and_backward')" style="cursor: pointer;">▶ uv-logs</span>
4627
  </span> |
4628
- Cell: forward_and_backward | 106.33s
4629
  | <button class="run-btn" onclick="runCell('forward_and_backward')">▶ run</button>
4630
  <button class="copy-btn" onclick="copyCell('forward_and_backward')">Copy</button>
4631
  <a href="cells/forward_and_backward.py" target="_blank" class="raw-btn">Raw</a>
@@ -5045,7 +5490,7 @@ Reasoning: low
5045
  What is Tensor Parallelism?
5046
 
5047
  &lt;|end|&gt;&lt;|start|&gt;assistant&lt;|channel|&gt;analysis&lt;|message|&gt;We need to explain what Tensor Parallelism is. It&#x27;s a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices. Provide details: how it works, benefits, challenges, typical frameworks, etc. Also mention difference from data parallelism, pipeline parallelism. Provide example: splitting a weight matrix across GPUs, each GPU holds a slice, compute partial results, then gather. Provide mention of communication overhead, scaling, etc. Also mention that it&#x27;s used in large models like GPT-3, Megatron-LM, DeepSpeed. Provide references. Also mention that it&#x27;s
5048
- Generation took 17.99 seconds
5049
  Post-generation memory: {&#x27;allocated_gb&#x27;: 9.398670336, &#x27;peak_gb&#x27;: 9.67278848, &#x27;reserved_gb&#x27;: 17.188257792}
5050
  Enabled gradient checkpointing
5051
  Post-forward memory: {&#x27;allocated_gb&#x27;: 9.487933952, &#x27;peak_gb&#x27;: 9.67278848, &#x27;reserved_gb&#x27;: 17.188257792}
@@ -5076,85 +5521,86 @@ Downloading cpython-3.13.7-linux-x86_64-gnu (download) (32.0MiB)
5076
  Downloading cpython-3.13.7-linux-x86_64-gnu (download)
5077
  Updating https://github.com/huggingface/transformers.git (HEAD)
5078
  Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
 
 
 
 
 
 
 
5079
  Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
 
 
 
5080
  Downloading networkx (1.9MiB)
5081
- Downloading pygments (1.2MiB)
5082
- Downloading jedi (1.5MiB)
5083
  Downloading nvidia-cufile-cu12 (1.1MiB)
5084
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
5085
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
5086
- Downloading nvidia-cusparse-cu12 (274.9MiB)
5087
- Downloading nvidia-cufft-cu12 (184.2MiB)
5088
- Downloading numpy (15.9MiB)
5089
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
5090
- Downloading hf-xet (3.0MiB)
5091
  Downloading pillow (6.3MiB)
5092
- Downloading nvidia-curand-cu12 (60.7MiB)
5093
- Downloading sympy (6.0MiB)
 
5094
  Downloading nvidia-cublas-cu12 (566.8MiB)
5095
- Downloading nvidia-cusolver-cu12 (255.1MiB)
5096
- Downloading nvidia-cudnn-cu12 (674.0MiB)
5097
  Downloading nvidia-nccl-cu12 (307.4MiB)
5098
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
5099
- Downloading matplotlib (8.3MiB)
5100
- Downloading tokenizers (3.1MiB)
5101
- Downloading torch (846.8MiB)
5102
  Downloading kiwisolver (1.4MiB)
5103
- Downloading fonttools (4.7MiB)
 
 
 
5104
  Downloading triton (148.4MiB)
5105
  Downloading nvidia-cufile-cu12
5106
  Downloading kiwisolver
5107
  Downloading pygments
5108
- Downloading networkx
5109
  Downloading hf-xet
5110
  Downloading tokenizers
5111
- Downloading jedi
5112
  Downloading fonttools
5113
- Downloading sympy
5114
  Downloading pillow
5115
  Downloading matplotlib
5116
  Downloading nvidia-cuda-cupti-cu12
5117
  Downloading numpy
5118
- Built transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
5119
  Downloading nvidia-nvjitlink-cu12
 
 
5120
  Downloading nvidia-curand-cu12
5121
  Downloading nvidia-cuda-nvrtc-cu12
5122
  Downloading triton
5123
  Downloading nvidia-cufft-cu12
5124
  Downloading nvidia-cusolver-cu12
5125
- Downloading nvidia-cusparselt-cu12
5126
  Downloading nvidia-cusparse-cu12
 
5127
  Downloading nvidia-nccl-cu12
5128
  Downloading nvidia-cublas-cu12
5129
  Downloading nvidia-cudnn-cu12
5130
  Downloading torch
5131
- Installed 69 packages in 468ms
5132
  </div>
5133
  </div>
5134
  <div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00&lt;?, ?it/s]
5135
- Fetching 3 files: 33%|███▎ | 1/3 [00:07&lt;00:15, 7.67s/it]
5136
- Fetching 3 files: 67%|██████▋ | 2/3 [00:09&lt;00:04, 4.14s/it]
5137
- Fetching 3 files: 100%|██████████| 3/3 [00:09&lt;00:00, 3.11s/it]
5138
  You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
5139
 
5140
  Loading checkpoint shards: 0%| | 0/3 [00:00&lt;?, ?it/s]
5141
- Loading checkpoint shards: 33%|███▎ | 1/3 [00:02&lt;00:04, 2.35s/it]
5142
- Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04&lt;00:02, 2.26s/it]
5143
  Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.80s/it]
5144
  Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.93s/it]
5145
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5146
 
5147
  Fetching 66 files: 0%| | 0/66 [00:00&lt;?, ?it/s]
5148
- Fetching 66 files: 2%|▏ | 1/66 [00:00&lt;00:13, 4.77it/s]
5149
- Fetching 66 files: 14%|█▎ | 9/66 [00:00&lt;00:01, 29.85it/s]
5150
- Fetching 66 files: 20%|█▉ | 13/66 [00:00&lt;00:01, 30.78it/s]
5151
- Fetching 66 files: 26%|██▌ | 17/66 [00:01&lt;00:03, 14.47it/s]
5152
- Fetching 66 files: 61%|██████ | 40/66 [00:01&lt;00:00, 47.25it/s]
5153
- Fetching 66 files: 74%|███████▍ | 49/66 [00:01&lt;00:00, 38.19it/s]
5154
- Fetching 66 files: 85%|████████▍ | 56/66 [00:01&lt;00:00, 32.80it/s]
5155
- Fetching 66 files: 100%|██████████| 66/66 [00:01&lt;00:00, 41.39it/s]
5156
- Fetching 66 files: 100%|██████████| 66/66 [00:01&lt;00:00, 34.85it/s]
5157
- /tmp/uvnote-run-hy08fbjx/home/.cache/uv/environments-v2/forward-and-backward-422cb4863433d14c/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
 
5158
  No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
5159
  warnings.warn(
5160
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
@@ -5181,7 +5627,7 @@ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks
5181
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5182
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5183
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5184
- /tmp/uvnote-run-hy08fbjx/home/.cache/uv/environments-v2/forward-and-backward-422cb4863433d14c/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
5185
  No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
5186
  warnings.warn(
5187
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
@@ -5208,7 +5654,7 @@ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks
5208
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5209
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5210
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5211
- /tmp/uvnote-run-hy08fbjx/home/.cache/uv/environments-v2/forward-and-backward-422cb4863433d14c/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
5212
  No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
5213
  warnings.warn(
5214
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
@@ -5236,7 +5682,7 @@ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks
5236
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5237
  `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
5238
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5239
- /tmp/uvnote-run-hy08fbjx/home/.cache/uv/environments-v2/forward-and-backward-422cb4863433d14c/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
5240
  No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
5241
  warnings.warn(
5242
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
 
3715
  </div>
3716
 
3717
  <div class="main-content">
3718
+ <div class="cell" id="cell-nv">
3719
+ <div class="cell-header">
3720
+ <span class="collapse-indicators">
3721
+ <span onclick="toggleCode('nv')" style="cursor: pointer;">▼ code</span>
3722
+ <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3723
+ <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3724
+ </span> |
3725
+ Cell: nv | 0.71s
3726
+ | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3727
+ <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3728
+ <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
3729
+ </div>
3730
+ <div id="code-nv" class="cell-code" data-lines="3">
3731
+ <div class="highlight-with-lines">
3732
+ <div class="line-numbers" id="lines-nv">
3733
+ <a class="line-number" data-cell="nv" data-line="1" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 1, true);">1</a>
3734
+ <a class="line-number" data-cell="nv" data-line="2" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 2, true);">2</a>
3735
+ <a class="line-number" data-cell="nv" data-line="3" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 3, true);">3</a>
3736
+ </div>
3737
+ <div class="code-wrap">
3738
+ <div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">subprocess</span>
3739
+
3740
+ <span class="nb">print</span><span class="p">(</span><span class="n">subprocess</span><span class="o">.</span><span class="n">run</span><span class="p">([</span><span class="s2">&quot;nvidia-smi&quot;</span><span class="p">],</span> <span class="n">capture_output</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">text</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">stdout</span><span class="p">)</span>
3741
+ </pre></div>
3742
+
3743
+ <div class="code-line-highlight" id="line-highlight-nv"></div>
3744
+ </div>
3745
+ </div>
3746
+ </div>
3747
+ <div id="output-nv" class="cell-output">
3748
+ <div class="cell-stdout">Wed Sep 24 20:58:22 2025
3749
+ +-----------------------------------------------------------------------------------------+
3750
+ | NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
3751
+ |-----------------------------------------+------------------------+----------------------+
3752
+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
3753
+ | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
3754
+ | | | MIG M. |
3755
+ |=========================================+========================+======================|
3756
+ | 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
3757
+ | 0% 32C P8 27W / 300W | 0MiB / 23028MiB | 0% Default |
3758
+ | | | N/A |
3759
+ +-----------------------------------------+------------------------+----------------------+
3760
+ | 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
3761
+ | 0% 32C P8 25W / 300W | 0MiB / 23028MiB | 0% Default |
3762
+ | | | N/A |
3763
+ +-----------------------------------------+------------------------+----------------------+
3764
+ | 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
3765
+ | 0% 32C P8 28W / 300W | 0MiB / 23028MiB | 0% Default |
3766
+ | | | N/A |
3767
+ +-----------------------------------------+------------------------+----------------------+
3768
+ | 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
3769
+ | 0% 32C P8 27W / 300W | 0MiB / 23028MiB | 0% Default |
3770
+ | | | N/A |
3771
+ +-----------------------------------------+------------------------+----------------------+
3772
+
3773
+ +-----------------------------------------------------------------------------------------+
3774
+ | Processes: |
3775
+ | GPU GI CI PID Type Process name GPU Memory |
3776
+ | ID ID Usage |
3777
+ |=========================================================================================|
3778
+ | No running processes found |
3779
+ +-----------------------------------------------------------------------------------------+
3780
+
3781
+ </div>
3782
+ </div>
3783
+ </div>
3784
+
3785
+ <h1>No Kernels</h1>
3786
  <p>First, we run the model without any custom kernels to get a reference point.</p>
3787
  <h2>Forward</h2>
3788
  <div class="cell" id="cell-no_kernels">
 
3792
  <span onclick="toggleOutput('no_kernels')" style="cursor: pointer;">▼ output</span>
3793
  <span id="uv-indicator-no_kernels" onclick="toggleUvLogsFromHeader('no_kernels')" style="cursor: pointer;">▶ uv-logs</span>
3794
  </span> |
3795
+ Cell: no_kernels | 107.24s
3796
  | <button class="run-btn" onclick="runCell('no_kernels')">▶ run</button>
3797
  <button class="copy-btn" onclick="copyCell('no_kernels')">Copy</button>
3798
  <a href="cells/no_kernels.py" target="_blank" class="raw-btn">Raw</a>
 
4039
  Downloading cpython-3.13.7-linux-x86_64-gnu (download)
4040
  Updating https://github.com/huggingface/transformers.git (HEAD)
4041
  Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
4042
+ Downloading jedi (1.5MiB)
4043
  Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
4044
+ Downloading nvidia-cublas-cu12 (566.8MiB)
4045
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
 
 
4046
  Downloading hf-xet (3.0MiB)
4047
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
4048
+ Downloading sympy (6.0MiB)
 
 
 
 
 
4049
  Downloading pillow (6.3MiB)
 
4050
  Downloading nvidia-cufile-cu12 (1.1MiB)
 
 
 
4051
  Downloading nvidia-nccl-cu12 (307.4MiB)
4052
+ Downloading numpy (15.9MiB)
4053
+ Downloading fonttools (4.7MiB)
4054
+ Downloading networkx (1.9MiB)
4055
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4056
+ Downloading triton (148.4MiB)
4057
  Downloading nvidia-cufft-cu12 (184.2MiB)
4058
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
4059
+ Downloading nvidia-curand-cu12 (60.7MiB)
4060
+ Downloading tokenizers (3.1MiB)
4061
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4062
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4063
+ Downloading pygments (1.2MiB)
4064
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4065
  Downloading matplotlib (8.3MiB)
4066
+ Downloading kiwisolver (1.4MiB)
4067
+ Downloading torch (846.8MiB)
4068
  Downloading nvidia-cufile-cu12
4069
  Downloading kiwisolver
4070
  Downloading pygments
 
4071
  Downloading hf-xet
4072
+ Downloading tokenizers
4073
  Downloading networkx
4074
  Downloading fonttools
4075
  Downloading pillow
 
4079
  Downloading sympy
4080
  Built transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
4081
  Downloading nvidia-nvjitlink-cu12
 
4082
  Downloading nvidia-curand-cu12
4083
+ Downloading jedi
4084
  Downloading nvidia-cuda-nvrtc-cu12
4085
  Downloading triton
4086
  Downloading nvidia-cufft-cu12
 
4091
  Downloading nvidia-cublas-cu12
4092
  Downloading nvidia-cudnn-cu12
4093
  Downloading torch
4094
+ Installed 69 packages in 565ms
4095
  </div>
4096
  </div>
4097
  <div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00&lt;?, ?it/s]
4098
+ Fetching 3 files: 33%|███▎ | 1/3 [00:07&lt;00:15, 7.69s/it]
4099
+ Fetching 3 files: 67%|██████▋ | 2/3 [00:09&lt;00:03, 3.95s/it]
4100
+ Fetching 3 files: 100%|██████████| 3/3 [00:09&lt;00:00, 3.00s/it]
4101
 
4102
  Loading checkpoint shards: 0%| | 0/3 [00:00&lt;?, ?it/s]
4103
  Loading checkpoint shards: 33%|███▎ | 1/3 [00:02&lt;00:04, 2.35s/it]
 
4116
  <span onclick="toggleOutput('forward_and_backward_no_kernel')" style="cursor: pointer;">▼ output</span>
4117
  <span id="uv-indicator-forward_and_backward_no_kernel" onclick="toggleUvLogsFromHeader('forward_and_backward_no_kernel')" style="cursor: pointer;">▶ uv-logs</span>
4118
  </span> |
4119
+ Cell: forward_and_backward_no_kernel | 99.86s | FAILED
4120
  | <button class="run-btn" onclick="runCell('forward_and_backward_no_kernel')">▶ run</button>
4121
  <button class="copy-btn" onclick="copyCell('forward_and_backward_no_kernel')">Copy</button>
4122
  <a href="cells/forward_and_backward_no_kernel.py" target="_blank" class="raw-btn">Raw</a>
 
4542
  ## 1. Why Tensor Parallelism?
4543
 
4544
  - **Memory constraints**: Modern
4545
+ Generation took 13.15 seconds
4546
  Post-generation memory: {&#x27;allocated_gb&#x27;: 9.398670336, &#x27;peak_gb&#x27;: 9.514059776, &#x27;reserved_gb&#x27;: 17.188257792}
4547
  Enabled gradient checkpointing
4548
  Post-forward memory: {&#x27;allocated_gb&#x27;: 9.487933952, &#x27;peak_gb&#x27;: 9.514059776, &#x27;reserved_gb&#x27;: 17.188257792}
4549
  Loss: 1.9761
4550
  Running backward pass...
4551
  Pre-backward memory: {&#x27;allocated_gb&#x27;: 9.405890048, &#x27;peak_gb&#x27;: 9.514059776, &#x27;reserved_gb&#x27;: 17.177772032}
4552
+ OOM during forward/backward pass: CUDA out of memory. Tried to allocate 508.00 MiB. GPU 2 has a total capacity of 22.30 GiB of which 118.69 MiB is free. Process 34932 has 22.18 GiB memory in use. Of the allocated memory 21.52 GiB is allocated by PyTorch, and 357.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
4553
  Try reducing max_tokens or max_seq_len
4554
  </div>
4555
  <div class="uv-install-logs" id="uv-logs-forward_and_backward_no_kernel">
 
4559
  Downloading cpython-3.13.7-linux-x86_64-gnu (download)
4560
  Updating https://github.com/huggingface/transformers.git (HEAD)
4561
  Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
4562
+ Downloading numpy (15.9MiB)
4563
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4564
  Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
4565
  Downloading pygments (1.2MiB)
 
 
 
 
 
4566
  Downloading jedi (1.5MiB)
4567
+ Downloading tokenizers (3.1MiB)
 
4568
  Downloading hf-xet (3.0MiB)
4569
+ Downloading sympy (6.0MiB)
4570
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4571
  Downloading nvidia-cufft-cu12 (184.2MiB)
4572
+ Downloading fonttools (4.7MiB)
4573
+ Downloading matplotlib (8.3MiB)
4574
+ Downloading networkx (1.9MiB)
4575
+ Downloading nvidia-cufile-cu12 (1.1MiB)
4576
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 
 
4577
  Downloading nvidia-cublas-cu12 (566.8MiB)
4578
+ Downloading torch (846.8MiB)
4579
+ Downloading nvidia-nccl-cu12 (307.4MiB)
4580
+ Downloading nvidia-curand-cu12 (60.7MiB)
4581
  Downloading triton (148.4MiB)
4582
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
4583
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4584
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
4585
+ Downloading pillow (6.3MiB)
4586
  Downloading kiwisolver (1.4MiB)
4587
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
 
 
 
 
4588
  Downloading nvidia-cufile-cu12
4589
  Downloading kiwisolver
4590
  Downloading pygments
 
4591
  Downloading tokenizers
4592
+ Downloading hf-xet
4593
  Downloading networkx
4594
  Downloading fonttools
4595
  Downloading pillow
 
4599
  Downloading sympy
4600
  Built transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
4601
  Downloading nvidia-nvjitlink-cu12
 
4602
  Downloading nvidia-curand-cu12
4603
+ Downloading jedi
4604
  Downloading nvidia-cuda-nvrtc-cu12
4605
  Downloading triton
4606
  Downloading nvidia-cufft-cu12
4607
  Downloading nvidia-cusolver-cu12
 
4608
  Downloading nvidia-cusparse-cu12
4609
+ Downloading nvidia-cusparselt-cu12
4610
  Downloading nvidia-nccl-cu12
4611
  Downloading nvidia-cublas-cu12
4612
  Downloading nvidia-cudnn-cu12
4613
  Downloading torch
4614
+ Installed 69 packages in 592ms
4615
  </div>
4616
  </div>
4617
  <div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00&lt;?, ?it/s]
4618
+ Fetching 3 files: 33%|███▎ | 1/3 [00:07&lt;00:14, 7.40s/it]
4619
+ Fetching 3 files: 67%|██████▋ | 2/3 [00:08&lt;00:03, 3.77s/it]
4620
+ Fetching 3 files: 100%|██████████| 3/3 [00:08&lt;00:00, 2.88s/it]
4621
 
4622
  Loading checkpoint shards: 0%| | 0/3 [00:00&lt;?, ?it/s]
4623
+ Loading checkpoint shards: 33%|███▎ | 1/3 [00:02&lt;00:04, 2.34s/it]
4624
  Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04&lt;00:02, 2.25s/it]
4625
  Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.80s/it]
4626
  Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.93s/it]
 
4629
  File &quot;/repo/moe_benchmarks/megablocks/.uvnote/cells/forward_and_backward_no_kernel.py&quot;, line 154, in &lt;module&gt;
4630
  loss.backward()
4631
  ~~~~~~~~~~~~~^^
4632
+ File &quot;/tmp/uvnote-run-vo30j_xa/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/_tensor.py&quot;, line 647, in backward
4633
  torch.autograd.backward(
4634
  ~~~~~~~~~~~~~~~~~~~~~~~^
4635
  self, gradient, retain_graph, create_graph, inputs=inputs
4636
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
4637
  )
4638
  ^
4639
+ File &quot;/tmp/uvnote-run-vo30j_xa/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/autograd/__init__.py&quot;, line 354, in backward
4640
  _engine_run_backward(
4641
  ~~~~~~~~~~~~~~~~~~~~^
4642
  tensors,
 
4646
  ^^^^^^^^^^^^^^^^^^^^^
4647
  )
4648
  ^
4649
+ File &quot;/tmp/uvnote-run-vo30j_xa/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/autograd/graph.py&quot;, line 829, in _engine_run_backward
4650
  return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
4651
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
4652
  t_outputs, *args, **kwargs
4653
  ^^^^^^^^^^^^^^^^^^^^^^^^^^
4654
  ) # Calls into the C++ engine to run the backward pass
4655
  ^
4656
+ File &quot;/tmp/uvnote-run-vo30j_xa/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/autograd/function.py&quot;, line 311, in apply
4657
  return user_fn(self, *args)
4658
+ File &quot;/tmp/uvnote-run-vo30j_xa/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/utils/checkpoint.py&quot;, line 319, in backward
4659
  torch.autograd.backward(outputs_with_grad, args_with_grad)
4660
  ~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
4661
+ File &quot;/tmp/uvnote-run-vo30j_xa/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/autograd/__init__.py&quot;, line 354, in backward
4662
  _engine_run_backward(
4663
  ~~~~~~~~~~~~~~~~~~~~^
4664
  tensors,
 
4668
  ^^^^^^^^^^^^^^^^^^^^^
4669
  )
4670
  ^
4671
+ File &quot;/tmp/uvnote-run-vo30j_xa/home/.cache/uv/environments-v2/forward-and-backward-no-kernel-349948fac2e1b63b/lib/python3.13/site-packages/torch/autograd/graph.py&quot;, line 829, in _engine_run_backward
4672
  return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
4673
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
4674
  t_outputs, *args, **kwargs
4675
  ^^^^^^^^^^^^^^^^^^^^^^^^^^
4676
  ) # Calls into the C++ engine to run the backward pass
4677
  ^
4678
+ torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 508.00 MiB. GPU 2 has a total capacity of 22.30 GiB of which 118.69 MiB is free. Process 34932 has 22.18 GiB memory in use. Of the allocated memory 21.52 GiB is allocated by PyTorch, and 357.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)</div>
4679
  </div>
4680
  </div>
4681
 
 
4683
  <p>Next we can run with Megablocks kernels enabled.</p>
4684
  <h3>Forward</h3>
4685
  <p>First, we run a forward pass with Megablocks kernels.</p>
4686
+ <div class="cell" id="cell-forward_only">
4687
+ <div class="cell-header">
4688
+ <span class="collapse-indicators">
4689
+ <span onclick="toggleCode('forward_only')" style="cursor: pointer;">▼ code</span>
4690
+ <span onclick="toggleOutput('forward_only')" style="cursor: pointer;">▼ output</span>
4691
+ <span id="uv-indicator-forward_only" onclick="toggleUvLogsFromHeader('forward_only')" style="cursor: pointer;">▶ uv-logs</span>
4692
+ </span> |
4693
+ Cell: forward_only | 114.71s
4694
+ | <button class="run-btn" onclick="runCell('forward_only')">▶ run</button>
4695
+ <button class="copy-btn" onclick="copyCell('forward_only')">Copy</button>
4696
+ <a href="cells/forward_only.py" target="_blank" class="raw-btn">Raw</a>
4697
+ </div>
4698
+ <div id="code-forward_only" class="cell-code" data-lines="101">
4699
+ <div class="highlight-with-lines">
4700
+ <div class="line-numbers" id="lines-forward_only">
4701
+ <a class="line-number" data-cell="forward_only" data-line="1" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 1, true);">1</a>
4702
+ <a class="line-number" data-cell="forward_only" data-line="2" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 2, true);">2</a>
4703
+ <a class="line-number" data-cell="forward_only" data-line="3" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 3, true);">3</a>
4704
+ <a class="line-number" data-cell="forward_only" data-line="4" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 4, true);">4</a>
4705
+ <a class="line-number" data-cell="forward_only" data-line="5" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 5, true);">5</a>
4706
+ <a class="line-number" data-cell="forward_only" data-line="6" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 6, true);">6</a>
4707
+ <a class="line-number" data-cell="forward_only" data-line="7" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 7, true);">7</a>
4708
+ <a class="line-number" data-cell="forward_only" data-line="8" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 8, true);">8</a>
4709
+ <a class="line-number" data-cell="forward_only" data-line="9" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 9, true);">9</a>
4710
+ <a class="line-number" data-cell="forward_only" data-line="10" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 10, true);">10</a>
4711
+ <a class="line-number" data-cell="forward_only" data-line="11" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 11, true);">11</a>
4712
+ <a class="line-number" data-cell="forward_only" data-line="12" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 12, true);">12</a>
4713
+ <a class="line-number" data-cell="forward_only" data-line="13" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 13, true);">13</a>
4714
+ <a class="line-number" data-cell="forward_only" data-line="14" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 14, true);">14</a>
4715
+ <a class="line-number" data-cell="forward_only" data-line="15" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 15, true);">15</a>
4716
+ <a class="line-number" data-cell="forward_only" data-line="16" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 16, true);">16</a>
4717
+ <a class="line-number" data-cell="forward_only" data-line="17" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 17, true);">17</a>
4718
+ <a class="line-number" data-cell="forward_only" data-line="18" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 18, true);">18</a>
4719
+ <a class="line-number" data-cell="forward_only" data-line="19" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 19, true);">19</a>
4720
+ <a class="line-number" data-cell="forward_only" data-line="20" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 20, true);">20</a>
4721
+ <a class="line-number" data-cell="forward_only" data-line="21" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 21, true);">21</a>
4722
+ <a class="line-number" data-cell="forward_only" data-line="22" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 22, true);">22</a>
4723
+ <a class="line-number" data-cell="forward_only" data-line="23" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 23, true);">23</a>
4724
+ <a class="line-number" data-cell="forward_only" data-line="24" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 24, true);">24</a>
4725
+ <a class="line-number" data-cell="forward_only" data-line="25" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 25, true);">25</a>
4726
+ <a class="line-number" data-cell="forward_only" data-line="26" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 26, true);">26</a>
4727
+ <a class="line-number" data-cell="forward_only" data-line="27" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 27, true);">27</a>
4728
+ <a class="line-number" data-cell="forward_only" data-line="28" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 28, true);">28</a>
4729
+ <a class="line-number" data-cell="forward_only" data-line="29" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 29, true);">29</a>
4730
+ <a class="line-number" data-cell="forward_only" data-line="30" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 30, true);">30</a>
4731
+ <a class="line-number" data-cell="forward_only" data-line="31" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 31, true);">31</a>
4732
+ <a class="line-number" data-cell="forward_only" data-line="32" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 32, true);">32</a>
4733
+ <a class="line-number" data-cell="forward_only" data-line="33" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 33, true);">33</a>
4734
+ <a class="line-number" data-cell="forward_only" data-line="34" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 34, true);">34</a>
4735
+ <a class="line-number" data-cell="forward_only" data-line="35" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 35, true);">35</a>
4736
+ <a class="line-number" data-cell="forward_only" data-line="36" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 36, true);">36</a>
4737
+ <a class="line-number" data-cell="forward_only" data-line="37" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 37, true);">37</a>
4738
+ <a class="line-number" data-cell="forward_only" data-line="38" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 38, true);">38</a>
4739
+ <a class="line-number" data-cell="forward_only" data-line="39" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 39, true);">39</a>
4740
+ <a class="line-number" data-cell="forward_only" data-line="40" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 40, true);">40</a>
4741
+ <a class="line-number" data-cell="forward_only" data-line="41" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 41, true);">41</a>
4742
+ <a class="line-number" data-cell="forward_only" data-line="42" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 42, true);">42</a>
4743
+ <a class="line-number" data-cell="forward_only" data-line="43" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 43, true);">43</a>
4744
+ <a class="line-number" data-cell="forward_only" data-line="44" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 44, true);">44</a>
4745
+ <a class="line-number" data-cell="forward_only" data-line="45" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 45, true);">45</a>
4746
+ <a class="line-number" data-cell="forward_only" data-line="46" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 46, true);">46</a>
4747
+ <a class="line-number" data-cell="forward_only" data-line="47" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 47, true);">47</a>
4748
+ <a class="line-number" data-cell="forward_only" data-line="48" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 48, true);">48</a>
4749
+ <a class="line-number" data-cell="forward_only" data-line="49" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 49, true);">49</a>
4750
+ <a class="line-number" data-cell="forward_only" data-line="50" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 50, true);">50</a>
4751
+ <a class="line-number" data-cell="forward_only" data-line="51" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 51, true);">51</a>
4752
+ <a class="line-number" data-cell="forward_only" data-line="52" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 52, true);">52</a>
4753
+ <a class="line-number" data-cell="forward_only" data-line="53" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 53, true);">53</a>
4754
+ <a class="line-number" data-cell="forward_only" data-line="54" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 54, true);">54</a>
4755
+ <a class="line-number" data-cell="forward_only" data-line="55" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 55, true);">55</a>
4756
+ <a class="line-number" data-cell="forward_only" data-line="56" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 56, true);">56</a>
4757
+ <a class="line-number" data-cell="forward_only" data-line="57" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 57, true);">57</a>
4758
+ <a class="line-number" data-cell="forward_only" data-line="58" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 58, true);">58</a>
4759
+ <a class="line-number" data-cell="forward_only" data-line="59" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 59, true);">59</a>
4760
+ <a class="line-number" data-cell="forward_only" data-line="60" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 60, true);">60</a>
4761
+ <a class="line-number" data-cell="forward_only" data-line="61" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 61, true);">61</a>
4762
+ <a class="line-number" data-cell="forward_only" data-line="62" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 62, true);">62</a>
4763
+ <a class="line-number" data-cell="forward_only" data-line="63" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 63, true);">63</a>
4764
+ <a class="line-number" data-cell="forward_only" data-line="64" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 64, true);">64</a>
4765
+ <a class="line-number" data-cell="forward_only" data-line="65" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 65, true);">65</a>
4766
+ <a class="line-number" data-cell="forward_only" data-line="66" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 66, true);">66</a>
4767
+ <a class="line-number" data-cell="forward_only" data-line="67" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 67, true);">67</a>
4768
+ <a class="line-number" data-cell="forward_only" data-line="68" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 68, true);">68</a>
4769
+ <a class="line-number" data-cell="forward_only" data-line="69" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 69, true);">69</a>
4770
+ <a class="line-number" data-cell="forward_only" data-line="70" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 70, true);">70</a>
4771
+ <a class="line-number" data-cell="forward_only" data-line="71" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 71, true);">71</a>
4772
+ <a class="line-number" data-cell="forward_only" data-line="72" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 72, true);">72</a>
4773
+ <a class="line-number" data-cell="forward_only" data-line="73" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 73, true);">73</a>
4774
+ <a class="line-number" data-cell="forward_only" data-line="74" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 74, true);">74</a>
4775
+ <a class="line-number" data-cell="forward_only" data-line="75" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 75, true);">75</a>
4776
+ <a class="line-number" data-cell="forward_only" data-line="76" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 76, true);">76</a>
4777
+ <a class="line-number" data-cell="forward_only" data-line="77" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 77, true);">77</a>
4778
+ <a class="line-number" data-cell="forward_only" data-line="78" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 78, true);">78</a>
4779
+ <a class="line-number" data-cell="forward_only" data-line="79" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 79, true);">79</a>
4780
+ <a class="line-number" data-cell="forward_only" data-line="80" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 80, true);">80</a>
4781
+ <a class="line-number" data-cell="forward_only" data-line="81" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 81, true);">81</a>
4782
+ <a class="line-number" data-cell="forward_only" data-line="82" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 82, true);">82</a>
4783
+ <a class="line-number" data-cell="forward_only" data-line="83" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 83, true);">83</a>
4784
+ <a class="line-number" data-cell="forward_only" data-line="84" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 84, true);">84</a>
4785
+ <a class="line-number" data-cell="forward_only" data-line="85" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 85, true);">85</a>
4786
+ <a class="line-number" data-cell="forward_only" data-line="86" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 86, true);">86</a>
4787
+ <a class="line-number" data-cell="forward_only" data-line="87" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 87, true);">87</a>
4788
+ <a class="line-number" data-cell="forward_only" data-line="88" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 88, true);">88</a>
4789
+ <a class="line-number" data-cell="forward_only" data-line="89" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 89, true);">89</a>
4790
+ <a class="line-number" data-cell="forward_only" data-line="90" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 90, true);">90</a>
4791
+ <a class="line-number" data-cell="forward_only" data-line="91" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 91, true);">91</a>
4792
+ <a class="line-number" data-cell="forward_only" data-line="92" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 92, true);">92</a>
4793
+ <a class="line-number" data-cell="forward_only" data-line="93" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 93, true);">93</a>
4794
+ <a class="line-number" data-cell="forward_only" data-line="94" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 94, true);">94</a>
4795
+ <a class="line-number" data-cell="forward_only" data-line="95" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 95, true);">95</a>
4796
+ <a class="line-number" data-cell="forward_only" data-line="96" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 96, true);">96</a>
4797
+ <a class="line-number" data-cell="forward_only" data-line="97" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 97, true);">97</a>
4798
+ <a class="line-number" data-cell="forward_only" data-line="98" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 98, true);">98</a>
4799
+ <a class="line-number" data-cell="forward_only" data-line="99" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 99, true);">99</a>
4800
+ <a class="line-number" data-cell="forward_only" data-line="100" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 100, true);">100</a>
4801
+ <a class="line-number" data-cell="forward_only" data-line="101" href="#cell-forward_only" onclick="event.preventDefault(); selectCellLine('forward_only', 101, true);">101</a>
4802
+ </div>
4803
+ <div class="code-wrap">
4804
+ <div class="highlight"><pre><span></span><span class="c1"># /// script</span>
4805
+ <span class="c1"># requires-python = &quot;&gt;=3.12&quot;</span>
4806
+ <span class="c1"># dependencies = [</span>
4807
+ <span class="c1"># &quot;accelerate&gt;=1.10.1&quot;,</span>
4808
+ <span class="c1"># &quot;torch&gt;=2.7.0&quot;,</span>
4809
+ <span class="c1"># &quot;kernels==0.10.0&quot;,</span>
4810
+ <span class="c1"># &quot;transformers@https://github.com/huggingface/transformers.git&quot;,</span>
4811
+ <span class="c1"># &quot;ipdb&gt;=0.13.13&quot;,</span>
4812
+ <span class="c1"># &quot;matplotlib&gt;=3.7.2&quot;,</span>
4813
+ <span class="c1"># &quot;numpy&gt;=1.24.3&quot;,</span>
4814
+ <span class="c1"># ]</span>
4815
+ <span class="c1"># ///</span>
4816
+
4817
+ <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
4818
+ <span class="kn">from</span><span class="w"> </span><span class="nn">transformers</span><span class="w"> </span><span class="kn">import</span> <span class="n">GptOssForCausalLM</span><span class="p">,</span> <span class="n">PreTrainedTokenizerFast</span><span class="p">,</span> <span class="n">Mxfp4Config</span>
4819
+ <span class="kn">import</span><span class="w"> </span><span class="nn">time</span>
4820
+ <span class="kn">import</span><span class="w"> </span><span class="nn">torch.nn</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">nn</span>
4821
+ <span class="kn">from</span><span class="w"> </span><span class="nn">kernels</span><span class="w"> </span><span class="kn">import</span> <span class="n">register_kernel_mapping</span><span class="p">,</span> <span class="n">Mode</span><span class="p">,</span> <span class="n">LayerRepository</span><span class="p">,</span> <span class="n">replace_kernel_forward_from_hub</span>
4822
+ <span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
4823
+ <span class="kn">import</span><span class="w"> </span><span class="nn">torch.profiler</span>
4824
+ <span class="kn">import</span><span class="w"> </span><span class="nn">gc</span>
4825
+ <span class="kn">import</span><span class="w"> </span><span class="nn">logging</span>
4826
+ <span class="kn">from</span><span class="w"> </span><span class="nn">transformers.models.gpt_oss.modeling_gpt_oss</span><span class="w"> </span><span class="kn">import</span> <span class="n">GptOssRMSNorm</span>
4827
+
4828
+
4829
+ <span class="n">replace_kernel_forward_from_hub</span><span class="p">(</span><span class="n">GptOssRMSNorm</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
4830
+
4831
+ <span class="c1"># set to debug logging</span>
4832
+ <span class="n">logging</span><span class="o">.</span><span class="n">basicConfig</span><span class="p">(</span><span class="n">level</span><span class="o">=</span><span class="n">logging</span><span class="o">.</span><span class="n">INFO</span><span class="p">)</span>
4833
+
4834
+ <span class="k">def</span><span class="w"> </span><span class="nf">reset_peak_memory_stats</span><span class="p">():</span>
4835
+ <span class="w"> </span><span class="sd">&quot;&quot;&quot;Clear CUDA cache and reset memory allocation counters.&quot;&quot;&quot;</span>
4836
+ <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">empty_cache</span><span class="p">()</span>
4837
+ <span class="k">if</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">is_available</span><span class="p">():</span>
4838
+ <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">reset_peak_memory_stats</span><span class="p">()</span>
4839
+ <span class="n">gc</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
4840
+
4841
+ <span class="k">def</span><span class="w"> </span><span class="nf">get_memory_stats</span><span class="p">():</span>
4842
+ <span class="w"> </span><span class="sd">&quot;&quot;&quot;Get current and peak CUDA memory usage.&quot;&quot;&quot;</span>
4843
+ <span class="k">if</span> <span class="ow">not</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">is_available</span><span class="p">():</span>
4844
+ <span class="k">return</span> <span class="p">{</span><span class="s2">&quot;allocated_gb&quot;</span><span class="p">:</span> <span class="mi">0</span><span class="p">,</span> <span class="s2">&quot;peak_gb&quot;</span><span class="p">:</span> <span class="mi">0</span><span class="p">,</span> <span class="s2">&quot;reserved_gb&quot;</span><span class="p">:</span> <span class="mi">0</span><span class="p">}</span>
4845
+ <span class="k">return</span> <span class="p">{</span>
4846
+ <span class="s2">&quot;allocated_gb&quot;</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">memory_allocated</span><span class="p">()</span> <span class="o">/</span> <span class="mf">1e9</span><span class="p">,</span>
4847
+ <span class="s2">&quot;peak_gb&quot;</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">max_memory_allocated</span><span class="p">()</span> <span class="o">/</span> <span class="mf">1e9</span><span class="p">,</span>
4848
+ <span class="s2">&quot;reserved_gb&quot;</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">memory_reserved</span><span class="p">()</span> <span class="o">/</span> <span class="mf">1e9</span><span class="p">,</span>
4849
+ <span class="p">}</span>
4850
+
4851
+ <span class="k">def</span><span class="w"> </span><span class="nf">override_kernel_layer_name</span><span class="p">(</span><span class="n">cls_name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
4852
+ <span class="w"> </span><span class="sd">&quot;&quot;&quot;Helper to dynamically override the kernel_layer_name in a model class.&quot;&quot;&quot;</span>
4853
+ <span class="k">for</span> <span class="n">mod</span> <span class="ow">in</span> <span class="n">sys</span><span class="o">.</span><span class="n">modules</span><span class="o">.</span><span class="n">values</span><span class="p">():</span>
4854
+ <span class="k">if</span> <span class="n">mod</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
4855
+ <span class="k">continue</span>
4856
+ <span class="n">obj</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">mod</span><span class="p">,</span> <span class="n">cls_name</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
4857
+ <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="nb">type</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">issubclass</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span>
4858
+ <span class="nb">setattr</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="s2">&quot;kernel_layer_name&quot;</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span>
4859
+ <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Overrode </span><span class="si">{</span><span class="n">cls_name</span><span class="si">}</span><span class="s2">.kernel_layer_name to </span><span class="si">{</span><span class="n">value</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
4860
+ <span class="k">return</span> <span class="kc">True</span>
4861
+ <span class="k">return</span> <span class="kc">False</span>
4862
+
4863
+
4864
+ <span class="c1"># Init the model the normal way</span>
4865
+ <span class="n">model_id</span> <span class="o">=</span> <span class="s2">&quot;openai/gpt-oss-20b&quot;</span>
4866
+ <span class="n">tokenizer</span> <span class="o">=</span> <span class="n">PreTrainedTokenizerFast</span><span class="o">.</span><span class="n">from_pretrained</span><span class="p">(</span><span class="n">model_id</span><span class="p">)</span>
4867
+ <span class="n">quantization_config</span> <span class="o">=</span> <span class="n">Mxfp4Config</span><span class="p">(</span><span class="n">dequantize</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
4868
+
4869
+
4870
+
4871
+ <span class="n">model</span> <span class="o">=</span> <span class="n">GptOssForCausalLM</span><span class="o">.</span><span class="n">from_pretrained</span><span class="p">(</span>
4872
+ <span class="n">model_id</span><span class="p">,</span>
4873
+ <span class="n">dtype</span><span class="o">=</span><span class="s2">&quot;bfloat16&quot;</span><span class="p">,</span>
4874
+ <span class="n">device_map</span><span class="o">=</span><span class="s2">&quot;auto&quot;</span><span class="p">,</span>
4875
+ <span class="n">use_kernels</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
4876
+ <span class="n">quantization_config</span><span class="o">=</span><span class="n">quantization_config</span><span class="p">,</span>
4877
+ <span class="p">)</span><span class="o">.</span><span class="n">eval</span><span class="p">()</span>
4878
+
4879
+ <span class="n">messages</span> <span class="o">=</span> <span class="p">[</span>
4880
+ <span class="p">{</span><span class="s2">&quot;role&quot;</span><span class="p">:</span> <span class="s2">&quot;system&quot;</span><span class="p">,</span> <span class="s2">&quot;content&quot;</span><span class="p">:</span> <span class="s2">&quot;What is Tensor Parallelism?&quot;</span><span class="p">},</span>
4881
+ <span class="p">]</span>
4882
+
4883
+ <span class="n">inputs</span> <span class="o">=</span> <span class="n">tokenizer</span><span class="o">.</span><span class="n">apply_chat_template</span><span class="p">(</span>
4884
+ <span class="n">messages</span><span class="p">,</span>
4885
+ <span class="n">add_generation_prompt</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
4886
+ <span class="n">return_tensors</span><span class="o">=</span><span class="s2">&quot;pt&quot;</span><span class="p">,</span>
4887
+ <span class="n">return_dict</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
4888
+ <span class="n">reasoning_effort</span><span class="o">=</span><span class="s2">&quot;low&quot;</span><span class="p">,</span>
4889
+ <span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="s2">&quot;cuda&quot;</span><span class="p">)</span>
4890
+
4891
+ <span class="n">max_tokens</span> <span class="o">=</span> <span class="mi">256</span>
4892
+
4893
+ <span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">inference_mode</span><span class="p">():</span>
4894
+ <span class="n">start_time</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">perf_counter</span><span class="p">()</span>
4895
+ <span class="n">generated</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">generate</span><span class="p">(</span>
4896
+ <span class="o">**</span><span class="n">inputs</span><span class="p">,</span>
4897
+ <span class="n">max_new_tokens</span><span class="o">=</span><span class="n">max_tokens</span><span class="p">,</span>
4898
+ <span class="n">do_sample</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
4899
+ <span class="n">temperature</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
4900
+ <span class="p">)</span>
4901
+ <span class="n">end_time</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">perf_counter</span><span class="p">()</span>
4902
+
4903
+ <span class="nb">print</span><span class="p">(</span><span class="n">tokenizer</span><span class="o">.</span><span class="n">decode</span><span class="p">(</span><span class="n">generated</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">skip_special_tokens</span><span class="o">=</span><span class="kc">False</span><span class="p">))</span>
4904
+ <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Generation took </span><span class="si">{</span><span class="n">end_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">start_time</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2"> seconds&quot;</span><span class="p">)</span>
4905
+ </pre></div>
4906
+
4907
+ <div class="code-line-highlight" id="line-highlight-forward_only"></div>
4908
+ </div>
4909
+ </div>
4910
+ </div>
4911
+ <div id="output-forward_only" class="cell-output">
4912
+ <div class="cell-stdout">&lt;|start|&gt;system&lt;|message|&gt;You are ChatGPT, a large language model trained by OpenAI.
4913
+ Knowledge cutoff: 2024-06
4914
+ Current date: 2025-09-24
4915
+
4916
+ Reasoning: low
4917
+
4918
+ # Valid channels: analysis, commentary, final. Channel must be included for every message.&lt;|end|&gt;&lt;|start|&gt;developer&lt;|message|&gt;# Instructions
4919
+
4920
+ What is Tensor Parallelism?
4921
+
4922
+ &lt;|end|&gt;&lt;|start|&gt;assistant&lt;|channel|&gt;analysis&lt;|message|&gt;We need to explain what Tensor Parallelism is. It&#x27;s a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices. Provide details: how it works, benefits, challenges, typical frameworks, etc. Also mention difference from data parallelism, pipeline parallelism. Provide example: splitting a weight matrix across GPUs, each GPU holds a slice, compute partial results, then gather. Provide mention of communication overhead, scaling, etc. Also mention that it&#x27;s used in large models like GPT-3, Megatron-LM, DeepSpeed. Provide references. Also mention that it&#x27;s also called model parallelism. Provide explanation of how it works in practice: e.g., for a linear layer, weight matrix W of shape (out_features, in_features). In tensor parallelism, split W along out_features dimension across GPUs. Each GPU computes partial output. Then gather outputs. Provide details on how to handle bias, etc. Provide mention of &quot;tensor model parallelism&quot; vs &quot;tensor parallelism&quot; synonyms. Provide mention of &quot;tensor parallelism&quot; in Megatron-LM: splitting weight matrices across GPUs. Provide mention of &quot;tensor parallelism&quot; in DeepSpeed: &quot;ZeRO-Offload&quot; etc. Provide mention
4923
+ Generation took 31.31 seconds
4924
+ </div>
4925
+ <div class="uv-install-logs" id="uv-logs-forward_only">
4926
+ <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4927
+ <div class="uv-logs-content" style="display: none;">
4928
+ Downloading cpython-3.13.7-linux-x86_64-gnu (download) (32.0MiB)
4929
+ Downloading cpython-3.13.7-linux-x86_64-gnu (download)
4930
+ Updating https://github.com/huggingface/transformers.git (HEAD)
4931
+ Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
4932
+ Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
4933
+ Downloading pygments (1.2MiB)
4934
+ Downloading nvidia-cufile-cu12 (1.1MiB)
4935
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
4936
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4937
+ Downloading nvidia-curand-cu12 (60.7MiB)
4938
+ Downloading hf-xet (3.0MiB)
4939
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4940
+ Downloading numpy (15.9MiB)
4941
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4942
+ Downloading nvidia-cublas-cu12 (566.8MiB)
4943
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
4944
+ Downloading pillow (6.3MiB)
4945
+ Downloading networkx (1.9MiB)
4946
+ Downloading sympy (6.0MiB)
4947
+ Downloading tokenizers (3.1MiB)
4948
+ Downloading jedi (1.5MiB)
4949
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4950
+ Downloading nvidia-cufft-cu12 (184.2MiB)
4951
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
4952
+ Downloading nvidia-nccl-cu12 (307.4MiB)
4953
+ Downloading fonttools (4.7MiB)
4954
+ Downloading torch (846.8MiB)
4955
+ Downloading matplotlib (8.3MiB)
4956
+ Downloading kiwisolver (1.4MiB)
4957
+ Downloading triton (148.4MiB)
4958
+ Downloading nvidia-cufile-cu12
4959
+ Downloading kiwisolver
4960
+ Downloading pygments
4961
+ Downloading hf-xet
4962
+ Downloading tokenizers
4963
+ Downloading networkx
4964
+ Downloading fonttools
4965
+ Downloading pillow
4966
+ Downloading matplotlib
4967
+ Downloading nvidia-cuda-cupti-cu12
4968
+ Downloading numpy
4969
+ Downloading sympy
4970
+ Built transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
4971
+ Downloading nvidia-nvjitlink-cu12
4972
+ Downloading jedi
4973
+ Downloading nvidia-curand-cu12
4974
+ Downloading nvidia-cuda-nvrtc-cu12
4975
+ Downloading triton
4976
+ Downloading nvidia-cufft-cu12
4977
+ Downloading nvidia-cusolver-cu12
4978
+ Downloading nvidia-cusparse-cu12
4979
+ Downloading nvidia-cusparselt-cu12
4980
+ Downloading nvidia-nccl-cu12
4981
+ Downloading nvidia-cublas-cu12
4982
+ Downloading nvidia-cudnn-cu12
4983
+ Downloading torch
4984
+ Installed 69 packages in 454ms
4985
+ </div>
4986
+ </div>
4987
+ <div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00&lt;?, ?it/s]
4988
+ Fetching 3 files: 33%|███▎ | 1/3 [00:07&lt;00:14, 7.39s/it]
4989
+ Fetching 3 files: 67%|██████▋ | 2/3 [00:08&lt;00:03, 3.78s/it]
4990
+ Fetching 3 files: 100%|██████████| 3/3 [00:08&lt;00:00, 2.88s/it]
4991
+ You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
4992
+
4993
+ Loading checkpoint shards: 0%| | 0/3 [00:00&lt;?, ?it/s]
4994
+ Loading checkpoint shards: 33%|███▎ | 1/3 [00:02&lt;00:04, 2.34s/it]
4995
+ Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04&lt;00:02, 2.25s/it]
4996
+ Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.80s/it]
4997
+ Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.93s/it]
4998
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
4999
+
5000
+ Fetching 66 files: 0%| | 0/66 [00:00&lt;?, ?it/s]
5001
+ Fetching 66 files: 2%|▏ | 1/66 [00:00&lt;00:10, 6.01it/s]
5002
+ Fetching 66 files: 14%|█▎ | 9/66 [00:00&lt;00:01, 31.85it/s]
5003
+ Fetching 66 files: 20%|█▉ | 13/66 [00:00&lt;00:02, 24.06it/s]
5004
+ Fetching 66 files: 26%|██▌ | 17/66 [00:01&lt;00:03, 12.48it/s]
5005
+ Fetching 66 files: 74%|███████▍ | 49/66 [00:01&lt;00:00, 53.80it/s]
5006
+ Fetching 66 files: 91%|█████████ | 60/66 [00:01&lt;00:00, 57.68it/s]
5007
+ Fetching 66 files: 100%|██████████| 66/66 [00:01&lt;00:00, 40.58it/s]
5008
+ /tmp/uvnote-run-_tyh_wp6/home/.cache/uv/environments-v2/forward-only-504a4941eac030a5/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
5009
+ No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
5010
+ warnings.warn(
5011
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5012
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5013
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5014
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5015
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5016
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5017
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5018
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5019
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5020
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5021
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5022
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5023
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5024
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5025
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5026
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5027
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5028
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5029
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5030
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5031
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5032
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5033
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5034
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5035
+ /tmp/uvnote-run-_tyh_wp6/home/.cache/uv/environments-v2/forward-only-504a4941eac030a5/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
5036
+ No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
5037
+ warnings.warn(
5038
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5039
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5040
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5041
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5042
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5043
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5044
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5045
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5046
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5047
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5048
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5049
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5050
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5051
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5052
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5053
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5054
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5055
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5056
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5057
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5058
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5059
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5060
+ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`</div>
5061
+ </div>
5062
+ </div>
5063
+
5064
  <h2>Forward and Backward</h2>
5065
  <p>Next, we run a forward and backward pass with Megablocks kernels enabled. This should be more memory efficient and allow us to complete the backward pass without running out of memory.</p>
5066
  <div class="cell" id="cell-forward_and_backward">
 
5070
  <span onclick="toggleOutput('forward_and_backward')" style="cursor: pointer;">▼ output</span>
5071
  <span id="uv-indicator-forward_and_backward" onclick="toggleUvLogsFromHeader('forward_and_backward')" style="cursor: pointer;">▶ uv-logs</span>
5072
  </span> |
5073
+ Cell: forward_and_backward | 104.79s
5074
  | <button class="run-btn" onclick="runCell('forward_and_backward')">▶ run</button>
5075
  <button class="copy-btn" onclick="copyCell('forward_and_backward')">Copy</button>
5076
  <a href="cells/forward_and_backward.py" target="_blank" class="raw-btn">Raw</a>
 
5490
  What is Tensor Parallelism?
5491
 
5492
  &lt;|end|&gt;&lt;|start|&gt;assistant&lt;|channel|&gt;analysis&lt;|message|&gt;We need to explain what Tensor Parallelism is. It&#x27;s a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices. Provide details: how it works, benefits, challenges, typical frameworks, etc. Also mention difference from data parallelism, pipeline parallelism. Provide example: splitting a weight matrix across GPUs, each GPU holds a slice, compute partial results, then gather. Provide mention of communication overhead, scaling, etc. Also mention that it&#x27;s used in large models like GPT-3, Megatron-LM, DeepSpeed. Provide references. Also mention that it&#x27;s
5493
+ Generation took 17.98 seconds
5494
  Post-generation memory: {&#x27;allocated_gb&#x27;: 9.398670336, &#x27;peak_gb&#x27;: 9.67278848, &#x27;reserved_gb&#x27;: 17.188257792}
5495
  Enabled gradient checkpointing
5496
  Post-forward memory: {&#x27;allocated_gb&#x27;: 9.487933952, &#x27;peak_gb&#x27;: 9.67278848, &#x27;reserved_gb&#x27;: 17.188257792}
 
5521
  Downloading cpython-3.13.7-linux-x86_64-gnu (download)
5522
  Updating https://github.com/huggingface/transformers.git (HEAD)
5523
  Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
5524
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
5525
+ Downloading numpy (15.9MiB)
5526
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
5527
+ Downloading hf-xet (3.0MiB)
5528
+ Downloading sympy (6.0MiB)
5529
+ Downloading jedi (1.5MiB)
5530
+ Downloading pygments (1.2MiB)
5531
  Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
5532
+ Downloading fonttools (4.7MiB)
5533
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
5534
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
5535
  Downloading networkx (1.9MiB)
 
 
5536
  Downloading nvidia-cufile-cu12 (1.1MiB)
 
5537
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
 
 
 
 
 
5538
  Downloading pillow (6.3MiB)
5539
+ Downloading nvidia-cufft-cu12 (184.2MiB)
5540
+ Downloading matplotlib (8.3MiB)
5541
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
5542
  Downloading nvidia-cublas-cu12 (566.8MiB)
 
 
5543
  Downloading nvidia-nccl-cu12 (307.4MiB)
 
 
 
 
5544
  Downloading kiwisolver (1.4MiB)
5545
+ Downloading torch (846.8MiB)
5546
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
5547
+ Downloading nvidia-curand-cu12 (60.7MiB)
5548
+ Downloading tokenizers (3.1MiB)
5549
  Downloading triton (148.4MiB)
5550
  Downloading nvidia-cufile-cu12
5551
  Downloading kiwisolver
5552
  Downloading pygments
 
5553
  Downloading hf-xet
5554
  Downloading tokenizers
5555
+ Downloading networkx
5556
  Downloading fonttools
 
5557
  Downloading pillow
5558
  Downloading matplotlib
5559
  Downloading nvidia-cuda-cupti-cu12
5560
  Downloading numpy
5561
+ Downloading sympy
5562
  Downloading nvidia-nvjitlink-cu12
5563
+ Built transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
5564
+ Downloading jedi
5565
  Downloading nvidia-curand-cu12
5566
  Downloading nvidia-cuda-nvrtc-cu12
5567
  Downloading triton
5568
  Downloading nvidia-cufft-cu12
5569
  Downloading nvidia-cusolver-cu12
 
5570
  Downloading nvidia-cusparse-cu12
5571
+ Downloading nvidia-cusparselt-cu12
5572
  Downloading nvidia-nccl-cu12
5573
  Downloading nvidia-cublas-cu12
5574
  Downloading nvidia-cudnn-cu12
5575
  Downloading torch
5576
+ Installed 69 packages in 506ms
5577
  </div>
5578
  </div>
5579
  <div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00&lt;?, ?it/s]
5580
+ Fetching 3 files: 33%|███▎ | 1/3 [00:07&lt;00:15, 7.79s/it]
5581
+ Fetching 3 files: 67%|██████▋ | 2/3 [00:09&lt;00:04, 4.50s/it]
5582
+ Fetching 3 files: 100%|██████████| 3/3 [00:09&lt;00:00, 3.33s/it]
5583
  You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
5584
 
5585
  Loading checkpoint shards: 0%| | 0/3 [00:00&lt;?, ?it/s]
5586
+ Loading checkpoint shards: 33%|███▎ | 1/3 [00:02&lt;00:04, 2.36s/it]
5587
+ Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04&lt;00:02, 2.25s/it]
5588
  Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.80s/it]
5589
  Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.93s/it]
5590
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5591
 
5592
  Fetching 66 files: 0%| | 0/66 [00:00&lt;?, ?it/s]
5593
+ Fetching 66 files: 2%|▏ | 1/66 [00:00&lt;00:13, 4.68it/s]
5594
+ Fetching 66 files: 14%|█▎ | 9/66 [00:00&lt;00:02, 26.64it/s]
5595
+ Fetching 66 files: 21%|██ | 14/66 [00:00&lt;00:01, 33.33it/s]
5596
+ Fetching 66 files: 27%|██▋ | 18/66 [00:00&lt;00:02, 17.59it/s]
5597
+ Fetching 66 files: 53%|█████▎ | 35/66 [00:01&lt;00:00, 43.25it/s]
5598
+ Fetching 66 files: 64%|██████▎ | 42/66 [00:01&lt;00:00, 43.72it/s]
5599
+ Fetching 66 files: 74%|███████▍ | 49/66 [00:01&lt;00:00, 40.60it/s]
5600
+ Fetching 66 files: 85%|████████▍ | 56/66 [00:01&lt;00:00, 42.33it/s]
5601
+ Fetching 66 files: 95%|█████████▌| 63/66 [00:01&lt;00:00, 38.03it/s]
5602
+ Fetching 66 files: 100%|██████████| 66/66 [00:01&lt;00:00, 36.73it/s]
5603
+ /tmp/uvnote-run-n1rg0p87/home/.cache/uv/environments-v2/forward-and-backward-422cb4863433d14c/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
5604
  No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
5605
  warnings.warn(
5606
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
 
5627
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5628
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5629
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5630
+ /tmp/uvnote-run-n1rg0p87/home/.cache/uv/environments-v2/forward-and-backward-422cb4863433d14c/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
5631
  No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
5632
  warnings.warn(
5633
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
 
5654
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5655
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5656
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5657
+ /tmp/uvnote-run-n1rg0p87/home/.cache/uv/environments-v2/forward-and-backward-422cb4863433d14c/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
5658
  No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
5659
  warnings.warn(
5660
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
 
5682
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5683
  `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
5684
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
5685
+ /tmp/uvnote-run-n1rg0p87/home/.cache/uv/environments-v2/forward-and-backward-422cb4863433d14c/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
5686
  No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
5687
  warnings.warn(
5688
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
megablocks_yamoe/artifacts/binned_run/binned_results.json CHANGED
@@ -9,16 +9,16 @@
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
- "avg_ms": 36.478538259971174,
13
- "min_ms": 33.54985800024224,
14
- "max_ms": 39.617000999896845,
15
- "std_ms": 1.5870638955537886,
16
- "p50_ms": 36.43554149994088,
17
- "p95_ms": 39.16828469987195,
18
- "p99_ms": 39.47986176004633,
19
  "num_iters": 50,
20
- "tokens_per_s": 2741.3379145658514,
21
- "throughput_variance": 118.95302366172646
22
  },
23
  "output_sum": 3.97190523147583
24
  }
 
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
+ "avg_ms": 35.832872119995045,
13
+ "min_ms": 32.58174399991276,
14
+ "max_ms": 40.50060700001268,
15
+ "std_ms": 1.694341573523051,
16
+ "p50_ms": 36.17695449997882,
17
+ "p95_ms": 38.67062735003515,
18
+ "p99_ms": 39.92923416996405,
19
  "num_iters": 50,
20
+ "tokens_per_s": 2790.733594145783,
21
+ "throughput_variance": 131.29596945634063
22
  },
23
  "output_sum": 3.97190523147583
24
  }
megablocks_yamoe/artifacts/gptoss_run/gptoss_results.json CHANGED
@@ -9,16 +9,16 @@
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
- "avg_ms": 45.01105025997276,
13
- "min_ms": 39.02894699967874,
14
- "max_ms": 49.29527800004507,
15
- "std_ms": 2.979711623110132,
16
- "p50_ms": 45.6719464998514,
17
- "p95_ms": 48.48902935004844,
18
- "p99_ms": 49.0557057300839,
19
  "num_iters": 50,
20
- "tokens_per_s": 2221.6766643396363,
21
- "throughput_variance": 151.30753386326467
22
  },
23
  "output_sum": 11.53223705291748
24
  }
 
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
+ "avg_ms": 46.790802699997585,
13
+ "min_ms": 39.03555299996242,
14
+ "max_ms": 50.85692799991648,
15
+ "std_ms": 3.250858562771192,
16
+ "p50_ms": 47.475618500016026,
17
+ "p95_ms": 50.805645549957035,
18
+ "p99_ms": 50.83896361993766,
19
  "num_iters": 50,
20
+ "tokens_per_s": 2137.172141310693,
21
+ "throughput_variance": 155.17201487457513
22
  },
23
  "output_sum": 11.53223705291748
24
  }
megablocks_yamoe/artifacts/gptoss_training_run/gptoss_training_results.json CHANGED
@@ -9,16 +9,16 @@
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
- "avg_ms": 44.678819580012714,
13
- "min_ms": 38.108840999939275,
14
- "max_ms": 49.00846700002148,
15
- "std_ms": 2.8989978625015023,
16
- "p50_ms": 45.39998149971325,
17
- "p95_ms": 48.408032500015,
18
- "p99_ms": 48.790303320047315,
19
  "num_iters": 50,
20
- "tokens_per_s": 2238.197001174478,
21
- "throughput_variance": 150.30214966250284
22
  },
23
  "output_sum": 11.53223705291748
24
  }
 
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
+ "avg_ms": 45.006849599990346,
13
+ "min_ms": 38.83674200005771,
14
+ "max_ms": 49.30821800007834,
15
+ "std_ms": 2.893955494967115,
16
+ "p50_ms": 45.57549300000119,
17
+ "p95_ms": 48.57250854988706,
18
+ "p99_ms": 48.963614720073565,
19
  "num_iters": 50,
20
+ "tokens_per_s": 2221.8840218494533,
21
+ "throughput_variance": 147.8630259637854
22
  },
23
  "output_sum": 11.53223705291748
24
  }
megablocks_yamoe/artifacts/yamoe_run/yamoe_results.json CHANGED
@@ -9,16 +9,16 @@
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
- "avg_ms": 4.249353080003857,
13
- "min_ms": 4.130692999751773,
14
- "max_ms": 4.30493799967735,
15
- "std_ms": 0.027547357116313485,
16
- "p50_ms": 4.2500710001149855,
17
- "p95_ms": 4.289417000063622,
18
- "p99_ms": 4.299768499754464,
19
  "num_iters": 50,
20
- "tokens_per_s": 23532.993873954394,
21
- "throughput_variance": 154.3815152476545
22
  },
23
- "output_sum": 3.971905469894409
24
  }
 
9
  "vary_inputs": true
10
  },
11
  "stats": {
12
+ "avg_ms": 4.2496077999999216,
13
+ "min_ms": 4.143714000065302,
14
+ "max_ms": 4.276272000083736,
15
+ "std_ms": 0.02026809704303406,
16
+ "p50_ms": 4.251974999931463,
17
+ "p95_ms": 4.269103000035557,
18
+ "p99_ms": 4.276041210073345,
19
  "num_iters": 50,
20
+ "tokens_per_s": 23531.58331458302,
21
+ "throughput_variance": 113.86151920477748
22
  },
23
+ "output_sum": 3.97190523147583
24
  }
megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc CHANGED
Binary files a/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc and b/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc differ
 
megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc CHANGED
Binary files a/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc and b/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc differ
 
megablocks_yamoe/cells/megablocks_run.py CHANGED
@@ -56,7 +56,7 @@ def build_megablocks_model(device: torch.device):
56
  # Attach loaded expert weights to the experts container
57
  e = model.experts
58
  e.alpha = 1.702
59
- e.capacity_factor = 128
60
  e.gate_up_proj = torch.nn.Parameter(gate_up_proj.clone().to(device))
61
  e.gate_up_proj_bias = torch.nn.Parameter(gate_up_proj_bias.clone().to(device))
62
  e.down_proj = torch.nn.Parameter(down_proj.clone().to(device))
 
56
  # Attach loaded expert weights to the experts container
57
  e = model.experts
58
  e.alpha = 1.702
59
+ e.capacity_factor = 64
60
  e.gate_up_proj = torch.nn.Parameter(gate_up_proj.clone().to(device))
61
  e.gate_up_proj_bias = torch.nn.Parameter(gate_up_proj_bias.clone().to(device))
62
  e.down_proj = torch.nn.Parameter(down_proj.clone().to(device))
megablocks_yamoe/megablocks_yamoe.html CHANGED
@@ -3722,7 +3722,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3722
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3723
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3724
  </span> |
3725
- Cell: nv | 0.54s
3726
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3727
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3728
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3745,7 +3745,7 @@ Cell: nv | 0.54s
3745
  </div>
3746
  </div>
3747
  <div id="output-nv" class="cell-output">
3748
- <div class="cell-stdout">Wed Sep 24 20:10:54 2025
3749
  +-----------------------------------------------------------------------------------------+
3750
  | NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
3751
  |-----------------------------------------+------------------------+----------------------+
@@ -3754,19 +3754,19 @@ Cell: nv | 0.54s
3754
  | | | MIG M. |
3755
  |=========================================+========================+======================|
3756
  | 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
3757
- | 0% 35C P0 42W / 300W | 0MiB / 23028MiB | 0% Default |
3758
  | | | N/A |
3759
  +-----------------------------------------+------------------------+----------------------+
3760
  | 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
3761
- | 0% 35C P0 44W / 300W | 0MiB / 23028MiB | 0% Default |
3762
  | | | N/A |
3763
  +-----------------------------------------+------------------------+----------------------+
3764
  | 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
3765
- | 0% 35C P0 41W / 300W | 0MiB / 23028MiB | 0% Default |
3766
  | | | N/A |
3767
  +-----------------------------------------+------------------------+----------------------+
3768
  | 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
3769
- | 0% 34C P0 46W / 300W | 0MiB / 23028MiB | 0% Default |
3770
  | | | N/A |
3771
  +-----------------------------------------+------------------------+----------------------+
3772
 
@@ -3792,7 +3792,7 @@ Cell: nv | 0.54s
3792
  <span onclick="toggleOutput('setup2')" style="cursor: pointer;">▼ output</span>
3793
  <span id="uv-indicator-setup2" onclick="toggleUvLogsFromHeader('setup2')" style="cursor: pointer;">▶ uv-logs</span>
3794
  </span> |
3795
- Cell: setup2 | 113.94s
3796
  | <button class="run-btn" onclick="runCell('setup2')">▶ run</button>
3797
  <button class="copy-btn" onclick="copyCell('setup2')">Copy</button>
3798
  <a href="cells/setup2.py" target="_blank" class="raw-btn">Raw</a>
@@ -4050,7 +4050,7 @@ Reasoning: low
4050
  What is Tensor Parallelism?
4051
 
4052
  &lt;|end|&gt;&lt;|start|&gt;assistant&lt;|channel|&gt;analysis&lt;|message|&gt;We need to explain what Tensor Parallelism is. It&#x27;s a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices. Provide details: how it works, benefits, challenges, typical frameworks, etc. Also mention difference from data parallelism, pipeline parallelism. Provide example: splitting a weight matrix across GPUs, each GPU holds a slice, compute partial results, then gather. Provide mention of communication overhead, scaling, etc. Also mention that it&#x27;s used in large models like GPT-3, Megatron-LM, DeepSpeed. Provide references. Also mention that it&#x27;s also called model parallelism. Provide explanation of how it works in practice: e.g., for a linear layer, weight matrix W of shape (out_features, in_features). In tensor parallelism, split W along out_features dimension across GPUs. Each GPU computes partial output. Then gather outputs. Provide details on how to handle bias, etc. Provide mention of &quot;tensor model parallelism&quot; vs &quot;tensor parallelism&quot; synonyms. Provide mention of &quot;tensor parallelism&quot; in Megatron-LM: splitting weight matrices across GPUs. Provide mention of &quot;tensor parallelism&quot; in DeepSpeed: &quot;ZeRO-Offload&quot; etc. Provide mention
4053
- Generation took 31.30 seconds
4054
  </div>
4055
  <div class="uv-install-logs" id="uv-logs-setup2">
4056
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
@@ -4059,32 +4059,32 @@ Downloading cpython-3.13.7-linux-x86_64-gnu (download) (32.0MiB)
4059
  Downloading cpython-3.13.7-linux-x86_64-gnu (download)
4060
  Updating https://github.com/huggingface/transformers.git (HEAD)
4061
  Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
4062
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4063
- Downloading jedi (1.5MiB)
4064
  Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
4065
- Downloading nvidia-cudnn-cu12 (674.0MiB)
4066
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
4067
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4068
  Downloading hf-xet (3.0MiB)
4069
- Downloading numpy (15.9MiB)
 
 
 
 
 
4070
  Downloading nvidia-cublas-cu12 (566.8MiB)
4071
  Downloading nvidia-cufile-cu12 (1.1MiB)
 
 
 
 
4072
  Downloading nvidia-curand-cu12 (60.7MiB)
 
 
 
4073
  Downloading nvidia-cusolver-cu12 (255.1MiB)
4074
  Downloading nvidia-cufft-cu12 (184.2MiB)
4075
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4076
  Downloading matplotlib (8.3MiB)
4077
- Downloading pygments (1.2MiB)
4078
- Downloading networkx (1.9MiB)
4079
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4080
- Downloading tokenizers (3.1MiB)
4081
- Downloading pillow (6.3MiB)
4082
- Downloading torch (846.8MiB)
4083
- Downloading sympy (6.0MiB)
4084
- Downloading nvidia-nccl-cu12 (307.4MiB)
4085
  Downloading fonttools (4.7MiB)
 
4086
  Downloading kiwisolver (1.4MiB)
4087
- Downloading triton (148.4MiB)
4088
  Downloading nvidia-cufile-cu12
4089
  Downloading kiwisolver
4090
  Downloading pygments
@@ -4105,38 +4105,38 @@ Downloading triton (148.4MiB)
4105
  Downloading triton
4106
  Downloading nvidia-cufft-cu12
4107
  Downloading nvidia-cusolver-cu12
4108
- Downloading nvidia-cusparselt-cu12
4109
  Downloading nvidia-cusparse-cu12
 
4110
  Downloading nvidia-nccl-cu12
4111
  Downloading nvidia-cublas-cu12
4112
  Downloading nvidia-cudnn-cu12
4113
  Downloading torch
4114
- Installed 69 packages in 578ms
4115
  </div>
4116
  </div>
4117
  <div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00&lt;?, ?it/s]
4118
- Fetching 3 files: 33%|███▎ | 1/3 [00:06&lt;00:13, 6.51s/it]
4119
- Fetching 3 files: 67%|██████▋ | 2/3 [00:08&lt;00:03, 3.82s/it]
4120
- Fetching 3 files: 100%|██████████| 3/3 [00:08&lt;00:00, 2.82s/it]
4121
  You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
4122
 
4123
  Loading checkpoint shards: 0%| | 0/3 [00:00&lt;?, ?it/s]
4124
- Loading checkpoint shards: 33%|███▎ | 1/3 [00:02&lt;00:04, 2.35s/it]
4125
  Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04&lt;00:02, 2.25s/it]
4126
  Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.80s/it]
4127
  Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.93s/it]
4128
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
4129
 
4130
  Fetching 66 files: 0%| | 0/66 [00:00&lt;?, ?it/s]
4131
- Fetching 66 files: 2%|▏ | 1/66 [00:00&lt;00:09, 6.59it/s]
4132
- Fetching 66 files: 9%|▉ | 6/66 [00:00&lt;00:02, 27.12it/s]
4133
- Fetching 66 files: 20%|█▉ | 13/66 [00:00&lt;00:01, 31.76it/s]
4134
- Fetching 66 files: 26%|██▌ | 17/66 [00:00&lt;00:03, 15.12it/s]
4135
- Fetching 66 files: 58%|█████▊ | 38/66 [00:01&lt;00:00, 37.34it/s]
4136
- Fetching 66 files: 67%|██████▋ | 44/66 [00:01&lt;00:00, 36.51it/s]
4137
- Fetching 66 files: 83%|████████▎ | 55/66 [00:01&lt;00:00, 42.96it/s]
4138
- Fetching 66 files: 100%|██████████| 66/66 [00:01&lt;00:00, 39.99it/s]
4139
- /tmp/uvnote-run-it1i5axp/home/.cache/uv/environments-v2/setup2-adf2810b697d7b08/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
4140
  No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
4141
  warnings.warn(
4142
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
@@ -4163,7 +4163,7 @@ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks
4163
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
4164
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
4165
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
4166
- /tmp/uvnote-run-it1i5axp/home/.cache/uv/environments-v2/setup2-adf2810b697d7b08/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
4167
  No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
4168
  warnings.warn(
4169
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
@@ -4200,7 +4200,7 @@ INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks
4200
  <span onclick="toggleOutput('setup')" style="cursor: pointer;">▼ output</span>
4201
  <span id="uv-indicator-setup" onclick="toggleUvLogsFromHeader('setup')" style="cursor: pointer;">▶ uv-logs</span>
4202
  </span> |
4203
- Cell: setup | 107.76s
4204
  | <button class="run-btn" onclick="runCell('setup')">▶ run</button>
4205
  <button class="copy-btn" onclick="copyCell('setup')">Copy</button>
4206
  <a href="cells/setup.py" target="_blank" class="raw-btn">Raw</a>
@@ -4459,8 +4459,12 @@ Reasoning: low
4459
 
4460
  What is Tensor Parallelism?
4461
 
4462
- &lt;|end|&gt;&lt;|start|&gt;assistant&lt;|channel|&gt;analysis&lt;|message|&gt;We need to explain what Tensor Parallelism is. It&#x27;s a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices, so each device holds a slice of the matrix. During forward/backward passes, each device computes partial results and then they are aggregated. It&#x27;s used to scale up models beyond single device memory. Also mention pipeline parallelism, data parallelism. Provide details: e.g., for a linear layer weight matrix W of shape (out_features, in_features), we can split along out_features dimension across devices. Each device computes its part of the output. Then gather. Similarly for attention QKV projections. Provide example: GPT-3 uses tensor parallelism. Also mention frameworks: Megatron-LM, DeepSpeed, etc. Provide pros/cons. Provide code snippet? Provide explanation of communication overhead. Provide mention of &quot;tensor model parallelism&quot; vs &quot;tensor parallelism&quot; synonyms. Provide mention of &quot;tensor parallelism&quot; in context of &quot;DeepSpeed ZeRO Stage 3&quot; or &quot;Megatron-LM&quot;. Provide mention of &quot;tensor parallelism&quot; as part of &quot;model parallelism&quot; in large language models. Provide mention of &quot;tensor parallelism&quot; as &quot;splitting weight
4463
- Generation took 26.29 seconds
 
 
 
 
4464
  </div>
4465
  <div class="uv-install-logs" id="uv-logs-setup">
4466
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
@@ -4469,37 +4473,37 @@ Downloading cpython-3.13.7-linux-x86_64-gnu (download) (32.0MiB)
4469
  Downloading cpython-3.13.7-linux-x86_64-gnu (download)
4470
  Updating https://github.com/huggingface/transformers.git (HEAD)
4471
  Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
4472
- Downloading networkx (1.9MiB)
 
4473
  Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
4474
- Downloading kiwisolver (1.4MiB)
4475
- Downloading nvidia-nccl-cu12 (307.4MiB)
 
 
4476
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4477
- Downloading jedi (1.5MiB)
 
 
 
 
 
4478
  Downloading pillow (6.3MiB)
4479
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4480
- Downloading nvidia-cusolver-cu12 (255.1MiB)
4481
- Downloading numpy (15.9MiB)
4482
  Downloading fonttools (4.7MiB)
4483
- Downloading tokenizers (3.1MiB)
4484
- Downloading torch (846.8MiB)
4485
  Downloading hf-xet (3.0MiB)
 
 
 
 
 
4486
  Downloading nvidia-cublas-cu12 (566.8MiB)
4487
- Downloading nvidia-cufft-cu12 (184.2MiB)
4488
- Downloading nvidia-curand-cu12 (60.7MiB)
4489
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
4490
  Downloading triton (148.4MiB)
4491
- Downloading nvidia-cufile-cu12 (1.1MiB)
4492
- Downloading nvidia-cudnn-cu12 (674.0MiB)
4493
- Downloading pygments (1.2MiB)
4494
- Downloading matplotlib (8.3MiB)
4495
- Downloading sympy (6.0MiB)
4496
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4497
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4498
  Downloading nvidia-cufile-cu12
4499
  Downloading kiwisolver
4500
  Downloading pygments
4501
- Downloading hf-xet
4502
  Downloading tokenizers
 
4503
  Downloading networkx
4504
  Downloading fonttools
4505
  Downloading pillow
@@ -4515,33 +4519,33 @@ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4515
  Downloading triton
4516
  Downloading nvidia-cufft-cu12
4517
  Downloading nvidia-cusolver-cu12
4518
- Downloading nvidia-cusparselt-cu12
4519
  Downloading nvidia-cusparse-cu12
 
4520
  Downloading nvidia-nccl-cu12
4521
  Downloading nvidia-cublas-cu12
4522
  Downloading nvidia-cudnn-cu12
4523
  Downloading torch
4524
- Installed 69 packages in 471ms
4525
  </div>
4526
  </div>
4527
  <div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00&lt;?, ?it/s]
4528
- Fetching 3 files: 33%|███▎ | 1/3 [00:06&lt;00:13, 6.98s/it]
4529
- Fetching 3 files: 67%|██████▋ | 2/3 [00:08&lt;00:03, 3.66s/it]
4530
- Fetching 3 files: 100%|██████████| 3/3 [00:08&lt;00:00, 2.78s/it]
4531
  You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
4532
 
4533
  Loading checkpoint shards: 0%| | 0/3 [00:00&lt;?, ?it/s]
4534
- Loading checkpoint shards: 33%|███▎ | 1/3 [00:02&lt;00:04, 2.34s/it]
4535
- Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04&lt;00:02, 2.25s/it]
4536
  Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.80s/it]
4537
- Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.93s/it]
4538
  INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
4539
 
4540
  Fetching 6 files: 0%| | 0/6 [00:00&lt;?, ?it/s]
4541
- Fetching 6 files: 17%|█▋ | 1/6 [00:00&lt;00:01, 4.94it/s]
4542
- Fetching 6 files: 50%|█████ | 3/6 [00:00&lt;00:00, 8.51it/s]
4543
- Fetching 6 files: 100%|██████████| 6/6 [00:00&lt;00:00, 14.89it/s]
4544
- /tmp/uvnote-run-_yjosja7/home/.cache/uv/environments-v2/setup-1400c3ff0fc01263/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
4545
  No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
4546
  warnings.warn(
4547
  INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
@@ -4568,7 +4572,7 @@ INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for laye
4568
  INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
4569
  INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
4570
  INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
4571
- /tmp/uvnote-run-_yjosja7/home/.cache/uv/environments-v2/setup-1400c3ff0fc01263/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
4572
  No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
4573
  warnings.warn(
4574
  INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
 
3722
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3723
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3724
  </span> |
3725
+ Cell: nv | 0.53s
3726
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3727
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3728
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3745
  </div>
3746
  </div>
3747
  <div id="output-nv" class="cell-output">
3748
+ <div class="cell-stdout">Wed Sep 24 21:05:30 2025
3749
  +-----------------------------------------------------------------------------------------+
3750
  | NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
3751
  |-----------------------------------------+------------------------+----------------------+
 
3754
  | | | MIG M. |
3755
  |=========================================+========================+======================|
3756
  | 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
3757
+ | 0% 38C P0 46W / 300W | 0MiB / 23028MiB | 0% Default |
3758
  | | | N/A |
3759
  +-----------------------------------------+------------------------+----------------------+
3760
  | 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
3761
+ | 0% 37C P0 45W / 300W | 0MiB / 23028MiB | 0% Default |
3762
  | | | N/A |
3763
  +-----------------------------------------+------------------------+----------------------+
3764
  | 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
3765
+ | 0% 39C P0 47W / 300W | 0MiB / 23028MiB | 0% Default |
3766
  | | | N/A |
3767
  +-----------------------------------------+------------------------+----------------------+
3768
  | 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
3769
+ | 0% 38C P0 46W / 300W | 0MiB / 23028MiB | 0% Default |
3770
  | | | N/A |
3771
  +-----------------------------------------+------------------------+----------------------+
3772
 
 
3792
  <span onclick="toggleOutput('setup2')" style="cursor: pointer;">▼ output</span>
3793
  <span id="uv-indicator-setup2" onclick="toggleUvLogsFromHeader('setup2')" style="cursor: pointer;">▶ uv-logs</span>
3794
  </span> |
3795
+ Cell: setup2 | 113.64s
3796
  | <button class="run-btn" onclick="runCell('setup2')">▶ run</button>
3797
  <button class="copy-btn" onclick="copyCell('setup2')">Copy</button>
3798
  <a href="cells/setup2.py" target="_blank" class="raw-btn">Raw</a>
 
4050
  What is Tensor Parallelism?
4051
 
4052
  &lt;|end|&gt;&lt;|start|&gt;assistant&lt;|channel|&gt;analysis&lt;|message|&gt;We need to explain what Tensor Parallelism is. It&#x27;s a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices. Provide details: how it works, benefits, challenges, typical frameworks, etc. Also mention difference from data parallelism, pipeline parallelism. Provide example: splitting a weight matrix across GPUs, each GPU holds a slice, compute partial results, then gather. Provide mention of communication overhead, scaling, etc. Also mention that it&#x27;s used in large models like GPT-3, Megatron-LM, DeepSpeed. Provide references. Also mention that it&#x27;s also called model parallelism. Provide explanation of how it works in practice: e.g., for a linear layer, weight matrix W of shape (out_features, in_features). In tensor parallelism, split W along out_features dimension across GPUs. Each GPU computes partial output. Then gather outputs. Provide details on how to handle bias, etc. Provide mention of &quot;tensor model parallelism&quot; vs &quot;tensor parallelism&quot; synonyms. Provide mention of &quot;tensor parallelism&quot; in Megatron-LM: splitting weight matrices across GPUs. Provide mention of &quot;tensor parallelism&quot; in DeepSpeed: &quot;ZeRO-Offload&quot; etc. Provide mention
4053
+ Generation took 31.35 seconds
4054
  </div>
4055
  <div class="uv-install-logs" id="uv-logs-setup2">
4056
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 
4059
  Downloading cpython-3.13.7-linux-x86_64-gnu (download)
4060
  Updating https://github.com/huggingface/transformers.git (HEAD)
4061
  Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
4062
+ Downloading sympy (6.0MiB)
 
4063
  Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
 
 
 
4064
  Downloading hf-xet (3.0MiB)
4065
+ Downloading pillow (6.3MiB)
4066
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4067
+ Downloading networkx (1.9MiB)
4068
+ Downloading pygments (1.2MiB)
4069
+ Downloading tokenizers (3.1MiB)
4070
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
4071
  Downloading nvidia-cublas-cu12 (566.8MiB)
4072
  Downloading nvidia-cufile-cu12 (1.1MiB)
4073
+ Downloading jedi (1.5MiB)
4074
+ Downloading numpy (15.9MiB)
4075
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4076
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
4077
  Downloading nvidia-curand-cu12 (60.7MiB)
4078
+ Downloading triton (148.4MiB)
4079
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4080
+ Downloading nvidia-nccl-cu12 (307.4MiB)
4081
  Downloading nvidia-cusolver-cu12 (255.1MiB)
4082
  Downloading nvidia-cufft-cu12 (184.2MiB)
 
4083
  Downloading matplotlib (8.3MiB)
 
 
 
 
 
 
 
 
4084
  Downloading fonttools (4.7MiB)
4085
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4086
  Downloading kiwisolver (1.4MiB)
4087
+ Downloading torch (846.8MiB)
4088
  Downloading nvidia-cufile-cu12
4089
  Downloading kiwisolver
4090
  Downloading pygments
 
4105
  Downloading triton
4106
  Downloading nvidia-cufft-cu12
4107
  Downloading nvidia-cusolver-cu12
 
4108
  Downloading nvidia-cusparse-cu12
4109
+ Downloading nvidia-cusparselt-cu12
4110
  Downloading nvidia-nccl-cu12
4111
  Downloading nvidia-cublas-cu12
4112
  Downloading nvidia-cudnn-cu12
4113
  Downloading torch
4114
+ Installed 69 packages in 550ms
4115
  </div>
4116
  </div>
4117
  <div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00&lt;?, ?it/s]
4118
+ Fetching 3 files: 33%|███▎ | 1/3 [00:06&lt;00:12, 6.47s/it]
4119
+ Fetching 3 files: 67%|██████▋ | 2/3 [00:07&lt;00:03, 3.37s/it]
4120
+ Fetching 3 files: 100%|██████████| 3/3 [00:07&lt;00:00, 2.56s/it]
4121
  You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
4122
 
4123
  Loading checkpoint shards: 0%| | 0/3 [00:00&lt;?, ?it/s]
4124
+ Loading checkpoint shards: 33%|███▎ | 1/3 [00:02&lt;00:04, 2.34s/it]
4125
  Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04&lt;00:02, 2.25s/it]
4126
  Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.80s/it]
4127
  Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.93s/it]
4128
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
4129
 
4130
  Fetching 66 files: 0%| | 0/66 [00:00&lt;?, ?it/s]
4131
+ Fetching 66 files: 2%|▏ | 1/66 [00:00&lt;00:16, 3.87it/s]
4132
+ Fetching 66 files: 14%|█▎ | 9/66 [00:00&lt;00:03, 18.15it/s]
4133
+ Fetching 66 files: 26%|██▌ | 17/66 [00:00&lt;00:02, 24.03it/s]
4134
+ Fetching 66 files: 56%|█████▌ | 37/66 [00:00&lt;00:00, 58.06it/s]
4135
+ Fetching 66 files: 71%|███████ | 47/66 [00:01&lt;00:00, 37.14it/s]
4136
+ Fetching 66 files: 85%|████████▍ | 56/66 [00:01&lt;00:00, 39.66it/s]
4137
+ Fetching 66 files: 98%|█████████▊| 65/66 [00:01&lt;00:00, 42.21it/s]
4138
+ Fetching 66 files: 100%|██████████| 66/66 [00:01&lt;00:00, 37.62it/s]
4139
+ /tmp/uvnote-run-e6cle3et/home/.cache/uv/environments-v2/setup2-adf2810b697d7b08/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
4140
  No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
4141
  warnings.warn(
4142
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
 
4163
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
4164
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
4165
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
4166
+ /tmp/uvnote-run-e6cle3et/home/.cache/uv/environments-v2/setup2-adf2810b697d7b08/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
4167
  No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
4168
  warnings.warn(
4169
  INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
 
4200
  <span onclick="toggleOutput('setup')" style="cursor: pointer;">▼ output</span>
4201
  <span id="uv-indicator-setup" onclick="toggleUvLogsFromHeader('setup')" style="cursor: pointer;">▶ uv-logs</span>
4202
  </span> |
4203
+ Cell: setup | 108.22s
4204
  | <button class="run-btn" onclick="runCell('setup')">▶ run</button>
4205
  <button class="copy-btn" onclick="copyCell('setup')">Copy</button>
4206
  <a href="cells/setup.py" target="_blank" class="raw-btn">Raw</a>
 
4459
 
4460
  What is Tensor Parallelism?
4461
 
4462
+ &lt;|end|&gt;&lt;|start|&gt;assistant&lt;|channel|&gt;analysis&lt;|message|&gt;We need to explain what Tensor Parallelism is. It&#x27;s a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices. Provide details: how it works, benefits, challenges, typical frameworks, etc. Also mention difference from data parallelism, pipeline parallelism. Provide example: splitting a weight matrix across GPUs, each GPU holds a slice, compute partial results, then gather. Provide mention of Megatron-LM, DeepSpeed, etc. Also mention that it&#x27;s used for very large models that don&#x27;t fit in a single GPU. Provide explanation of how it reduces memory usage and increases throughput. Also mention that it can be combined with other parallelism strategies. Provide mention of communication overhead, scaling, etc. Provide references. Provide code snippet maybe. Provide final answer.&lt;|end|&gt;&lt;|start|&gt;assistant&lt;|channel|&gt;final&lt;|message|&gt;## Tensor Parallelism – A Quick Overview
4463
+
4464
+ | **Aspect** | **What it is** | **Why it matters** |
4465
+ |------------|----------------|--------------------|
4466
+ | **Definition** | Splitting a *single* weight tensor (e.g., a large matrix in a transformer layer) across multiple devices so that each device holds only a *portion* of the tensor. | Allows training of models that are
4467
+ Generation took 26.28 seconds
4468
  </div>
4469
  <div class="uv-install-logs" id="uv-logs-setup">
4470
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 
4473
  Downloading cpython-3.13.7-linux-x86_64-gnu (download)
4474
  Updating https://github.com/huggingface/transformers.git (HEAD)
4475
  Updated https://github.com/huggingface/transformers.git (7258ea44bc0c0a425a468f66f8559d1de8c4126d)
4476
+ Downloading tokenizers (3.1MiB)
4477
+ Downloading nvidia-cufile-cu12 (1.1MiB)
4478
  Building transformers @ git+https://github.com/huggingface/transformers.git@7258ea44bc0c0a425a468f66f8559d1de8c4126d
4479
+ Downloading nvidia-curand-cu12 (60.7MiB)
4480
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4481
+ Downloading pygments (1.2MiB)
4482
+ Downloading sympy (6.0MiB)
4483
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4484
+ Downloading nvidia-nccl-cu12 (307.4MiB)
4485
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
4486
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4487
+ Downloading nvidia-cufft-cu12 (184.2MiB)
4488
+ Downloading networkx (1.9MiB)
4489
+ Downloading kiwisolver (1.4MiB)
4490
  Downloading pillow (6.3MiB)
4491
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
 
 
4492
  Downloading fonttools (4.7MiB)
 
 
4493
  Downloading hf-xet (3.0MiB)
4494
+ Downloading numpy (15.9MiB)
4495
+ Downloading matplotlib (8.3MiB)
4496
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
4497
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
4498
+ Downloading jedi (1.5MiB)
4499
  Downloading nvidia-cublas-cu12 (566.8MiB)
 
 
 
4500
  Downloading triton (148.4MiB)
4501
+ Downloading torch (846.8MiB)
 
 
 
 
 
 
4502
  Downloading nvidia-cufile-cu12
4503
  Downloading kiwisolver
4504
  Downloading pygments
 
4505
  Downloading tokenizers
4506
+ Downloading hf-xet
4507
  Downloading networkx
4508
  Downloading fonttools
4509
  Downloading pillow
 
4519
  Downloading triton
4520
  Downloading nvidia-cufft-cu12
4521
  Downloading nvidia-cusolver-cu12
 
4522
  Downloading nvidia-cusparse-cu12
4523
+ Downloading nvidia-cusparselt-cu12
4524
  Downloading nvidia-nccl-cu12
4525
  Downloading nvidia-cublas-cu12
4526
  Downloading nvidia-cudnn-cu12
4527
  Downloading torch
4528
+ Installed 69 packages in 462ms
4529
  </div>
4530
  </div>
4531
  <div class="cell-stderr">Fetching 3 files: 0%| | 0/3 [00:00&lt;?, ?it/s]
4532
+ Fetching 3 files: 33%|███▎ | 1/3 [00:07&lt;00:14, 7.36s/it]
4533
+ Fetching 3 files: 67%|██████▋ | 2/3 [00:08&lt;00:03, 3.69s/it]
4534
+ Fetching 3 files: 100%|██████████| 3/3 [00:08&lt;00:00, 2.83s/it]
4535
  You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
4536
 
4537
  Loading checkpoint shards: 0%| | 0/3 [00:00&lt;?, ?it/s]
4538
+ Loading checkpoint shards: 33%|███▎ | 1/3 [00:02&lt;00:04, 2.36s/it]
4539
+ Loading checkpoint shards: 67%|██████▋ | 2/3 [00:04&lt;00:02, 2.26s/it]
4540
  Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.80s/it]
4541
+ Loading checkpoint shards: 100%|██████████| 3/3 [00:05&lt;00:00, 1.94s/it]
4542
  INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
4543
 
4544
  Fetching 6 files: 0%| | 0/6 [00:00&lt;?, ?it/s]
4545
+ Fetching 6 files: 17%|█▋ | 1/6 [00:00&lt;00:01, 2.82it/s]
4546
+ Fetching 6 files: 100%|██████████| 6/6 [00:00&lt;00:00, 11.61it/s]
4547
+ Fetching 6 files: 100%|██████████| 6/6 [00:00&lt;00:00, 10.04it/s]
4548
+ /tmp/uvnote-run-ga2bg_po/home/.cache/uv/environments-v2/setup-1400c3ff0fc01263/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
4549
  No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
4550
  warnings.warn(
4551
  INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
 
4572
  INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
4573
  INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
4574
  INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
4575
+ /tmp/uvnote-run-ga2bg_po/home/.cache/uv/environments-v2/setup-1400c3ff0fc01263/lib/python3.13/site-packages/kernels/layer.py:868: UserWarning:
4576
  No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
4577
  warnings.warn(
4578
  INFO:root:Using layer `Yamoe` from repo `drbh/yamoe` (revision: v0.3.0) for layer `Yamoe`
megablocks_yamoe/torch_profile.html CHANGED
@@ -3720,7 +3720,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
3720
  <span onclick="toggleOutput('utils')" style="cursor: pointer;">▼ output</span>
3721
  <span id="uv-indicator-utils" onclick="toggleUvLogsFromHeader('utils')" style="cursor: pointer;">▶ uv-logs</span>
3722
  </span> |
3723
- Cell: utils | deps: torch, numpy | 34.17s
3724
  | <button class="run-btn" onclick="runCell('utils')">▶ run</button>
3725
  <button class="copy-btn" onclick="copyCell('utils')">Copy</button>
3726
  <a href="cells/utils.py" target="_blank" class="raw-btn">Raw</a>
@@ -3794,24 +3794,24 @@ Cell: utils | deps: torch, numpy | 34.17s
3794
  <div class="uv-install-logs" id="uv-logs-utils">
3795
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3796
  <div class="uv-logs-content" style="display: none;">
3797
- Downloading sympy (6.0MiB)
3798
- Downloading nvidia-cublas-cu12 (566.8MiB)
3799
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
3800
- Downloading nvidia-cufft-cu12 (184.2MiB)
3801
  Downloading setuptools (1.1MiB)
3802
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
3803
- Downloading networkx (1.9MiB)
3804
  Downloading nvidia-cufile-cu12 (1.1MiB)
 
3805
  Downloading numpy (16.2MiB)
3806
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3807
- Downloading nvidia-nccl-cu12 (307.4MiB)
3808
  Downloading nvidia-cudnn-cu12 (674.0MiB)
3809
- Downloading nvidia-cusolver-cu12 (255.1MiB)
3810
- Downloading torch (846.9MiB)
3811
  Downloading nvidia-curand-cu12 (60.7MiB)
3812
- Downloading nvidia-cusparse-cu12 (274.9MiB)
 
 
3813
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 
3814
  Downloading triton (148.3MiB)
 
 
3815
  Downloading nvidia-cufile-cu12
3816
  Downloading setuptools
3817
  Downloading networkx
@@ -3824,13 +3824,13 @@ Downloading triton (148.3MiB)
3824
  Downloading triton
3825
  Downloading nvidia-cufft-cu12
3826
  Downloading nvidia-cusolver-cu12
3827
- Downloading nvidia-cusparse-cu12
3828
  Downloading nvidia-cusparselt-cu12
 
3829
  Downloading nvidia-nccl-cu12
3830
  Downloading nvidia-cublas-cu12
3831
  Downloading nvidia-cudnn-cu12
3832
  Downloading torch
3833
- Installed 26 packages in 465ms
3834
  </div>
3835
  </div>
3836
  </div>
@@ -3843,7 +3843,7 @@ Installed 26 packages in 465ms
3843
  <span onclick="toggleOutput('bench_utils')" style="cursor: pointer;">▼ output</span>
3844
  <span id="uv-indicator-bench_utils" onclick="toggleUvLogsFromHeader('bench_utils')" style="cursor: pointer;">▶ uv-logs</span>
3845
  </span> |
3846
- Cell: bench_utils | deps: torch, numpy | 34.13s
3847
  | <button class="run-btn" onclick="runCell('bench_utils')">▶ run</button>
3848
  <button class="copy-btn" onclick="copyCell('bench_utils')">Copy</button>
3849
  <a href="cells/bench_utils.py" target="_blank" class="raw-btn">Raw</a>
@@ -4331,24 +4331,24 @@ Cell: bench_utils | deps: torch, numpy | 34.13s
4331
  <div class="uv-install-logs" id="uv-logs-bench_utils">
4332
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4333
  <div class="uv-logs-content" style="display: none;">
4334
- Downloading sympy (6.0MiB)
 
 
 
 
4335
  Downloading setuptools (1.1MiB)
4336
  Downloading nvidia-cufft-cu12 (184.2MiB)
4337
- Downloading triton (148.3MiB)
4338
- Downloading nvidia-cublas-cu12 (566.8MiB)
4339
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4340
- Downloading numpy (16.2MiB)
4341
- Downloading networkx (1.9MiB)
4342
- Downloading nvidia-nccl-cu12 (307.4MiB)
4343
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4344
  Downloading nvidia-cufile-cu12 (1.1MiB)
4345
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
 
 
 
 
4346
  Downloading nvidia-cudnn-cu12 (674.0MiB)
4347
- Downloading nvidia-curand-cu12 (60.7MiB)
4348
- Downloading torch (846.9MiB)
4349
  Downloading nvidia-cusolver-cu12 (255.1MiB)
4350
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4351
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
 
4352
  Downloading nvidia-cufile-cu12
4353
  Downloading setuptools
4354
  Downloading networkx
@@ -4361,8 +4361,8 @@ Downloading nvidia-cusparselt-cu12 (273.9MiB)
4361
  Downloading triton
4362
  Downloading nvidia-cufft-cu12
4363
  Downloading nvidia-cusolver-cu12
4364
- Downloading nvidia-cusparse-cu12
4365
  Downloading nvidia-cusparselt-cu12
 
4366
  Downloading nvidia-nccl-cu12
4367
  Downloading nvidia-cublas-cu12
4368
  Downloading nvidia-cudnn-cu12
@@ -4381,7 +4381,7 @@ Installed 26 packages in 445ms
4381
  <span onclick="toggleOutput('config')" style="cursor: pointer;">▼ output</span>
4382
  <span id="uv-indicator-config" onclick="toggleUvLogsFromHeader('config')" style="cursor: pointer;">▶ uv-logs</span>
4383
  </span> |
4384
- Cell: config | deps: torch, numpy | 35.83s
4385
  | <button class="run-btn" onclick="runCell('config')">▶ run</button>
4386
  <button class="copy-btn" onclick="copyCell('config')">Copy</button>
4387
  <a href="cells/config.py" target="_blank" class="raw-btn">Raw</a>
@@ -4442,23 +4442,23 @@ Cell: config | deps: torch, numpy | 35.83s
4442
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4443
  <div class="uv-logs-content" style="display: none;">
4444
  Downloading sympy (6.0MiB)
4445
- Downloading networkx (1.9MiB)
4446
- Downloading setuptools (1.1MiB)
4447
  Downloading nvidia-cufile-cu12 (1.1MiB)
4448
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4449
- Downloading torch (846.9MiB)
4450
  Downloading nvidia-nccl-cu12 (307.4MiB)
4451
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4452
  Downloading nvidia-cudnn-cu12 (674.0MiB)
 
4453
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
4454
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4455
- Downloading nvidia-cusolver-cu12 (255.1MiB)
4456
- Downloading nvidia-cublas-cu12 (566.8MiB)
4457
- Downloading numpy (16.2MiB)
4458
  Downloading nvidia-cufft-cu12 (184.2MiB)
 
 
 
4459
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 
 
 
4460
  Downloading triton (148.3MiB)
4461
- Downloading nvidia-curand-cu12 (60.7MiB)
4462
  Downloading nvidia-cufile-cu12
4463
  Downloading setuptools
4464
  Downloading networkx
@@ -4471,13 +4471,13 @@ Downloading nvidia-curand-cu12 (60.7MiB)
4471
  Downloading triton
4472
  Downloading nvidia-cufft-cu12
4473
  Downloading nvidia-cusolver-cu12
4474
- Downloading nvidia-cusparselt-cu12
4475
  Downloading nvidia-cusparse-cu12
 
4476
  Downloading nvidia-nccl-cu12
4477
  Downloading nvidia-cublas-cu12
4478
  Downloading nvidia-cudnn-cu12
4479
  Downloading torch
4480
- Installed 26 packages in 564ms
4481
  </div>
4482
  </div>
4483
  </div>
@@ -4490,7 +4490,7 @@ Installed 26 packages in 564ms
4490
  <span onclick="toggleOutput('save_data')" style="cursor: pointer;">▼ output</span>
4491
  <span id="uv-indicator-save_data" onclick="toggleUvLogsFromHeader('save_data')" style="cursor: pointer;">▶ uv-logs</span>
4492
  </span> |
4493
- Cell: save_data | deps: torch, numpy | 39.38s
4494
  | <button class="run-btn" onclick="runCell('save_data')">▶ run</button>
4495
  <button class="copy-btn" onclick="copyCell('save_data')">Copy</button>
4496
  <a href="cells/save_data.py" target="_blank" class="raw-btn">Raw</a>
@@ -4585,24 +4585,24 @@ Down sum: 206.729263
4585
  <div class="uv-install-logs" id="uv-logs-save_data">
4586
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4587
  <div class="uv-logs-content" style="display: none;">
4588
- Downloading networkx (1.9MiB)
4589
- Downloading setuptools (1.1MiB)
4590
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4591
- Downloading sympy (6.0MiB)
4592
- Downloading numpy (16.2MiB)
4593
- Downloading torch (846.9MiB)
4594
- Downloading nvidia-cufile-cu12 (1.1MiB)
4595
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4596
- Downloading nvidia-cufft-cu12 (184.2MiB)
 
4597
  Downloading nvidia-cublas-cu12 (566.8MiB)
4598
- Downloading nvidia-cusolver-cu12 (255.1MiB)
4599
- Downloading nvidia-nccl-cu12 (307.4MiB)
4600
  Downloading nvidia-cudnn-cu12 (674.0MiB)
4601
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4602
- Downloading triton (148.3MiB)
4603
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
4604
  Downloading nvidia-curand-cu12 (60.7MiB)
4605
  Downloading nvidia-cusparse-cu12 (274.9MiB)
 
 
 
 
 
 
 
 
 
 
4606
  Downloading nvidia-cufile-cu12
4607
  Downloading setuptools
4608
  Downloading networkx
@@ -4621,16 +4621,16 @@ Downloading nvidia-cusparse-cu12 (274.9MiB)
4621
  Downloading nvidia-cublas-cu12
4622
  Downloading nvidia-cudnn-cu12
4623
  Downloading torch
4624
- Installed 26 packages in 447ms
4625
  </div>
4626
  </div>
4627
  <div class="cell-artifacts">
4628
  <h4>Artifacts:</h4>
4629
- <a href="artifacts/save_data/router_bias.pt" class="artifact" target="_blank">router_bias.pt</a>
4630
- <a href="artifacts/save_data/router_weight.pt" class="artifact" target="_blank">router_weight.pt</a>
4631
- <a href="artifacts/save_data/down_proj_bias.pt" class="artifact" target="_blank">down_proj_bias.pt</a>
4632
  <a href="artifacts/save_data/gate_up_proj_bias.pt" class="artifact" target="_blank">gate_up_proj_bias.pt</a>
 
4633
  <a href="artifacts/save_data/down_proj.pt" class="artifact" target="_blank">down_proj.pt</a>
 
 
4634
  <a href="artifacts/save_data/gate_up_proj.pt" class="artifact" target="_blank">gate_up_proj.pt</a>
4635
  </div>
4636
  </div>
@@ -4645,7 +4645,7 @@ Installed 26 packages in 447ms
4645
  <span onclick="toggleOutput('yamoe_run')" style="cursor: pointer;">▼ output</span>
4646
  <span id="uv-indicator-yamoe_run" onclick="toggleUvLogsFromHeader('yamoe_run')" style="cursor: pointer;">▶ uv-logs</span>
4647
  </span> |
4648
- Cell: yamoe_run | deps: torch, kernels, numpy | 38.93s
4649
  | <button class="run-btn" onclick="runCell('yamoe_run')">▶ run</button>
4650
  <button class="copy-btn" onclick="copyCell('yamoe_run')">Copy</button>
4651
  <a href="cells/yamoe_run.py" target="_blank" class="raw-btn">Raw</a>
@@ -4938,9 +4938,9 @@ Input Variation: +0.001 * iteration (deterministic)
4938
 
4939
  Warming up (10 iterations)...
4940
  Benchmarking (50 iterations)...
4941
- Progress: 20% complete (avg: 4.247 ms)
4942
- Progress: 40% complete (avg: 4.247 ms)
4943
- Progress: 60% complete (avg: 4.250 ms)
4944
  Progress: 80% complete (avg: 4.249 ms)
4945
 
4946
  Output tensors:
@@ -4951,19 +4951,19 @@ Output tensors:
4951
  Iterations: 50
4952
 
4953
  Latency Statistics:
4954
- Average: 4.249 ms
4955
- Min: 4.131 ms
4956
- Max: 4.305 ms
4957
- Std Dev: 0.028 ms
4958
 
4959
  Percentiles:
4960
- P50 (median): 4.250 ms
4961
- P95: 4.289 ms
4962
- P99: 4.300 ms
4963
 
4964
  Throughput:
4965
- Tokens/sec: 23533.0
4966
- Std Dev: 154.4
4967
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
4968
 
4969
  Saved benchmark results to yamoe_results.json
@@ -4973,25 +4973,25 @@ Output sum: 3.971905
4973
  <div class="uv-install-logs" id="uv-logs-yamoe_run">
4974
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4975
  <div class="uv-logs-content" style="display: none;">
4976
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
 
4977
  Downloading networkx (1.9MiB)
4978
- Downloading numpy (16.2MiB)
 
 
 
 
4979
  Downloading nvidia-cusolver-cu12 (255.1MiB)
 
4980
  Downloading torch (846.9MiB)
 
 
4981
  Downloading nvidia-curand-cu12 (60.7MiB)
4982
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4983
- Downloading setuptools (1.1MiB)
4984
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4985
- Downloading nvidia-nccl-cu12 (307.4MiB)
4986
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4987
- Downloading hf-xet (3.0MiB)
4988
- Downloading nvidia-cufft-cu12 (184.2MiB)
4989
- Downloading nvidia-cufile-cu12 (1.1MiB)
4990
  Downloading nvidia-cublas-cu12 (566.8MiB)
 
4991
  Downloading triton (148.3MiB)
4992
- Downloading sympy (6.0MiB)
4993
- Downloading nvidia-cudnn-cu12 (674.0MiB)
4994
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4995
  Downloading nvidia-cufile-cu12
4996
  Downloading hf-xet
4997
  Downloading setuptools
@@ -5011,14 +5011,13 @@ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
5011
  Downloading nvidia-cublas-cu12
5012
  Downloading nvidia-cudnn-cu12
5013
  Downloading torch
5014
- Installed 37 packages in 451ms
5015
  </div>
5016
  </div>
5017
  <div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00&lt;?, ?it/s]
5018
- Fetching 6 files: 17%|█▋ | 1/6 [00:00&lt;00:00, 5.17it/s]
5019
- Fetching 6 files: 33%|███▎ | 2/6 [00:00&lt;00:01, 3.77it/s]
5020
- Fetching 6 files: 50%|█████ | 3/6 [00:00&lt;00:00, 4.24it/s]
5021
- Fetching 6 files: 100%|██████████| 6/6 [00:00&lt;00:00, 8.45it/s]</div>
5022
  <div class="cell-artifacts">
5023
  <h4>Artifacts:</h4>
5024
  <a href="artifacts/yamoe_run/yamoe_results.json" class="artifact" target="_blank">yamoe_results.json</a>
@@ -5035,7 +5034,7 @@ Fetching 6 files: 100%|██████████| 6/6 [00:00&lt;00:00, 8.4
5035
  <span onclick="toggleOutput('binned_run')" style="cursor: pointer;">▼ output</span>
5036
  <span id="uv-indicator-binned_run" onclick="toggleUvLogsFromHeader('binned_run')" style="cursor: pointer;">▶ uv-logs</span>
5037
  </span> |
5038
- Cell: binned_run | deps: torch, numpy | 39.10s
5039
  | <button class="run-btn" onclick="runCell('binned_run')">▶ run</button>
5040
  <button class="copy-btn" onclick="copyCell('binned_run')">Copy</button>
5041
  <a href="cells/binned_run.py" target="_blank" class="raw-btn">Raw</a>
@@ -5449,10 +5448,10 @@ Input Variation: +0.001 * iteration (deterministic)
5449
 
5450
  Warming up (10 iterations)...
5451
  Benchmarking (50 iterations)...
5452
- Progress: 20% complete (avg: 38.434 ms)
5453
- Progress: 40% complete (avg: 38.074 ms)
5454
- Progress: 60% complete (avg: 37.541 ms)
5455
- Progress: 80% complete (avg: 36.952 ms)
5456
 
5457
  Output tensors:
5458
  Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
@@ -5462,19 +5461,19 @@ Output tensors:
5462
  Iterations: 50
5463
 
5464
  Latency Statistics:
5465
- Average: 36.479 ms
5466
- Min: 33.550 ms
5467
- Max: 39.617 ms
5468
- Std Dev: 1.587 ms
5469
 
5470
  Percentiles:
5471
- P50 (median): 36.436 ms
5472
- P95: 39.168 ms
5473
- P99: 39.480 ms
5474
 
5475
  Throughput:
5476
- Tokens/sec: 2741.3
5477
- Std Dev: 119.0
5478
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
5479
 
5480
  Saved benchmark results to binned_results.json
@@ -5484,24 +5483,24 @@ Output sum: 3.971905
5484
  <div class="uv-install-logs" id="uv-logs-binned_run">
5485
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
5486
  <div class="uv-logs-content" style="display: none;">
5487
- Downloading setuptools (1.1MiB)
5488
- Downloading nvidia-cufile-cu12 (1.1MiB)
5489
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
5490
- Downloading nvidia-curand-cu12 (60.7MiB)
5491
  Downloading sympy (6.0MiB)
5492
- Downloading nvidia-cusparse-cu12 (274.9MiB)
 
 
 
 
5493
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
5494
  Downloading nvidia-cublas-cu12 (566.8MiB)
5495
- Downloading triton (148.3MiB)
5496
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
5497
  Downloading nvidia-cusolver-cu12 (255.1MiB)
5498
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
5499
- Downloading nvidia-cudnn-cu12 (674.0MiB)
5500
  Downloading nvidia-cufft-cu12 (184.2MiB)
5501
- Downloading networkx (1.9MiB)
5502
  Downloading nvidia-nccl-cu12 (307.4MiB)
 
5503
  Downloading torch (846.9MiB)
5504
- Downloading numpy (16.2MiB)
5505
  Downloading nvidia-cufile-cu12
5506
  Downloading setuptools
5507
  Downloading networkx
@@ -5520,7 +5519,7 @@ Downloading numpy (16.2MiB)
5520
  Downloading nvidia-cublas-cu12
5521
  Downloading nvidia-cudnn-cu12
5522
  Downloading torch
5523
- Installed 26 packages in 453ms
5524
  </div>
5525
  </div>
5526
  <div class="cell-artifacts">
@@ -5539,7 +5538,7 @@ Installed 26 packages in 453ms
5539
  <span onclick="toggleOutput('gptoss_run')" style="cursor: pointer;">▼ output</span>
5540
  <span id="uv-indicator-gptoss_run" onclick="toggleUvLogsFromHeader('gptoss_run')" style="cursor: pointer;">▶ uv-logs</span>
5541
  </span> |
5542
- Cell: gptoss_run | deps: torch, numpy | 39.59s
5543
  | <button class="run-btn" onclick="runCell('gptoss_run')">▶ run</button>
5544
  <button class="copy-btn" onclick="copyCell('gptoss_run')">Copy</button>
5545
  <a href="cells/gptoss_run.py" target="_blank" class="raw-btn">Raw</a>
@@ -5857,10 +5856,10 @@ Input Variation: +0.001 * iteration (deterministic)
5857
 
5858
  Warming up (10 iterations)...
5859
  Benchmarking (50 iterations)...
5860
- Progress: 20% complete (avg: 48.022 ms)
5861
- Progress: 40% complete (avg: 47.956 ms)
5862
- Progress: 60% complete (avg: 47.209 ms)
5863
- Progress: 80% complete (avg: 46.045 ms)
5864
 
5865
  Output tensors:
5866
  Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
@@ -5870,19 +5869,19 @@ Output tensors:
5870
  Iterations: 50
5871
 
5872
  Latency Statistics:
5873
- Average: 45.011 ms
5874
- Min: 39.029 ms
5875
- Max: 49.295 ms
5876
- Std Dev: 2.980 ms
5877
 
5878
  Percentiles:
5879
- P50 (median): 45.672 ms
5880
- P95: 48.489 ms
5881
- P99: 49.056 ms
5882
 
5883
  Throughput:
5884
- Tokens/sec: 2221.7
5885
- Std Dev: 151.3
5886
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
5887
 
5888
  Saved benchmark results to gptoss_results.json
@@ -5892,24 +5891,24 @@ Output sum: 11.532237
5892
  <div class="uv-install-logs" id="uv-logs-gptoss_run">
5893
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
5894
  <div class="uv-logs-content" style="display: none;">
 
 
 
 
 
5895
  Downloading sympy (6.0MiB)
5896
- Downloading nvidia-cusparse-cu12 (274.9MiB)
5897
  Downloading nvidia-nccl-cu12 (307.4MiB)
5898
- Downloading nvidia-cufile-cu12 (1.1MiB)
5899
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
5900
- Downloading networkx (1.9MiB)
5901
- Downloading nvidia-curand-cu12 (60.7MiB)
5902
  Downloading nvidia-cudnn-cu12 (674.0MiB)
5903
  Downloading nvidia-cufft-cu12 (184.2MiB)
 
 
5904
  Downloading triton (148.3MiB)
5905
- Downloading torch (846.9MiB)
5906
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
5907
- Downloading nvidia-cusolver-cu12 (255.1MiB)
5908
  Downloading nvidia-cublas-cu12 (566.8MiB)
5909
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
5910
- Downloading setuptools (1.1MiB)
5911
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
5912
- Downloading numpy (16.2MiB)
5913
  Downloading nvidia-cufile-cu12
5914
  Downloading setuptools
5915
  Downloading networkx
@@ -5928,7 +5927,7 @@ Downloading numpy (16.2MiB)
5928
  Downloading nvidia-cublas-cu12
5929
  Downloading nvidia-cudnn-cu12
5930
  Downloading torch
5931
- Installed 26 packages in 451ms
5932
  </div>
5933
  </div>
5934
  <div class="cell-artifacts">
@@ -5947,7 +5946,7 @@ Installed 26 packages in 451ms
5947
  <span onclick="toggleOutput('gptoss_training_run')" style="cursor: pointer;">▼ output</span>
5948
  <span id="uv-indicator-gptoss_training_run" onclick="toggleUvLogsFromHeader('gptoss_training_run')" style="cursor: pointer;">▶ uv-logs</span>
5949
  </span> |
5950
- Cell: gptoss_training_run | deps: torch, numpy | 39.07s
5951
  | <button class="run-btn" onclick="runCell('gptoss_training_run')">▶ run</button>
5952
  <button class="copy-btn" onclick="copyCell('gptoss_training_run')">Copy</button>
5953
  <a href="cells/gptoss_training_run.py" target="_blank" class="raw-btn">Raw</a>
@@ -6248,10 +6247,10 @@ Input Variation: +0.001 * iteration (deterministic)
6248
 
6249
  Warming up (10 iterations)...
6250
  Benchmarking (50 iterations)...
6251
- Progress: 20% complete (avg: 48.048 ms)
6252
- Progress: 40% complete (avg: 47.576 ms)
6253
- Progress: 60% complete (avg: 46.769 ms)
6254
- Progress: 80% complete (avg: 45.726 ms)
6255
 
6256
  Output tensors:
6257
  Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
@@ -6261,19 +6260,19 @@ Output tensors:
6261
  Iterations: 50
6262
 
6263
  Latency Statistics:
6264
- Average: 44.679 ms
6265
- Min: 38.109 ms
6266
- Max: 49.008 ms
6267
- Std Dev: 2.899 ms
6268
 
6269
  Percentiles:
6270
- P50 (median): 45.400 ms
6271
- P95: 48.408 ms
6272
- P99: 48.790 ms
6273
 
6274
  Throughput:
6275
- Tokens/sec: 2238.2
6276
- Std Dev: 150.3
6277
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
6278
 
6279
  Saved benchmark results to gptoss_training_results.json
@@ -6283,24 +6282,24 @@ Output sum: 11.532237
6283
  <div class="uv-install-logs" id="uv-logs-gptoss_training_run">
6284
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
6285
  <div class="uv-logs-content" style="display: none;">
6286
- Downloading nvidia-cufft-cu12 (184.2MiB)
 
 
6287
  Downloading numpy (16.2MiB)
6288
- Downloading nvidia-cublas-cu12 (566.8MiB)
 
6289
  Downloading nvidia-cudnn-cu12 (674.0MiB)
6290
- Downloading nvidia-cufile-cu12 (1.1MiB)
6291
  Downloading nvidia-cusolver-cu12 (255.1MiB)
6292
- Downloading triton (148.3MiB)
6293
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
6294
  Downloading nvidia-cusparse-cu12 (274.9MiB)
6295
- Downloading networkx (1.9MiB)
6296
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
6297
- Downloading setuptools (1.1MiB)
6298
- Downloading sympy (6.0MiB)
6299
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
6300
- Downloading nvidia-curand-cu12 (60.7MiB)
6301
- Downloading nvidia-nccl-cu12 (307.4MiB)
 
 
6302
  Downloading torch (846.9MiB)
6303
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
6304
  Downloading nvidia-cufile-cu12
6305
  Downloading setuptools
6306
  Downloading networkx
@@ -6319,7 +6318,7 @@ Downloading nvidia-cusparselt-cu12 (273.9MiB)
6319
  Downloading nvidia-cublas-cu12
6320
  Downloading nvidia-cudnn-cu12
6321
  Downloading torch
6322
- Installed 26 packages in 449ms
6323
  </div>
6324
  </div>
6325
  <div class="cell-artifacts">
@@ -6338,7 +6337,7 @@ Installed 26 packages in 449ms
6338
  <span onclick="toggleOutput('megablocks_run')" style="cursor: pointer;">▼ output</span>
6339
  <span id="uv-indicator-megablocks_run" onclick="toggleUvLogsFromHeader('megablocks_run')" style="cursor: pointer;">▶ uv-logs</span>
6340
  </span> |
6341
- Cell: megablocks_run | deps: torch, numpy, kernels | 40.94s | FAILED
6342
  | <button class="run-btn" onclick="runCell('megablocks_run')">▶ run</button>
6343
  <button class="copy-btn" onclick="copyCell('megablocks_run')">Copy</button>
6344
  <a href="cells/megablocks_run.py" target="_blank" class="raw-btn">Raw</a>
@@ -6493,7 +6492,7 @@ Cell: megablocks_run | deps: torch, numpy, kernels | 40.94s | FAILED
6493
  <span class="c1"># Attach loaded expert weights to the experts container</span>
6494
  <span class="n">e</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">experts</span>
6495
  <span class="n">e</span><span class="o">.</span><span class="n">alpha</span> <span class="o">=</span> <span class="mf">1.702</span>
6496
- <span class="n">e</span><span class="o">.</span><span class="n">capacity_factor</span> <span class="o">=</span> <span class="mi">128</span>
6497
  <span class="n">e</span><span class="o">.</span><span class="n">gate_up_proj</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">gate_up_proj</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">))</span>
6498
  <span class="n">e</span><span class="o">.</span><span class="n">gate_up_proj_bias</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">gate_up_proj_bias</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">))</span>
6499
  <span class="n">e</span><span class="o">.</span><span class="n">down_proj</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">down_proj</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">))</span>
@@ -6570,25 +6569,25 @@ Warming up (10 iterations)...
6570
  <div class="uv-install-logs" id="uv-logs-megablocks_run">
6571
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
6572
  <div class="uv-logs-content" style="display: none;">
 
 
 
6573
  Downloading networkx (1.9MiB)
6574
- Downloading nvidia-cufile-cu12 (1.1MiB)
6575
- Downloading nvidia-cusolver-cu12 (255.1MiB)
6576
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
6577
- Downloading nvidia-cusparse-cu12 (274.9MiB)
6578
- Downloading triton (148.3MiB)
6579
- Downloading numpy (16.2MiB)
6580
- Downloading nvidia-cufft-cu12 (184.2MiB)
6581
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
6582
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
6583
- Downloading nvidia-cublas-cu12 (566.8MiB)
6584
- Downloading nvidia-cudnn-cu12 (674.0MiB)
6585
- Downloading nvidia-curand-cu12 (60.7MiB)
6586
- Downloading sympy (6.0MiB)
6587
- Downloading setuptools (1.1MiB)
6588
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
6589
  Downloading nvidia-nccl-cu12 (307.4MiB)
 
6590
  Downloading hf-xet (3.0MiB)
6591
- Downloading torch (846.9MiB)
 
 
 
 
 
6592
  Downloading nvidia-cufile-cu12
6593
  Downloading hf-xet
6594
  Downloading setuptools
@@ -6608,22 +6607,20 @@ Downloading torch (846.9MiB)
6608
  Downloading nvidia-cublas-cu12
6609
  Downloading nvidia-cudnn-cu12
6610
  Downloading torch
6611
- Installed 37 packages in 449ms
6612
  </div>
6613
  </div>
6614
  <div class="cell-stderr">Fetching 66 files: 0%| | 0/66 [00:00&lt;?, ?it/s]
6615
- Fetching 66 files: 2%|▏ | 1/66 [00:00&lt;00:25, 2.52it/s]
6616
- Fetching 66 files: 6%|▌ | 4/66 [00:00&lt;00:06, 9.13it/s]
6617
- Fetching 66 files: 15%|█▌ | 10/66 [00:00&lt;00:03, 16.75it/s]
6618
- Fetching 66 files: 21%|██ | 14/66 [00:00&lt;00:02, 19.64it/s]
6619
- Fetching 66 files: 26%|██▌ | 17/66 [00:01&lt;00:04, 12.25it/s]
6620
- Fetching 66 files: 45%|████▌ | 30/66 [00:01&lt;00:01, 26.10it/s]
6621
- Fetching 66 files: 59%|█████▉ | 39/66 [00:01&lt;00:00, 27.77it/s]
6622
- Fetching 66 files: 76%|███████▌ | 50/66 [00:01&lt;00:00, 38.40it/s]
6623
- Fetching 66 files: 85%|████████▍ | 56/66 [00:02&lt;00:00, 40.98it/s]
6624
- Fetching 66 files: 94%|█████████▍| 62/66 [00:02&lt;00:00, 36.13it/s]
6625
- Fetching 66 files: 100%|██████████| 66/66 [00:02&lt;00:00, 27.20it/s]
6626
- /tmp/tmps8crtj9h/cuda_utils.c:5:10: fatal error: Python.h: No such file or directory
6627
  5 | #include &lt;Python.h&gt;
6628
  | ^~~~~~~~~~
6629
  compilation terminated.
@@ -6640,87 +6637,87 @@ Traceback (most recent call last):
6640
  File &quot;/repo/moe_benchmarks/megablocks_yamoe/.uvnote/cells/bench_utils.py&quot;, line 177, in &lt;lambda&gt;
6641
  call = lambda x: fn(x, *args[1:], **kwargs)
6642
  ^^^^^^^^^^^^^^^^^^^^^^^^^^
6643
- File &quot;/tmp/uvnote-run-_d5r222t/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py&quot;, line 1773, in _wrapped_call_impl
6644
  return self._call_impl(*args, **kwargs)
6645
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6646
- File &quot;/tmp/uvnote-run-_d5r222t/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py&quot;, line 1784, in _call_impl
6647
  return forward_call(*args, **kwargs)
6648
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6649
  File &quot;/repo/moe_benchmarks/megablocks_yamoe/.uvnote/cells/megablocks_run.py&quot;, line 81, in forward
6650
  output, dummy_routing_weights = self.model(hidden_states)
6651
  ^^^^^^^^^^^^^^^^^^^^^^^^^
6652
- File &quot;/tmp/uvnote-run-_d5r222t/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py&quot;, line 1773, in _wrapped_call_impl
6653
  return self._call_impl(*args, **kwargs)
6654
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6655
- File &quot;/tmp/uvnote-run-_d5r222t/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py&quot;, line 1784, in _call_impl
6656
  return forward_call(*args, **kwargs)
6657
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6658
- File &quot;/tmp/uvnote-run-_d5r222t/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py&quot;, line 896, in forward
6659
  output, expert_weights_out, *_ = moe_forward(
6660
  ^^^^^^^^^^^^
6661
- File &quot;/tmp/uvnote-run-_d5r222t/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py&quot;, line 730, in moe_forward
6662
  x, tokens_per_expert = forward_fn(**forward_args)
6663
  ^^^^^^^^^^^^^^^^^^^^^^^^^^
6664
- File &quot;/tmp/uvnote-run-_d5r222t/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py&quot;, line 457, in forward_once
6665
  x = permute_and_compute(
6666
  ^^^^^^^^^^^^^^^^^^^^
6667
- File &quot;/tmp/uvnote-run-_d5r222t/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py&quot;, line 401, in permute_and_compute
6668
  x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
6669
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6670
- File &quot;/tmp/uvnote-run-_d5r222t/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/autograd/function.py&quot;, line 576, in apply
6671
  return super().apply(*args, **kwargs) # type: ignore[misc]
6672
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6673
- File &quot;/tmp/uvnote-run-_d5r222t/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/ops/stk_autocast.py&quot;, line 30, in decorate_fwd
6674
  return fwd(*args, **kwargs)
6675
  ^^^^^^^^^^^^^^^^^^^^
6676
- File &quot;/tmp/uvnote-run-_d5r222t/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/ops/binned_gather.py&quot;, line 26, in forward
6677
  return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
6678
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6679
- File &quot;/tmp/uvnote-run-_d5r222t/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/backend/kernels.py&quot;, line 419, in binned_gather
6680
  _binned_copy[(num_experts, expert_capacity)](
6681
- File &quot;/tmp/uvnote-run-_d5r222t/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/jit.py&quot;, line 390, in &lt;lambda&gt;
6682
  return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
6683
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6684
- File &quot;/tmp/uvnote-run-_d5r222t/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py&quot;, line 239, in run
6685
  benchmark()
6686
- File &quot;/tmp/uvnote-run-_d5r222t/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py&quot;, line 228, in benchmark
6687
  timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
6688
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6689
- File &quot;/tmp/uvnote-run-_d5r222t/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py&quot;, line 228, in &lt;dictcomp&gt;
6690
  timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
6691
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6692
- File &quot;/tmp/uvnote-run-_d5r222t/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py&quot;, line 160, in _bench
6693
  return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))
6694
  ^^^^^^^^^^^^^
6695
  File &quot;/usr/lib/python3.11/functools.py&quot;, line 1001, in __get__
6696
  val = self.func(instance)
6697
  ^^^^^^^^^^^^^^^^^^^
6698
- File &quot;/tmp/uvnote-run-_d5r222t/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py&quot;, line 121, in do_bench
6699
  return driver.active.get_benchmarker()
6700
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6701
- File &quot;/tmp/uvnote-run-_d5r222t/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/driver.py&quot;, line 30, in __getattr__
6702
  return getattr(self._initialize_obj(), name)
6703
  ^^^^^^^^^^^^^^^^^^^^^^
6704
- File &quot;/tmp/uvnote-run-_d5r222t/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/driver.py&quot;, line 26, in _initialize_obj
6705
  self._obj = self._init_fn()
6706
  ^^^^^^^^^^^^^^^
6707
- File &quot;/tmp/uvnote-run-_d5r222t/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/driver.py&quot;, line 12, in _create_driver
6708
  return active_drivers[0]()
6709
  ^^^^^^^^^^^^^^^^^^^
6710
- File &quot;/tmp/uvnote-run-_d5r222t/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/driver.py&quot;, line 715, in __init__
6711
  self.utils = CudaUtils() # TODO: make static
6712
  ^^^^^^^^^^^
6713
- File &quot;/tmp/uvnote-run-_d5r222t/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/driver.py&quot;, line 62, in __init__
6714
  mod = compile_module_from_src(
6715
  ^^^^^^^^^^^^^^^^^^^^^^^^
6716
- File &quot;/tmp/uvnote-run-_d5r222t/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/build.py&quot;, line 88, in compile_module_from_src
6717
  so = _build(name, src_path, tmpdir, library_dirs or [], include_dirs or [], libraries or [])
6718
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6719
- File &quot;/tmp/uvnote-run-_d5r222t/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/build.py&quot;, line 51, in _build
6720
  subprocess.check_call(cc_cmd, stdout=subprocess.DEVNULL)
6721
  File &quot;/usr/lib/python3.11/subprocess.py&quot;, line 413, in check_call
6722
  raise CalledProcessError(retcode, cmd)
6723
- subprocess.CalledProcessError: Command &#x27;[&#x27;/usr/bin/gcc&#x27;, &#x27;/tmp/tmps8crtj9h/cuda_utils.c&#x27;, &#x27;-O3&#x27;, &#x27;-shared&#x27;, &#x27;-fPIC&#x27;, &#x27;-Wno-psabi&#x27;, &#x27;-o&#x27;, &#x27;/tmp/tmps8crtj9h/cuda_utils.cpython-311-x86_64-linux-gnu.so&#x27;, &#x27;-lcuda&#x27;, &#x27;-L/tmp/uvnote-run-_d5r222t/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/lib&#x27;, &#x27;-L/usr/lib/x86_64-linux-gnu&#x27;, &#x27;-I/tmp/uvnote-run-_d5r222t/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/include&#x27;, &#x27;-I/tmp/tmps8crtj9h&#x27;, &#x27;-I/usr/include/python3.11&#x27;]&#x27; returned non-zero exit status 1.</div>
6724
  </div>
6725
  </div>
6726
 
 
3720
  <span onclick="toggleOutput('utils')" style="cursor: pointer;">▼ output</span>
3721
  <span id="uv-indicator-utils" onclick="toggleUvLogsFromHeader('utils')" style="cursor: pointer;">▶ uv-logs</span>
3722
  </span> |
3723
+ Cell: utils | deps: torch, numpy | 34.25s
3724
  | <button class="run-btn" onclick="runCell('utils')">▶ run</button>
3725
  <button class="copy-btn" onclick="copyCell('utils')">Copy</button>
3726
  <a href="cells/utils.py" target="_blank" class="raw-btn">Raw</a>
 
3794
  <div class="uv-install-logs" id="uv-logs-utils">
3795
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
3796
  <div class="uv-logs-content" style="display: none;">
 
 
3797
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
 
3798
  Downloading setuptools (1.1MiB)
3799
+ Downloading nvidia-cufft-cu12 (184.2MiB)
3800
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
3801
  Downloading nvidia-cufile-cu12 (1.1MiB)
3802
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
3803
  Downloading numpy (16.2MiB)
 
 
3804
  Downloading nvidia-cudnn-cu12 (674.0MiB)
3805
+ Downloading nvidia-cublas-cu12 (566.8MiB)
 
3806
  Downloading nvidia-curand-cu12 (60.7MiB)
3807
+ Downloading nvidia-nccl-cu12 (307.4MiB)
3808
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
3809
+ Downloading sympy (6.0MiB)
3810
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
3811
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
3812
  Downloading triton (148.3MiB)
3813
+ Downloading torch (846.9MiB)
3814
+ Downloading networkx (1.9MiB)
3815
  Downloading nvidia-cufile-cu12
3816
  Downloading setuptools
3817
  Downloading networkx
 
3824
  Downloading triton
3825
  Downloading nvidia-cufft-cu12
3826
  Downloading nvidia-cusolver-cu12
 
3827
  Downloading nvidia-cusparselt-cu12
3828
+ Downloading nvidia-cusparse-cu12
3829
  Downloading nvidia-nccl-cu12
3830
  Downloading nvidia-cublas-cu12
3831
  Downloading nvidia-cudnn-cu12
3832
  Downloading torch
3833
+ Installed 26 packages in 446ms
3834
  </div>
3835
  </div>
3836
  </div>
 
3843
  <span onclick="toggleOutput('bench_utils')" style="cursor: pointer;">▼ output</span>
3844
  <span id="uv-indicator-bench_utils" onclick="toggleUvLogsFromHeader('bench_utils')" style="cursor: pointer;">▶ uv-logs</span>
3845
  </span> |
3846
+ Cell: bench_utils | deps: torch, numpy | 35.45s
3847
  | <button class="run-btn" onclick="runCell('bench_utils')">▶ run</button>
3848
  <button class="copy-btn" onclick="copyCell('bench_utils')">Copy</button>
3849
  <a href="cells/bench_utils.py" target="_blank" class="raw-btn">Raw</a>
 
4331
  <div class="uv-install-logs" id="uv-logs-bench_utils">
4332
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4333
  <div class="uv-logs-content" style="display: none;">
4334
+ Downloading numpy (16.2MiB)
4335
+ Downloading torch (846.9MiB)
4336
+ Downloading triton (148.3MiB)
4337
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4338
+ Downloading nvidia-curand-cu12 (60.7MiB)
4339
  Downloading setuptools (1.1MiB)
4340
  Downloading nvidia-cufft-cu12 (184.2MiB)
 
 
 
 
 
 
 
4341
  Downloading nvidia-cufile-cu12 (1.1MiB)
4342
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4343
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4344
+ Downloading nvidia-nccl-cu12 (307.4MiB)
4345
+ Downloading sympy (6.0MiB)
4346
+ Downloading networkx (1.9MiB)
4347
  Downloading nvidia-cudnn-cu12 (674.0MiB)
 
 
4348
  Downloading nvidia-cusolver-cu12 (255.1MiB)
4349
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4350
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
4351
+ Downloading nvidia-cublas-cu12 (566.8MiB)
4352
  Downloading nvidia-cufile-cu12
4353
  Downloading setuptools
4354
  Downloading networkx
 
4361
  Downloading triton
4362
  Downloading nvidia-cufft-cu12
4363
  Downloading nvidia-cusolver-cu12
 
4364
  Downloading nvidia-cusparselt-cu12
4365
+ Downloading nvidia-cusparse-cu12
4366
  Downloading nvidia-nccl-cu12
4367
  Downloading nvidia-cublas-cu12
4368
  Downloading nvidia-cudnn-cu12
 
4381
  <span onclick="toggleOutput('config')" style="cursor: pointer;">▼ output</span>
4382
  <span id="uv-indicator-config" onclick="toggleUvLogsFromHeader('config')" style="cursor: pointer;">▶ uv-logs</span>
4383
  </span> |
4384
+ Cell: config | deps: torch, numpy | 34.31s
4385
  | <button class="run-btn" onclick="runCell('config')">▶ run</button>
4386
  <button class="copy-btn" onclick="copyCell('config')">Copy</button>
4387
  <a href="cells/config.py" target="_blank" class="raw-btn">Raw</a>
 
4442
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4443
  <div class="uv-logs-content" style="display: none;">
4444
  Downloading sympy (6.0MiB)
 
 
4445
  Downloading nvidia-cufile-cu12 (1.1MiB)
 
 
4446
  Downloading nvidia-nccl-cu12 (307.4MiB)
 
4447
  Downloading nvidia-cudnn-cu12 (674.0MiB)
4448
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4449
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
4450
+ Downloading torch (846.9MiB)
4451
+ Downloading networkx (1.9MiB)
 
 
4452
  Downloading nvidia-cufft-cu12 (184.2MiB)
4453
+ Downloading setuptools (1.1MiB)
4454
+ Downloading nvidia-curand-cu12 (60.7MiB)
4455
+ Downloading nvidia-cublas-cu12 (566.8MiB)
4456
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4457
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4458
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4459
+ Downloading numpy (16.2MiB)
4460
  Downloading triton (148.3MiB)
4461
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
4462
  Downloading nvidia-cufile-cu12
4463
  Downloading setuptools
4464
  Downloading networkx
 
4471
  Downloading triton
4472
  Downloading nvidia-cufft-cu12
4473
  Downloading nvidia-cusolver-cu12
 
4474
  Downloading nvidia-cusparse-cu12
4475
+ Downloading nvidia-cusparselt-cu12
4476
  Downloading nvidia-nccl-cu12
4477
  Downloading nvidia-cublas-cu12
4478
  Downloading nvidia-cudnn-cu12
4479
  Downloading torch
4480
+ Installed 26 packages in 450ms
4481
  </div>
4482
  </div>
4483
  </div>
 
4490
  <span onclick="toggleOutput('save_data')" style="cursor: pointer;">▼ output</span>
4491
  <span id="uv-indicator-save_data" onclick="toggleUvLogsFromHeader('save_data')" style="cursor: pointer;">▶ uv-logs</span>
4492
  </span> |
4493
+ Cell: save_data | deps: torch, numpy | 39.54s
4494
  | <button class="run-btn" onclick="runCell('save_data')">▶ run</button>
4495
  <button class="copy-btn" onclick="copyCell('save_data')">Copy</button>
4496
  <a href="cells/save_data.py" target="_blank" class="raw-btn">Raw</a>
 
4585
  <div class="uv-install-logs" id="uv-logs-save_data">
4586
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4587
  <div class="uv-logs-content" style="display: none;">
4588
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
 
 
 
 
 
 
4589
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4590
+ Downloading numpy (16.2MiB)
4591
+ Downloading setuptools (1.1MiB)
4592
  Downloading nvidia-cublas-cu12 (566.8MiB)
 
 
4593
  Downloading nvidia-cudnn-cu12 (674.0MiB)
 
 
 
4594
  Downloading nvidia-curand-cu12 (60.7MiB)
4595
  Downloading nvidia-cusparse-cu12 (274.9MiB)
4596
+ Downloading nvidia-nccl-cu12 (307.4MiB)
4597
+ Downloading nvidia-cufft-cu12 (184.2MiB)
4598
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4599
+ Downloading nvidia-cufile-cu12 (1.1MiB)
4600
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
4601
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4602
+ Downloading sympy (6.0MiB)
4603
+ Downloading torch (846.9MiB)
4604
+ Downloading networkx (1.9MiB)
4605
+ Downloading triton (148.3MiB)
4606
  Downloading nvidia-cufile-cu12
4607
  Downloading setuptools
4608
  Downloading networkx
 
4621
  Downloading nvidia-cublas-cu12
4622
  Downloading nvidia-cudnn-cu12
4623
  Downloading torch
4624
+ Installed 26 packages in 446ms
4625
  </div>
4626
  </div>
4627
  <div class="cell-artifacts">
4628
  <h4>Artifacts:</h4>
 
 
 
4629
  <a href="artifacts/save_data/gate_up_proj_bias.pt" class="artifact" target="_blank">gate_up_proj_bias.pt</a>
4630
+ <a href="artifacts/save_data/down_proj_bias.pt" class="artifact" target="_blank">down_proj_bias.pt</a>
4631
  <a href="artifacts/save_data/down_proj.pt" class="artifact" target="_blank">down_proj.pt</a>
4632
+ <a href="artifacts/save_data/router_weight.pt" class="artifact" target="_blank">router_weight.pt</a>
4633
+ <a href="artifacts/save_data/router_bias.pt" class="artifact" target="_blank">router_bias.pt</a>
4634
  <a href="artifacts/save_data/gate_up_proj.pt" class="artifact" target="_blank">gate_up_proj.pt</a>
4635
  </div>
4636
  </div>
 
4645
  <span onclick="toggleOutput('yamoe_run')" style="cursor: pointer;">▼ output</span>
4646
  <span id="uv-indicator-yamoe_run" onclick="toggleUvLogsFromHeader('yamoe_run')" style="cursor: pointer;">▶ uv-logs</span>
4647
  </span> |
4648
+ Cell: yamoe_run | deps: torch, kernels, numpy | 39.10s
4649
  | <button class="run-btn" onclick="runCell('yamoe_run')">▶ run</button>
4650
  <button class="copy-btn" onclick="copyCell('yamoe_run')">Copy</button>
4651
  <a href="cells/yamoe_run.py" target="_blank" class="raw-btn">Raw</a>
 
4938
 
4939
  Warming up (10 iterations)...
4940
  Benchmarking (50 iterations)...
4941
+ Progress: 20% complete (avg: 4.251 ms)
4942
+ Progress: 40% complete (avg: 4.248 ms)
4943
+ Progress: 60% complete (avg: 4.248 ms)
4944
  Progress: 80% complete (avg: 4.249 ms)
4945
 
4946
  Output tensors:
 
4951
  Iterations: 50
4952
 
4953
  Latency Statistics:
4954
+ Average: 4.250 ms
4955
+ Min: 4.144 ms
4956
+ Max: 4.276 ms
4957
+ Std Dev: 0.020 ms
4958
 
4959
  Percentiles:
4960
+ P50 (median): 4.252 ms
4961
+ P95: 4.269 ms
4962
+ P99: 4.276 ms
4963
 
4964
  Throughput:
4965
+ Tokens/sec: 23531.6
4966
+ Std Dev: 113.9
4967
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
4968
 
4969
  Saved benchmark results to yamoe_results.json
 
4973
  <div class="uv-install-logs" id="uv-logs-yamoe_run">
4974
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4975
  <div class="uv-logs-content" style="display: none;">
4976
+ Downloading hf-xet (3.0MiB)
4977
+ Downloading nvidia-nccl-cu12 (307.4MiB)
4978
  Downloading networkx (1.9MiB)
4979
+ Downloading nvidia-cufft-cu12 (184.2MiB)
4980
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
4981
+ Downloading setuptools (1.1MiB)
4982
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
4983
+ Downloading nvidia-cufile-cu12 (1.1MiB)
4984
  Downloading nvidia-cusolver-cu12 (255.1MiB)
4985
+ Downloading numpy (16.2MiB)
4986
  Downloading torch (846.9MiB)
4987
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4988
+ Downloading sympy (6.0MiB)
4989
  Downloading nvidia-curand-cu12 (60.7MiB)
 
 
 
 
 
 
 
 
4990
  Downloading nvidia-cublas-cu12 (566.8MiB)
4991
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4992
  Downloading triton (148.3MiB)
4993
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
4994
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
 
4995
  Downloading nvidia-cufile-cu12
4996
  Downloading hf-xet
4997
  Downloading setuptools
 
5011
  Downloading nvidia-cublas-cu12
5012
  Downloading nvidia-cudnn-cu12
5013
  Downloading torch
5014
+ Installed 37 packages in 454ms
5015
  </div>
5016
  </div>
5017
  <div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00&lt;?, ?it/s]
5018
+ Fetching 6 files: 17%|█▋ | 1/6 [00:00&lt;00:01, 3.47it/s]
5019
+ Fetching 6 files: 50%|█████ | 3/6 [00:00&lt;00:00, 4.22it/s]
5020
+ Fetching 6 files: 100%|██████████| 6/6 [00:00&lt;00:00, 8.26it/s]</div>
 
5021
  <div class="cell-artifacts">
5022
  <h4>Artifacts:</h4>
5023
  <a href="artifacts/yamoe_run/yamoe_results.json" class="artifact" target="_blank">yamoe_results.json</a>
 
5034
  <span onclick="toggleOutput('binned_run')" style="cursor: pointer;">▼ output</span>
5035
  <span id="uv-indicator-binned_run" onclick="toggleUvLogsFromHeader('binned_run')" style="cursor: pointer;">▶ uv-logs</span>
5036
  </span> |
5037
+ Cell: binned_run | deps: torch, numpy | 39.44s
5038
  | <button class="run-btn" onclick="runCell('binned_run')">▶ run</button>
5039
  <button class="copy-btn" onclick="copyCell('binned_run')">Copy</button>
5040
  <a href="cells/binned_run.py" target="_blank" class="raw-btn">Raw</a>
 
5448
 
5449
  Warming up (10 iterations)...
5450
  Benchmarking (50 iterations)...
5451
+ Progress: 20% complete (avg: 37.889 ms)
5452
+ Progress: 40% complete (avg: 37.238 ms)
5453
+ Progress: 60% complete (avg: 36.997 ms)
5454
+ Progress: 80% complete (avg: 36.387 ms)
5455
 
5456
  Output tensors:
5457
  Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
 
5461
  Iterations: 50
5462
 
5463
  Latency Statistics:
5464
+ Average: 35.833 ms
5465
+ Min: 32.582 ms
5466
+ Max: 40.501 ms
5467
+ Std Dev: 1.694 ms
5468
 
5469
  Percentiles:
5470
+ P50 (median): 36.177 ms
5471
+ P95: 38.671 ms
5472
+ P99: 39.929 ms
5473
 
5474
  Throughput:
5475
+ Tokens/sec: 2790.7
5476
+ Std Dev: 131.3
5477
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
5478
 
5479
  Saved benchmark results to binned_results.json
 
5483
  <div class="uv-install-logs" id="uv-logs-binned_run">
5484
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
5485
  <div class="uv-logs-content" style="display: none;">
 
 
5486
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 
5487
  Downloading sympy (6.0MiB)
5488
+ Downloading nvidia-cufile-cu12 (1.1MiB)
5489
+ Downloading setuptools (1.1MiB)
5490
+ Downloading numpy (16.2MiB)
5491
+ Downloading networkx (1.9MiB)
5492
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
5493
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
5494
  Downloading nvidia-cublas-cu12 (566.8MiB)
5495
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
 
5496
  Downloading nvidia-cusolver-cu12 (255.1MiB)
5497
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
 
5498
  Downloading nvidia-cufft-cu12 (184.2MiB)
5499
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
5500
  Downloading nvidia-nccl-cu12 (307.4MiB)
5501
+ Downloading nvidia-curand-cu12 (60.7MiB)
5502
  Downloading torch (846.9MiB)
5503
+ Downloading triton (148.3MiB)
5504
  Downloading nvidia-cufile-cu12
5505
  Downloading setuptools
5506
  Downloading networkx
 
5519
  Downloading nvidia-cublas-cu12
5520
  Downloading nvidia-cudnn-cu12
5521
  Downloading torch
5522
+ Installed 26 packages in 446ms
5523
  </div>
5524
  </div>
5525
  <div class="cell-artifacts">
 
5538
  <span onclick="toggleOutput('gptoss_run')" style="cursor: pointer;">▼ output</span>
5539
  <span id="uv-indicator-gptoss_run" onclick="toggleUvLogsFromHeader('gptoss_run')" style="cursor: pointer;">▶ uv-logs</span>
5540
  </span> |
5541
+ Cell: gptoss_run | deps: torch, numpy | 40.46s
5542
  | <button class="run-btn" onclick="runCell('gptoss_run')">▶ run</button>
5543
  <button class="copy-btn" onclick="copyCell('gptoss_run')">Copy</button>
5544
  <a href="cells/gptoss_run.py" target="_blank" class="raw-btn">Raw</a>
 
5856
 
5857
  Warming up (10 iterations)...
5858
  Benchmarking (50 iterations)...
5859
+ Progress: 20% complete (avg: 50.504 ms)
5860
+ Progress: 40% complete (avg: 50.045 ms)
5861
+ Progress: 60% complete (avg: 49.107 ms)
5862
+ Progress: 80% complete (avg: 48.012 ms)
5863
 
5864
  Output tensors:
5865
  Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
 
5869
  Iterations: 50
5870
 
5871
  Latency Statistics:
5872
+ Average: 46.791 ms
5873
+ Min: 39.036 ms
5874
+ Max: 50.857 ms
5875
+ Std Dev: 3.251 ms
5876
 
5877
  Percentiles:
5878
+ P50 (median): 47.476 ms
5879
+ P95: 50.806 ms
5880
+ P99: 50.839 ms
5881
 
5882
  Throughput:
5883
+ Tokens/sec: 2137.2
5884
+ Std Dev: 155.2
5885
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
5886
 
5887
  Saved benchmark results to gptoss_results.json
 
5891
  <div class="uv-install-logs" id="uv-logs-gptoss_run">
5892
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
5893
  <div class="uv-logs-content" style="display: none;">
5894
+ Downloading setuptools (1.1MiB)
5895
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
5896
+ Downloading nvidia-curand-cu12 (60.7MiB)
5897
+ Downloading numpy (16.2MiB)
5898
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
5899
  Downloading sympy (6.0MiB)
5900
+ Downloading torch (846.9MiB)
5901
  Downloading nvidia-nccl-cu12 (307.4MiB)
5902
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
5903
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
 
 
5904
  Downloading nvidia-cudnn-cu12 (674.0MiB)
5905
  Downloading nvidia-cufft-cu12 (184.2MiB)
5906
+ Downloading networkx (1.9MiB)
5907
+ Downloading nvidia-cufile-cu12 (1.1MiB)
5908
  Downloading triton (148.3MiB)
5909
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
5910
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
 
5911
  Downloading nvidia-cublas-cu12 (566.8MiB)
 
 
 
 
5912
  Downloading nvidia-cufile-cu12
5913
  Downloading setuptools
5914
  Downloading networkx
 
5927
  Downloading nvidia-cublas-cu12
5928
  Downloading nvidia-cudnn-cu12
5929
  Downloading torch
5930
+ Installed 26 packages in 442ms
5931
  </div>
5932
  </div>
5933
  <div class="cell-artifacts">
 
5946
  <span onclick="toggleOutput('gptoss_training_run')" style="cursor: pointer;">▼ output</span>
5947
  <span id="uv-indicator-gptoss_training_run" onclick="toggleUvLogsFromHeader('gptoss_training_run')" style="cursor: pointer;">▶ uv-logs</span>
5948
  </span> |
5949
+ Cell: gptoss_training_run | deps: torch, numpy | 39.65s
5950
  | <button class="run-btn" onclick="runCell('gptoss_training_run')">▶ run</button>
5951
  <button class="copy-btn" onclick="copyCell('gptoss_training_run')">Copy</button>
5952
  <a href="cells/gptoss_training_run.py" target="_blank" class="raw-btn">Raw</a>
 
6247
 
6248
  Warming up (10 iterations)...
6249
  Benchmarking (50 iterations)...
6250
+ Progress: 20% complete (avg: 48.334 ms)
6251
+ Progress: 40% complete (avg: 47.917 ms)
6252
+ Progress: 60% complete (avg: 47.077 ms)
6253
+ Progress: 80% complete (avg: 46.038 ms)
6254
 
6255
  Output tensors:
6256
  Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
 
6260
  Iterations: 50
6261
 
6262
  Latency Statistics:
6263
+ Average: 45.007 ms
6264
+ Min: 38.837 ms
6265
+ Max: 49.308 ms
6266
+ Std Dev: 2.894 ms
6267
 
6268
  Percentiles:
6269
+ P50 (median): 45.575 ms
6270
+ P95: 48.573 ms
6271
+ P99: 48.964 ms
6272
 
6273
  Throughput:
6274
+ Tokens/sec: 2221.9
6275
+ Std Dev: 147.9
6276
  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
6277
 
6278
  Saved benchmark results to gptoss_training_results.json
 
6282
  <div class="uv-install-logs" id="uv-logs-gptoss_training_run">
6283
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
6284
  <div class="uv-logs-content" style="display: none;">
6285
+ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
6286
+ Downloading networkx (1.9MiB)
6287
+ Downloading setuptools (1.1MiB)
6288
  Downloading numpy (16.2MiB)
6289
+ Downloading nvidia-cusparselt-cu12 (273.9MiB)
6290
+ Downloading nvidia-curand-cu12 (60.7MiB)
6291
  Downloading nvidia-cudnn-cu12 (674.0MiB)
6292
+ Downloading sympy (6.0MiB)
6293
  Downloading nvidia-cusolver-cu12 (255.1MiB)
6294
+ Downloading nvidia-nccl-cu12 (307.4MiB)
 
6295
  Downloading nvidia-cusparse-cu12 (274.9MiB)
6296
+ Downloading nvidia-nvjitlink-cu12 (37.4MiB)
 
 
 
6297
  Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
6298
+ Downloading nvidia-cufft-cu12 (184.2MiB)
6299
+ Downloading nvidia-cufile-cu12 (1.1MiB)
6300
+ Downloading nvidia-cublas-cu12 (566.8MiB)
6301
+ Downloading triton (148.3MiB)
6302
  Downloading torch (846.9MiB)
 
6303
  Downloading nvidia-cufile-cu12
6304
  Downloading setuptools
6305
  Downloading networkx
 
6318
  Downloading nvidia-cublas-cu12
6319
  Downloading nvidia-cudnn-cu12
6320
  Downloading torch
6321
+ Installed 26 packages in 448ms
6322
  </div>
6323
  </div>
6324
  <div class="cell-artifacts">
 
6337
  <span onclick="toggleOutput('megablocks_run')" style="cursor: pointer;">▼ output</span>
6338
  <span id="uv-indicator-megablocks_run" onclick="toggleUvLogsFromHeader('megablocks_run')" style="cursor: pointer;">▶ uv-logs</span>
6339
  </span> |
6340
+ Cell: megablocks_run | deps: torch, numpy, kernels | 41.38s | FAILED
6341
  | <button class="run-btn" onclick="runCell('megablocks_run')">▶ run</button>
6342
  <button class="copy-btn" onclick="copyCell('megablocks_run')">Copy</button>
6343
  <a href="cells/megablocks_run.py" target="_blank" class="raw-btn">Raw</a>
 
6492
  <span class="c1"># Attach loaded expert weights to the experts container</span>
6493
  <span class="n">e</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">experts</span>
6494
  <span class="n">e</span><span class="o">.</span><span class="n">alpha</span> <span class="o">=</span> <span class="mf">1.702</span>
6495
+ <span class="n">e</span><span class="o">.</span><span class="n">capacity_factor</span> <span class="o">=</span> <span class="mi">64</span>
6496
  <span class="n">e</span><span class="o">.</span><span class="n">gate_up_proj</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">gate_up_proj</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">))</span>
6497
  <span class="n">e</span><span class="o">.</span><span class="n">gate_up_proj_bias</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">gate_up_proj_bias</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">))</span>
6498
  <span class="n">e</span><span class="o">.</span><span class="n">down_proj</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">down_proj</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="p">))</span>
 
6569
  <div class="uv-install-logs" id="uv-logs-megablocks_run">
6570
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
6571
  <div class="uv-logs-content" style="display: none;">
6572
+ Downloading nvidia-cublas-cu12 (566.8MiB)
6573
+ Downloading setuptools (1.1MiB)
6574
+ Downloading numpy (16.2MiB)
6575
  Downloading networkx (1.9MiB)
6576
+ Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
 
6577
  Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
 
 
 
 
6578
  Downloading nvidia-cusparselt-cu12 (273.9MiB)
6579
  Downloading nvidia-nvjitlink-cu12 (37.4MiB)
6580
+ Downloading nvidia-cufile-cu12 (1.1MiB)
6581
+ Downloading torch (846.9MiB)
 
 
 
 
6582
  Downloading nvidia-nccl-cu12 (307.4MiB)
6583
+ Downloading nvidia-cufft-cu12 (184.2MiB)
6584
  Downloading hf-xet (3.0MiB)
6585
+ Downloading nvidia-curand-cu12 (60.7MiB)
6586
+ Downloading nvidia-cusparse-cu12 (274.9MiB)
6587
+ Downloading nvidia-cusolver-cu12 (255.1MiB)
6588
+ Downloading triton (148.3MiB)
6589
+ Downloading nvidia-cudnn-cu12 (674.0MiB)
6590
+ Downloading sympy (6.0MiB)
6591
  Downloading nvidia-cufile-cu12
6592
  Downloading hf-xet
6593
  Downloading setuptools
 
6607
  Downloading nvidia-cublas-cu12
6608
  Downloading nvidia-cudnn-cu12
6609
  Downloading torch
6610
+ Installed 37 packages in 543ms
6611
  </div>
6612
  </div>
6613
  <div class="cell-stderr">Fetching 66 files: 0%| | 0/66 [00:00&lt;?, ?it/s]
6614
+ Fetching 66 files: 2%|▏ | 1/66 [00:00&lt;00:27, 2.39it/s]
6615
+ Fetching 66 files: 6%|▌ | 4/66 [00:00&lt;00:07, 8.04it/s]
6616
+ Fetching 66 files: 17%|█▋ | 11/66 [00:00&lt;00:02, 21.45it/s]
6617
+ Fetching 66 files: 26%|██▌ | 17/66 [00:01&lt;00:02, 17.15it/s]
6618
+ Fetching 66 files: 48%|████▊ | 32/66 [00:01&lt;00:01, 30.72it/s]
6619
+ Fetching 66 files: 62%|██████▏ | 41/66 [00:01&lt;00:01, 23.83it/s]
6620
+ Fetching 66 files: 71%|███████ | 47/66 [00:02&lt;00:00, 25.88it/s]
6621
+ Fetching 66 files: 100%|██████████| 66/66 [00:02&lt;00:00, 45.13it/s]
6622
+ Fetching 66 files: 100%|██████████| 66/66 [00:02&lt;00:00, 29.34it/s]
6623
+ /tmp/tmpq5pei8xr/cuda_utils.c:5:10: fatal error: Python.h: No such file or directory
 
 
6624
  5 | #include &lt;Python.h&gt;
6625
  | ^~~~~~~~~~
6626
  compilation terminated.
 
6637
  File &quot;/repo/moe_benchmarks/megablocks_yamoe/.uvnote/cells/bench_utils.py&quot;, line 177, in &lt;lambda&gt;
6638
  call = lambda x: fn(x, *args[1:], **kwargs)
6639
  ^^^^^^^^^^^^^^^^^^^^^^^^^^
6640
+ File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py&quot;, line 1773, in _wrapped_call_impl
6641
  return self._call_impl(*args, **kwargs)
6642
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6643
+ File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py&quot;, line 1784, in _call_impl
6644
  return forward_call(*args, **kwargs)
6645
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6646
  File &quot;/repo/moe_benchmarks/megablocks_yamoe/.uvnote/cells/megablocks_run.py&quot;, line 81, in forward
6647
  output, dummy_routing_weights = self.model(hidden_states)
6648
  ^^^^^^^^^^^^^^^^^^^^^^^^^
6649
+ File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py&quot;, line 1773, in _wrapped_call_impl
6650
  return self._call_impl(*args, **kwargs)
6651
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6652
+ File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py&quot;, line 1784, in _call_impl
6653
  return forward_call(*args, **kwargs)
6654
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6655
+ File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py&quot;, line 896, in forward
6656
  output, expert_weights_out, *_ = moe_forward(
6657
  ^^^^^^^^^^^^
6658
+ File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py&quot;, line 730, in moe_forward
6659
  x, tokens_per_expert = forward_fn(**forward_args)
6660
  ^^^^^^^^^^^^^^^^^^^^^^^^^^
6661
+ File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py&quot;, line 457, in forward_once
6662
  x = permute_and_compute(
6663
  ^^^^^^^^^^^^^^^^^^^^
6664
+ File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py&quot;, line 401, in permute_and_compute
6665
  x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
6666
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6667
+ File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/autograd/function.py&quot;, line 576, in apply
6668
  return super().apply(*args, **kwargs) # type: ignore[misc]
6669
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6670
+ File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/ops/stk_autocast.py&quot;, line 30, in decorate_fwd
6671
  return fwd(*args, **kwargs)
6672
  ^^^^^^^^^^^^^^^^^^^^
6673
+ File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/ops/binned_gather.py&quot;, line 26, in forward
6674
  return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
6675
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6676
+ File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/backend/kernels.py&quot;, line 419, in binned_gather
6677
  _binned_copy[(num_experts, expert_capacity)](
6678
+ File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/jit.py&quot;, line 390, in &lt;lambda&gt;
6679
  return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
6680
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6681
+ File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py&quot;, line 239, in run
6682
  benchmark()
6683
+ File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py&quot;, line 228, in benchmark
6684
  timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
6685
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6686
+ File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py&quot;, line 228, in &lt;dictcomp&gt;
6687
  timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
6688
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6689
+ File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py&quot;, line 160, in _bench
6690
  return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))
6691
  ^^^^^^^^^^^^^
6692
  File &quot;/usr/lib/python3.11/functools.py&quot;, line 1001, in __get__
6693
  val = self.func(instance)
6694
  ^^^^^^^^^^^^^^^^^^^
6695
+ File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py&quot;, line 121, in do_bench
6696
  return driver.active.get_benchmarker()
6697
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6698
+ File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/driver.py&quot;, line 30, in __getattr__
6699
  return getattr(self._initialize_obj(), name)
6700
  ^^^^^^^^^^^^^^^^^^^^^^
6701
+ File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/driver.py&quot;, line 26, in _initialize_obj
6702
  self._obj = self._init_fn()
6703
  ^^^^^^^^^^^^^^^
6704
+ File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/driver.py&quot;, line 12, in _create_driver
6705
  return active_drivers[0]()
6706
  ^^^^^^^^^^^^^^^^^^^
6707
+ File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/driver.py&quot;, line 715, in __init__
6708
  self.utils = CudaUtils() # TODO: make static
6709
  ^^^^^^^^^^^
6710
+ File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/driver.py&quot;, line 62, in __init__
6711
  mod = compile_module_from_src(
6712
  ^^^^^^^^^^^^^^^^^^^^^^^^
6713
+ File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/build.py&quot;, line 88, in compile_module_from_src
6714
  so = _build(name, src_path, tmpdir, library_dirs or [], include_dirs or [], libraries or [])
6715
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
6716
+ File &quot;/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/build.py&quot;, line 51, in _build
6717
  subprocess.check_call(cc_cmd, stdout=subprocess.DEVNULL)
6718
  File &quot;/usr/lib/python3.11/subprocess.py&quot;, line 413, in check_call
6719
  raise CalledProcessError(retcode, cmd)
6720
+ subprocess.CalledProcessError: Command &#x27;[&#x27;/usr/bin/gcc&#x27;, &#x27;/tmp/tmpq5pei8xr/cuda_utils.c&#x27;, &#x27;-O3&#x27;, &#x27;-shared&#x27;, &#x27;-fPIC&#x27;, &#x27;-Wno-psabi&#x27;, &#x27;-o&#x27;, &#x27;/tmp/tmpq5pei8xr/cuda_utils.cpython-311-x86_64-linux-gnu.so&#x27;, &#x27;-lcuda&#x27;, &#x27;-L/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/lib&#x27;, &#x27;-L/usr/lib/x86_64-linux-gnu&#x27;, &#x27;-I/tmp/uvnote-run-ab5uowvg/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/include&#x27;, &#x27;-I/tmp/tmpq5pei8xr&#x27;, &#x27;-I/usr/include/python3.11&#x27;]&#x27; returned non-zero exit status 1.</div>
6721
  </div>
6722
  </div>
6723