mansaripo committed on
Commit
954e44f
·
verified ·
1 Parent(s): 96754f4

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -122,7 +122,7 @@ model = AutoModelForCausalLM.from_pretrained(
122
  "daslab-testing/CloverLM",
123
  trust_remote_code=True,
124
  dtype="bfloat16",
125
- quartet_2_impl="pseudoquant", # on non-Blackwell GPUs or "quartet2" for native NVFP4 kernel
126
  ).to("cuda") # for GPU usage or "cpu" for CPU usage
127
 
128
  tokenizer = AutoTokenizer.from_pretrained(
@@ -134,7 +134,6 @@ input_ids = tokenizer("The capital of France is", return_tensors="pt").input_ids
134
  output = model.generate(input_ids.to(model.device), max_new_tokens=32)
135
  print(tokenizer.decode(output[0]))
136
  ```
137
- Note that `quartet_2_impl="quartet2"` only supports inputs with `(micro_batch_size * seq_length) % 128 == 0`.
138
 
139
  ### Running Evaluations
140
 
@@ -165,7 +164,7 @@ Attention backend options: `pytorch` (default), `flash2`, `flash3`, `flash4`.
165
  - PyTorch 2.10+ with CUDA 13.0
166
  - `transformers ≥ 5.3.0`
167
  - `tokenmonster ≥ 1.1.12`
168
- - [Quartet II kernels](https://github.com/IST-DASLab/Quartet-II)
169
 
170
  ## Architecture Details
171
 
@@ -191,8 +190,8 @@ The model uses 264 weight tensors totaling ~4.14 B parameters.
191
  @article{cloverlm2026,
192
  title = {Speedrunning GPT3: Pretraining an OPT-175B-Quality Model Cheaply
193
  by Leveraging Native NVFP4},
194
- author = {Erik Schultheis and Georgios Vlassis and Matin Ansaripour and
195
- Andrei Panferov and Dan Alistarh},
196
  year = {2026},
197
  }
198
  ```
 
122
  "daslab-testing/CloverLM",
123
  trust_remote_code=True,
124
  dtype="bfloat16",
125
+ quartet_2_impl="quartet2", # native NVFP4 kernel or "pseudoquant" on non-Blackwell GPUs
126
  ).to("cuda") # for GPU usage or "cpu" for CPU usage
127
 
128
  tokenizer = AutoTokenizer.from_pretrained(
 
134
  output = model.generate(input_ids.to(model.device), max_new_tokens=32)
135
  print(tokenizer.decode(output[0]))
136
  ```
 
137
 
138
  ### Running Evaluations
139
 
 
164
  - PyTorch 2.10+ with CUDA 13.0
165
  - `transformers ≥ 5.3.0`
166
  - `tokenmonster ≥ 1.1.12`
167
+ - [Quartet II kernels](https://github.com/IST-DASLab/Quartet-II) (for native FP4; `pseudoquant` mode works without them)
168
 
169
  ## Architecture Details
170
 
 
190
  @article{cloverlm2026,
191
  title = {Speedrunning GPT3: Pretraining an OPT-175B-Quality Model Cheaply
192
  by Leveraging Native NVFP4},
193
+ author = {Erik Schultheis and Matin Ansaripour and Andrei Panferov and
194
+ Georgios Vlassis and Dan Alistarh},
195
  year = {2026},
196
  }
197
  ```
config.json CHANGED
@@ -13,14 +13,24 @@
13
  },
14
  "d_head": 128,
15
  "heads": 28,
 
 
16
  "max_context": 1024,
 
17
  "model_type": "cloverlm",
 
18
  "num_blocks": 29,
19
  "num_hidden_layers": 29,
 
20
  "quartet_2_impl": "pseudoquant",
21
  "ratio": 4,
22
  "scale_type": "1/sqrt(d)",
 
 
23
  "transformers_version": "5.3.0",
24
  "vocab_size": 32000,
25
- "weight_tying": true
 
 
 
26
  }
 
13
  },
14
  "d_head": 128,
15
  "heads": 28,
16
+ "hidden_size": 3584,
17
+ "intermediate_size": 14336,
18
  "max_context": 1024,
19
+ "max_position_embeddings": 1024,
20
  "model_type": "cloverlm",
21
+ "num_attention_heads": 28,
22
  "num_blocks": 29,
23
  "num_hidden_layers": 29,
24
+ "num_key_value_heads": 7,
25
  "quartet_2_impl": "pseudoquant",
26
  "ratio": 4,
27
  "scale_type": "1/sqrt(d)",
28
+ "head_dim": 128,
29
+ "tie_word_embeddings": true,
30
  "transformers_version": "5.3.0",
31
  "vocab_size": 32000,
32
+ "weight_tying": true,
33
+ "quantization_config": {
34
+ "quant_method": "quartet2"
35
+ }
36
  }
configuration_cloverlm.py CHANGED
@@ -16,6 +16,14 @@ class CloverLMConfig(PretrainedConfig):
16
  quartet_2_impl="pseudoquant",
17
  weight_tying=True,
18
  attn_backend="pytorch",
 
 
 
 
 
 
 
 
19
  **kwargs,
20
  ):
21
  self.num_blocks = num_blocks
@@ -28,4 +36,35 @@ class CloverLMConfig(PretrainedConfig):
28
  self.quartet_2_impl = quartet_2_impl
29
  self.weight_tying = weight_tying
30
  self.attn_backend = attn_backend
31
- super().__init__(vocab_size=vocab_size, **kwargs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  quartet_2_impl="pseudoquant",
17
  weight_tying=True,
18
  attn_backend="pytorch",
19
+ # Optional: HuggingFace / vLLM tooling (defaults derived from shape)
20
+ hidden_size=None,
21
+ intermediate_size=None,
22
+ max_position_embeddings=None,
23
+ num_attention_heads=None,
24
+ num_key_value_heads=None,
25
+ head_dim=None,
26
+ quantization_config=None,
27
  **kwargs,
28
  ):
29
  self.num_blocks = num_blocks
 
36
  self.quartet_2_impl = quartet_2_impl
37
  self.weight_tying = weight_tying
38
  self.attn_backend = attn_backend
39
+
40
+ d_model = heads * d_head
41
+ self.hidden_size = hidden_size if hidden_size is not None else d_model
42
+ self.intermediate_size = (
43
+ intermediate_size if intermediate_size is not None else 4 * d_model
44
+ )
45
+ self.max_position_embeddings = (
46
+ max_position_embeddings
47
+ if max_position_embeddings is not None
48
+ else max_context
49
+ )
50
+ self.num_attention_heads = (
51
+ num_attention_heads if num_attention_heads is not None else heads
52
+ )
53
+ self.num_key_value_heads = (
54
+ num_key_value_heads
55
+ if num_key_value_heads is not None
56
+ else heads // ratio
57
+ )
58
+ self.head_dim = head_dim if head_dim is not None else d_head
59
+ self.quantization_config = (
60
+ quantization_config
61
+ if quantization_config is not None
62
+ else {"quant_method": "quartet2"}
63
+ )
64
+
65
+ kwargs.pop("tie_word_embeddings", None)
66
+ super().__init__(
67
+ vocab_size=vocab_size,
68
+ tie_word_embeddings=weight_tying,
69
+ **kwargs,
70
+ )
lm_eval/test_eval.log ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0
  0%| | 0/919 [00:00<?, ?it/s]
1
  10%|▉ | 91/919 [00:00<00:00, 900.62it/s]2026-03-19:14:15:45 INFO [tasks:700] Selected tasks:
 
 
 
 
 
 
 
 
 
 
2
  20%|█▉ | 182/919 [00:00<00:00, 905.61it/s]
3
  0%| | 0/919 [00:00<?, ?it/s]
4
  30%|██▉ | 274/919 [00:00<00:00, 910.65it/s]
5
  10%|▉ | 91/919 [00:00<00:00, 907.36it/s]
6
  40%|███▉ | 366/919 [00:00<00:00, 911.36it/s]
7
  20%|█▉ | 183/919 [00:00<00:00, 911.26it/s]
8
  50%|████▉ | 458/919 [00:00<00:00, 911.45it/s]
9
  30%|██▉ | 275/919 [00:00<00:00, 915.08it/s]
10
  60%|█████▉ | 550/919 [00:00<00:00, 911.49it/s]
11
  40%|███▉ | 367/919 [00:00<00:00, 915.11it/s]
12
  70%|██████▉ | 642/919 [00:00<00:00, 912.08it/s]
13
  50%|████▉ | 459/919 [00:00<00:00, 916.54it/s]
14
  80%|███████▉ | 734/919 [00:00<00:00, 911.69it/s]
15
  60%|█████▉ | 551/919 [00:00<00:00, 914.63it/s]
16
  90%|████████▉ | 826/919 [00:00<00:00, 912.45it/s]
17
  70%|██████▉ | 643/919 [00:00<00:00, 914.29it/s]
 
18
  80%|███████▉ | 735/919 [00:00<00:00, 912.73it/s]
19
  90%|████████▉ | 827/919 [00:00<00:00, 913.03it/s]
 
 
 
20
  0%| | 0/5021 [00:00<?, ?it/s]
21
  6%|▌ | 278/5021 [00:00<00:01, 2773.97it/s]
22
  11%|█ | 564/5021 [00:00<00:01, 2824.23it/s]
23
  0%| | 0/5021 [00:00<?, ?it/s]
24
  17%|█▋ | 852/5021 [00:00<00:01, 2845.58it/s]
25
  4%|▎ | 178/5021 [00:00<00:02, 1779.16it/s]
26
  23%|██▎ | 1140/5021 [00:00<00:01, 2855.94it/s]
27
  7%|▋ | 362/5021 [00:00<00:02, 1813.26it/s]
28
  28%|██▊ | 1427/5021 [00:00<00:01, 2860.60it/s]
29
  11%|█ | 547/5021 [00:00<00:02, 1826.42it/s]
30
  34%|███▍ | 1714/5021 [00:00<00:01, 2863.10it/s]
31
  15%|█▍ | 730/5021 [00:00<00:02, 1826.29it/s]
32
  40%|███▉ | 2001/5021 [00:00<00:01, 2863.11it/s]
33
  18%|█▊ | 913/5021 [00:00<00:02, 1824.48it/s]
34
  46%|████▌ | 2289/5021 [00:00<00:00, 2868.32it/s]
35
  22%|██▏ | 1096/5021 [00:00<00:02, 1819.99it/s]
36
  51%|█████▏ | 2577/5021 [00:00<00:00, 2871.27it/s]
37
  25%|██▌ | 1279/5021 [00:00<00:02, 1815.56it/s]
38
  29%|██▉ | 1461/5021 [00:00<00:01, 1802.42it/s]
39
  57%|█████▋ | 2865/5021 [00:01<00:01, 1693.34it/s]
40
  63%|██████▎ | 3149/5021 [00:01<00:00, 1928.96it/s]
41
  68%|██████▊ | 3433/5021 [00:01<00:00, 2135.30it/s]
42
  33%|███▎ | 1642/5021 [00:01<00:03, 1001.41it/s]
43
  74%|███████▍ | 3718/5021 [00:01<00:00, 2308.62it/s]
44
  36%|███▋ | 1823/5021 [00:01<00:02, 1159.85it/s]
45
  80%|███████▉ | 4004/5021 [00:01<00:00, 2449.72it/s]
46
  40%|███▉ | 2002/5021 [00:01<00:02, 1297.94it/s]
47
  85%|████████▌ | 4290/5021 [00:01<00:00, 2558.71it/s]
48
  43%|████▎ | 2183/5021 [00:01<00:02, 1418.79it/s]
49
  91%|█████████ | 4577/5021 [00:01<00:00, 2643.21it/s]
50
  47%|████▋ | 2364/5021 [00:01<00:01, 1516.12it/s]
51
  97%|█████████▋| 4864/5021 [00:01<00:00, 2705.33it/s]
52
  51%|█████ | 2544/5021 [00:01<00:01, 1591.38it/s]
 
53
  54%|█████▍ | 2725/5021 [00:01<00:01, 1650.39it/s]
54
  58%|█████▊ | 2905/5021 [00:01<00:01, 1692.57it/s]
55
  61%|██████▏ | 3085/5021 [00:01<00:01, 1722.37it/s]
56
  65%|██████▌ | 3266/5021 [00:02<00:01, 1745.32it/s]
57
  69%|██████▊ | 3447/5021 [00:02<00:00, 1762.66it/s]
58
  72%|███████▏ | 3627/5021 [00:02<00:00, 1772.66it/s]
59
  76%|███████▌ | 3807/5021 [00:02<00:00, 1779.65it/s]
60
  79%|███████▉ | 3987/5021 [00:02<00:00, 1781.31it/s]
61
  83%|████████▎ | 4168/5021 [00:02<00:00, 1787.31it/s]
62
  87%|████████▋ | 4349/5021 [00:02<00:00, 1792.50it/s]
63
  90%|█████████ | 4529/5021 [00:02<00:00, 1790.62it/s]
64
  94%|█████████▍| 4709/5021 [00:02<00:00, 1790.61it/s]
65
  97%|█████████▋| 4890/5021 [00:02<00:00, 1793.65it/s]
 
 
 
66
  0%| | 0/586 [00:00<?, ?it/s]
67
  0%| | 0/586 [00:00<?, ?it/s]
68
  17%|█▋ | 98/586 [00:00<00:00, 971.82it/s]
69
  11%|█ | 62/586 [00:00<00:00, 613.37it/s]
70
  34%|███▎ | 197/586 [00:00<00:00, 976.00it/s]
71
  21%|██▏ | 125/586 [00:00<00:00, 619.71it/s]
72
  51%|█████ | 297/586 [00:00<00:00, 985.47it/s]
73
  32%|███▏ | 189/586 [00:00<00:00, 624.63it/s]
74
  68%|██████▊ | 396/586 [00:00<00:00, 986.66it/s]
75
  43%|████▎ | 253/586 [00:00<00:00, 627.18it/s]
76
  85%|████████▍ | 496/586 [00:00<00:00, 989.81it/s]
77
  54%|█████▍ | 317/586 [00:00<00:00, 628.58it/s]
 
78
  65%|██████▌ | 381/586 [00:00<00:00, 629.34it/s]
79
  76%|███████▌ | 445/586 [00:00<00:00, 630.51it/s]
80
  87%|████████▋ | 509/586 [00:00<00:00, 632.25it/s]
81
  98%|█████████▊| 573/586 [00:00<00:00, 632.82it/s]
 
 
 
82
  0%| | 0/1188 [00:00<?, ?it/s]
83
  0%| | 0/1188 [00:00<?, ?it/s]
84
  8%|▊ | 100/1188 [00:00<00:01, 993.05it/s]
85
  5%|▌ | 63/1188 [00:00<00:01, 626.46it/s]
86
  17%|█▋ | 200/1188 [00:00<00:00, 991.06it/s]
87
  11%|█ | 127/1188 [00:00<00:01, 629.71it/s]
88
  25%|██▌ | 300/1188 [00:00<00:00, 994.71it/s]
89
  16%|█▌ | 191/1188 [00:00<00:01, 630.93it/s]
90
  34%|███▎ | 400/1188 [00:00<00:00, 993.35it/s]
91
  21%|██▏ | 255/1188 [00:00<00:01, 633.00it/s]
92
  42%|████▏ | 500/1188 [00:00<00:00, 993.59it/s]
93
  27%|██▋ | 319/1188 [00:00<00:01, 634.81it/s]
94
  51%|█████ | 600/1188 [00:00<00:00, 993.86it/s]
95
  32%|███▏ | 383/1188 [00:00<00:01, 636.25it/s]
96
  59%|█████▉ | 700/1188 [00:00<00:00, 992.08it/s]
97
  38%|███▊ | 447/1188 [00:00<00:01, 636.11it/s]
98
  67%|██████▋ | 800/1188 [00:00<00:00, 988.12it/s]
99
  43%|████▎ | 511/1188 [00:00<00:01, 634.75it/s]
100
  76%|███████▌ | 899/1188 [00:00<00:00, 988.66it/s]
101
  48%|████▊ | 575/1188 [00:00<00:00, 634.67it/s]
102
  84%|████████▍ | 999/1188 [00:01<00:00, 991.15it/s]
103
  54%|█████▍ | 639/1188 [00:01<00:00, 633.38it/s]
104
  93%|█████████▎| 1099/1188 [00:01<00:00, 993.44it/s]
105
  59%|█████▉ | 703/1188 [00:01<00:00, 633.82it/s]
 
106
  65%|██████▍ | 768/1188 [00:01<00:00, 635.72it/s]
107
  70%|███████ | 832/1188 [00:01<00:00, 636.49it/s]
108
  75%|███████▌ | 896/1188 [00:01<00:00, 632.00it/s]
109
  81%|████████ | 960/1188 [00:01<00:00, 628.76it/s]
110
  86%|████████▌ | 1023/1188 [00:01<00:00, 624.90it/s]
111
  91%|█████████▏| 1086/1188 [00:01<00:00, 625.37it/s]
112
  97%|█████████▋| 1149/1188 [00:01<00:00, 625.85it/s]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ The following values were not passed to `accelerate launch` and had defaults used instead:
2
+ `--num_processes` was set to a value of `2`
3
+ More than one GPU was found, enabling multi-GPU training.
4
+ If this was unintended please pass in `--num_processes=1`.
5
+ `--num_machines` was set to a value of `1`
6
+ `--mixed_precision` was set to a value of `'no'`
7
+ `--dynamo_backend` was set to a value of `'no'`
8
+ To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`.
9
+ 2026-03-19:14:15:25 INFO [_cli.run:375] Including path: ./
10
+ 2026-03-19:14:15:25 INFO [_cli.run:376] Selected Tasks: ['arc_easy_mi', 'arc_challenge_mi', 'hellaswag', 'piqa']
11
+ 2026-03-19:14:15:25 INFO [evaluator:211] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
12
+ 2026-03-19:14:15:25 INFO [evaluator:236] Initializing cloverlm model, with arguments: {'pretrained': 'daslab-testing/CloverLM', 'dtype': 'bfloat16', 'quartet_2_impl': 'quartet2', 'attn_backend': 'pytorch', 'trust_remote_code': True}
13
+ 2026-03-19:14:15:25 INFO [_cli.run:375] Including path: ./
14
+ 2026-03-19:14:15:25 INFO [_cli.run:376] Selected Tasks: ['arc_easy_mi', 'arc_challenge_mi', 'hellaswag', 'piqa']
15
+ 2026-03-19:14:15:25 INFO [evaluator:211] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
16
+ 2026-03-19:14:15:25 INFO [evaluator:236] Initializing cloverlm model, with arguments: {'pretrained': 'daslab-testing/CloverLM', 'dtype': 'bfloat16', 'quartet_2_impl': 'quartet2', 'attn_backend': 'pytorch', 'trust_remote_code': True}
17
+ 2026-03-19:14:15:26 INFO [models.huggingface:178] Using `accelerate launch` or `parallelize=True`, device 'cuda:0' will be overridden when placing model.
18
+ 2026-03-19:14:15:26 INFO [models.huggingface:178] Using `accelerate launch` or `parallelize=True`, device 'cuda:0' will be overridden when placing model.
19
+ 2026-03-19:14:15:26 INFO [models.huggingface:548] Model type cannot be determined. Using default model type 'causal'
20
+ 2026-03-19:14:15:26 INFO [models.huggingface:548] Model type cannot be determined. Using default model type 'causal'
21
+ 2026-03-19:14:15:28 INFO [models.huggingface:423] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda:1'}
22
+ 2026-03-19:14:15:28 INFO [models.huggingface:423] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda:0'}
23
+
24
+
25
+ The tied weights mapping and config for this model specifies to tie transformer.emb.weight to transformer.linear.weight, but both are present in the checkpoints, so we will NOT tie them. You should update the config with `tie_word_embeddings=False` to silence this warning
26
+ The tied weights mapping and config for this model specifies to tie transformer.emb.weight to transformer.linear.weight, but both are present in the checkpoints, so we will NOT tie them. You should update the config with `tie_word_embeddings=False` to silence this warning
27
+ 2026-03-19:14:15:45 INFO [tasks:700] Selected tasks:
28
+ 2026-03-19:14:15:45 INFO [tasks:691] Task: piqa (.venv/lib/python3.11/site-packages/lm_eval/tasks/piqa/piqa.yaml)
29
+ 2026-03-19:14:15:45 INFO [tasks:691] Task: hellaswag (.venv/lib/python3.11/site-packages/lm_eval/tasks/hellaswag/hellaswag.yaml)
30
+ 2026-03-19:14:15:45 INFO [tasks:691] Task: arc_challenge_mi (arc_challenge.yaml)
31
+ 2026-03-19:14:15:45 INFO [tasks:691] Task: arc_easy_mi (arc_easy_mi.yaml)
32
+ 2026-03-19:14:15:45 WARNING [evaluator:333] Overwriting default num_fewshot of piqa from None to 0
33
+ 2026-03-19:14:15:45 WARNING [evaluator:333] Overwriting default num_fewshot of hellaswag from None to 0
34
+ 2026-03-19:14:15:45 WARNING [evaluator:333] Overwriting default num_fewshot of arc_challenge_mi from None to 0
35
+ 2026-03-19:14:15:45 WARNING [evaluator:333] Overwriting default num_fewshot of arc_easy_mi from None to 0
36
+ 2026-03-19:14:15:45 INFO [api.task:311] Building contexts for piqa on rank 0...
37
+
38
  0%| | 0/919 [00:00<?, ?it/s]
39
  10%|▉ | 91/919 [00:00<00:00, 900.62it/s]2026-03-19:14:15:45 INFO [tasks:700] Selected tasks:
40
+ 2026-03-19:14:15:45 INFO [tasks:691] Task: piqa (.venv/lib/python3.11/site-packages/lm_eval/tasks/piqa/piqa.yaml)
41
+ 2026-03-19:14:15:45 INFO [tasks:691] Task: hellaswag (.venv/lib/python3.11/site-packages/lm_eval/tasks/hellaswag/hellaswag.yaml)
42
+ 2026-03-19:14:15:45 INFO [tasks:691] Task: arc_challenge_mi (arc_challenge.yaml)
43
+ 2026-03-19:14:15:45 INFO [tasks:691] Task: arc_easy_mi (arc_easy_mi.yaml)
44
+ 2026-03-19:14:15:45 WARNING [evaluator:333] Overwriting default num_fewshot of piqa from None to 0
45
+ 2026-03-19:14:15:45 WARNING [evaluator:333] Overwriting default num_fewshot of hellaswag from None to 0
46
+ 2026-03-19:14:15:45 WARNING [evaluator:333] Overwriting default num_fewshot of arc_challenge_mi from None to 0
47
+ 2026-03-19:14:15:45 WARNING [evaluator:333] Overwriting default num_fewshot of arc_easy_mi from None to 0
48
+ 2026-03-19:14:15:45 INFO [api.task:311] Building contexts for piqa on rank 1...
49
+
50
  20%|█▉ | 182/919 [00:00<00:00, 905.61it/s]
51
  0%| | 0/919 [00:00<?, ?it/s]
52
  30%|██▉ | 274/919 [00:00<00:00, 910.65it/s]
53
  10%|▉ | 91/919 [00:00<00:00, 907.36it/s]
54
  40%|███▉ | 366/919 [00:00<00:00, 911.36it/s]
55
  20%|█▉ | 183/919 [00:00<00:00, 911.26it/s]
56
  50%|████▉ | 458/919 [00:00<00:00, 911.45it/s]
57
  30%|██▉ | 275/919 [00:00<00:00, 915.08it/s]
58
  60%|█████▉ | 550/919 [00:00<00:00, 911.49it/s]
59
  40%|███▉ | 367/919 [00:00<00:00, 915.11it/s]
60
  70%|██████▉ | 642/919 [00:00<00:00, 912.08it/s]
61
  50%|████▉ | 459/919 [00:00<00:00, 916.54it/s]
62
  80%|███████▉ | 734/919 [00:00<00:00, 911.69it/s]
63
  60%|█████▉ | 551/919 [00:00<00:00, 914.63it/s]
64
  90%|████████▉ | 826/919 [00:00<00:00, 912.45it/s]
65
  70%|██████▉ | 643/919 [00:00<00:00, 914.29it/s]
66
+
67
  80%|███████▉ | 735/919 [00:00<00:00, 912.73it/s]
68
  90%|████████▉ | 827/919 [00:00<00:00, 913.03it/s]
69
+ 2026-03-19:14:15:48 INFO [api.task:311] Building contexts for hellaswag on rank 0...
70
+ 2026-03-19:14:15:48 INFO [api.task:311] Building contexts for hellaswag on rank 1...
71
+
72
  0%| | 0/5021 [00:00<?, ?it/s]
73
  6%|▌ | 278/5021 [00:00<00:01, 2773.97it/s]
74
  11%|█ | 564/5021 [00:00<00:01, 2824.23it/s]
75
  0%| | 0/5021 [00:00<?, ?it/s]
76
  17%|█▋ | 852/5021 [00:00<00:01, 2845.58it/s]
77
  4%|▎ | 178/5021 [00:00<00:02, 1779.16it/s]
78
  23%|██▎ | 1140/5021 [00:00<00:01, 2855.94it/s]
79
  7%|▋ | 362/5021 [00:00<00:02, 1813.26it/s]
80
  28%|██▊ | 1427/5021 [00:00<00:01, 2860.60it/s]
81
  11%|█ | 547/5021 [00:00<00:02, 1826.42it/s]
82
  34%|███▍ | 1714/5021 [00:00<00:01, 2863.10it/s]
83
  15%|█▍ | 730/5021 [00:00<00:02, 1826.29it/s]
84
  40%|███▉ | 2001/5021 [00:00<00:01, 2863.11it/s]
85
  18%|█▊ | 913/5021 [00:00<00:02, 1824.48it/s]
86
  46%|████▌ | 2289/5021 [00:00<00:00, 2868.32it/s]
87
  22%|██▏ | 1096/5021 [00:00<00:02, 1819.99it/s]
88
  51%|█████▏ | 2577/5021 [00:00<00:00, 2871.27it/s]
89
  25%|██▌ | 1279/5021 [00:00<00:02, 1815.56it/s]
90
  29%|██▉ | 1461/5021 [00:00<00:01, 1802.42it/s]
91
  57%|█████▋ | 2865/5021 [00:01<00:01, 1693.34it/s]
92
  63%|██████▎ | 3149/5021 [00:01<00:00, 1928.96it/s]
93
  68%|██████▊ | 3433/5021 [00:01<00:00, 2135.30it/s]
94
  33%|███▎ | 1642/5021 [00:01<00:03, 1001.41it/s]
95
  74%|███████▍ | 3718/5021 [00:01<00:00, 2308.62it/s]
96
  36%|███▋ | 1823/5021 [00:01<00:02, 1159.85it/s]
97
  80%|███████▉ | 4004/5021 [00:01<00:00, 2449.72it/s]
98
  40%|███▉ | 2002/5021 [00:01<00:02, 1297.94it/s]
99
  85%|████████▌ | 4290/5021 [00:01<00:00, 2558.71it/s]
100
  43%|████▎ | 2183/5021 [00:01<00:02, 1418.79it/s]
101
  91%|█████████ | 4577/5021 [00:01<00:00, 2643.21it/s]
102
  47%|████▋ | 2364/5021 [00:01<00:01, 1516.12it/s]
103
  97%|█████████▋| 4864/5021 [00:01<00:00, 2705.33it/s]
104
  51%|█████ | 2544/5021 [00:01<00:01, 1591.38it/s]
105
+
106
  54%|█████▍ | 2725/5021 [00:01<00:01, 1650.39it/s]
107
  58%|█████▊ | 2905/5021 [00:01<00:01, 1692.57it/s]
108
  61%|██████▏ | 3085/5021 [00:01<00:01, 1722.37it/s]
109
  65%|██████▌ | 3266/5021 [00:02<00:01, 1745.32it/s]
110
  69%|██████▊ | 3447/5021 [00:02<00:00, 1762.66it/s]
111
  72%|███████▏ | 3627/5021 [00:02<00:00, 1772.66it/s]
112
  76%|███████▌ | 3807/5021 [00:02<00:00, 1779.65it/s]
113
  79%|███████▉ | 3987/5021 [00:02<00:00, 1781.31it/s]
114
  83%|████████▎ | 4168/5021 [00:02<00:00, 1787.31it/s]
115
  87%|████████▋ | 4349/5021 [00:02<00:00, 1792.50it/s]
116
  90%|█████████ | 4529/5021 [00:02<00:00, 1790.62it/s]
117
  94%|█████████▍| 4709/5021 [00:02<00:00, 1790.61it/s]
118
  97%|█████████▋| 4890/5021 [00:02<00:00, 1793.65it/s]
119
+ 2026-03-19:14:15:52 INFO [api.task:311] Building contexts for arc_challenge_mi on rank 1...
120
+ 2026-03-19:14:15:52 INFO [api.task:311] Building contexts for arc_challenge_mi on rank 0...
121
+
122
  0%| | 0/586 [00:00<?, ?it/s]
123
  0%| | 0/586 [00:00<?, ?it/s]
124
  17%|█▋ | 98/586 [00:00<00:00, 971.82it/s]
125
  11%|█ | 62/586 [00:00<00:00, 613.37it/s]
126
  34%|███▎ | 197/586 [00:00<00:00, 976.00it/s]
127
  21%|██▏ | 125/586 [00:00<00:00, 619.71it/s]
128
  51%|█████ | 297/586 [00:00<00:00, 985.47it/s]
129
  32%|███▏ | 189/586 [00:00<00:00, 624.63it/s]
130
  68%|██████▊ | 396/586 [00:00<00:00, 986.66it/s]
131
  43%|████▎ | 253/586 [00:00<00:00, 627.18it/s]
132
  85%|████████▍ | 496/586 [00:00<00:00, 989.81it/s]
133
  54%|█████▍ | 317/586 [00:00<00:00, 628.58it/s]
134
+
135
  65%|██████▌ | 381/586 [00:00<00:00, 629.34it/s]
136
  76%|███████▌ | 445/586 [00:00<00:00, 630.51it/s]
137
  87%|████████▋ | 509/586 [00:00<00:00, 632.25it/s]
138
  98%|█████████▊| 573/586 [00:00<00:00, 632.82it/s]
139
+ 2026-03-19:14:15:53 INFO [api.task:311] Building contexts for arc_easy_mi on rank 0...
140
+ 2026-03-19:14:15:53 INFO [api.task:311] Building contexts for arc_easy_mi on rank 1...
141
+
142
  0%| | 0/1188 [00:00<?, ?it/s]
143
  0%| | 0/1188 [00:00<?, ?it/s]
144
  8%|▊ | 100/1188 [00:00<00:01, 993.05it/s]
145
  5%|▌ | 63/1188 [00:00<00:01, 626.46it/s]
146
  17%|█▋ | 200/1188 [00:00<00:00, 991.06it/s]
147
  11%|█ | 127/1188 [00:00<00:01, 629.71it/s]
148
  25%|██▌ | 300/1188 [00:00<00:00, 994.71it/s]
149
  16%|█▌ | 191/1188 [00:00<00:01, 630.93it/s]
150
  34%|███▎ | 400/1188 [00:00<00:00, 993.35it/s]
151
  21%|██▏ | 255/1188 [00:00<00:01, 633.00it/s]
152
  42%|████▏ | 500/1188 [00:00<00:00, 993.59it/s]
153
  27%|██▋ | 319/1188 [00:00<00:01, 634.81it/s]
154
  51%|█████ | 600/1188 [00:00<00:00, 993.86it/s]
155
  32%|███▏ | 383/1188 [00:00<00:01, 636.25it/s]
156
  59%|█████▉ | 700/1188 [00:00<00:00, 992.08it/s]
157
  38%|███▊ | 447/1188 [00:00<00:01, 636.11it/s]
158
  67%|██████▋ | 800/1188 [00:00<00:00, 988.12it/s]
159
  43%|████▎ | 511/1188 [00:00<00:01, 634.75it/s]
160
  76%|███████▌ | 899/1188 [00:00<00:00, 988.66it/s]
161
  48%|████▊ | 575/1188 [00:00<00:00, 634.67it/s]
162
  84%|████████▍ | 999/1188 [00:01<00:00, 991.15it/s]
163
  54%|█████▍ | 639/1188 [00:01<00:00, 633.38it/s]
164
  93%|█████████▎| 1099/1188 [00:01<00:00, 993.44it/s]
165
  59%|█████▉ | 703/1188 [00:01<00:00, 633.82it/s]
166
+
167
  65%|██████▍ | 768/1188 [00:01<00:00, 635.72it/s]
168
  70%|███████ | 832/1188 [00:01<00:00, 636.49it/s]
169
  75%|███████▌ | 896/1188 [00:01<00:00, 632.00it/s]
170
  81%|████████ | 960/1188 [00:01<00:00, 628.76it/s]
171
  86%|████████▌ | 1023/1188 [00:01<00:00, 624.90it/s]
172
  91%|█████████▏| 1086/1188 [00:01<00:00, 625.37it/s]
173
  97%|█████████▋| 1149/1188 [00:01<00:00, 625.85it/s]
174
+ 2026-03-19:14:15:55 INFO [evaluator:584] Running loglikelihood requests
175
+ 2026-03-19:14:15:55 INFO [evaluator:584] Running loglikelihood requests
176
+
177
+ Passed argument batch_size = auto:1. Detecting largest batch size
178
+ Determined largest batch size: 64
179
+ Determined largest batch size: 64
180
+
181
+ [rank1]:W0319 14:20:19.946000 1458598 .venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py:1676] [0/8] function: 'abs_max' (/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/quartet2/linear.py:147)
182
+ [rank1]:W0319 14:20:19.946000 1458598 .venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py:1676] [0/8] last reason: 0/5: tensor 'x' requires_grad mismatch. expected requires_grad=1
183
+ [rank1]:W0319 14:20:19.946000 1458598 .venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py:1676] [0/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
184
+ [rank1]:W0319 14:20:19.946000 1458598 .venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py:1676] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/compile/programming_model.recompilation.html
185
+
186
+ [rank0]:W0319 14:20:22.844000 1458597 .venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py:1676] [0/8] function: 'abs_max' (/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/quartet2/linear.py:147)
187
+ [rank0]:W0319 14:20:22.844000 1458597 .venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py:1676] [0/8] last reason: 0/5: tensor 'x' requires_grad mismatch. expected requires_grad=1
188
+ [rank0]:W0319 14:20:22.844000 1458597 .venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py:1676] [0/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
189
+ [rank0]:W0319 14:20:22.844000 1458597 .venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py:1676] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/compile/programming_model.recompilation.html
190
+
191
+ fatal: not a git repository (or any of the parent directories): .git
192
+ 2026-03-19:14:20:29 INFO [loggers.evaluation_tracker:316] Output path not provided, skipping saving results aggregated
193
+ cloverlm ({'pretrained': 'daslab-testing/CloverLM', 'dtype': 'bfloat16', 'quartet_2_impl': 'quartet2', 'attn_backend': 'pytorch'}), gen_kwargs: ({}), limit: None, num_fewshot: 0, batch_size: auto (64)
194
+ | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
195
+ |----------------|------:|------|-----:|---------------|---|-----:|---|-----:|
196
+ |arc_challenge_mi| 1|none | 0|acc |↑ |0.4642|± |0.0146|
197
+ | | |none | 0|acc_mutual_info|↑ |0.5017|± |0.0146|
198
+ | | |none | 0|acc_norm |↑ |0.4940|± |0.0146|
199
+ |arc_easy_mi | 1|none | 0|acc |↑ |0.8005|± |0.0082|
200
+ | | |none | 0|acc_mutual_info|↑ |0.7193|± |0.0092|
201
+ | | |none | 0|acc_norm |↑ |0.7740|± |0.0086|
202
+ |hellaswag | 1|none | 0|acc |↑ |0.5392|± |0.0050|
203
+ | | |none | 0|acc_norm |↑ |0.7169|± |0.0045|
204
+ |piqa | 1|none | 0|acc |↑ |0.7911|± |0.0095|
205
+ | | |none | 0|acc_norm |↑ |0.8090|± |0.0092|
206
+
207
+ [rank0]:[W319 14:20:30.213375773 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
lm_eval/test_eval2.log ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ The following values were not passed to `accelerate launch` and had defaults used instead:
2
+ `--num_processes` was set to a value of `2`
3
+ More than one GPU was found, enabling multi-GPU training.
4
+ If this was unintended please pass in `--num_processes=1`.
5
+ `--num_machines` was set to a value of `1`
6
+ `--mixed_precision` was set to a value of `'no'`
7
+ `--dynamo_backend` was set to a value of `'no'`
8
+ To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`.
9
+ 2026-03-19:16:52:56 INFO [_cli.run:375] Including path: ./
10
+ 2026-03-19:16:52:56 INFO [_cli.run:376] Selected Tasks: ['arc_easy_mi', 'arc_challenge_mi', 'hellaswag', 'piqa']
11
+ 2026-03-19:16:52:56 INFO [evaluator:211] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
12
+ 2026-03-19:16:52:56 INFO [evaluator:236] Initializing cloverlm model, with arguments: {'pretrained': 'daslab-testing/CloverLM', 'dtype': 'bfloat16', 'quartet_2_impl': 'quartet2', 'attn_backend': 'pytorch', 'trust_remote_code': True}
13
+ 2026-03-19:16:52:56 INFO [models.huggingface:178] Using `accelerate launch` or `parallelize=True`, device 'cuda:0' will be overridden when placing model.
14
+ 2026-03-19:16:52:56 INFO [_cli.run:375] Including path: ./
15
+ 2026-03-19:16:52:56 INFO [_cli.run:376] Selected Tasks: ['arc_easy_mi', 'arc_challenge_mi', 'hellaswag', 'piqa']
16
+ 2026-03-19:16:52:56 INFO [evaluator:211] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
17
+ 2026-03-19:16:52:56 INFO [evaluator:236] Initializing cloverlm model, with arguments: {'pretrained': 'daslab-testing/CloverLM', 'dtype': 'bfloat16', 'quartet_2_impl': 'quartet2', 'attn_backend': 'pytorch', 'trust_remote_code': True}
18
+ 2026-03-19:16:52:57 INFO [models.huggingface:178] Using `accelerate launch` or `parallelize=True`, device 'cuda:0' will be overridden when placing model.
19
+ Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
20
+ 2026-03-19:16:52:57 INFO [models.huggingface:548] Model type cannot be determined. Using default model type 'causal'
21
+ 2026-03-19:16:52:57 INFO [models.huggingface:548] Model type cannot be determined. Using default model type 'causal'
22
+ Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
23
+ [rank1]: Traceback (most recent call last):
24
+ [rank1]: File "/home/matin/convert_dir/CloverLM/lm_eval/eval.py", line 65, in <module>
25
+ [rank1]: cli_evaluate()
26
+ [rank1]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/__main__.py", line 10, in cli_evaluate
27
+ [rank1]: parser.execute(args)
28
+ [rank1]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/_cli/harness.py", line 60, in execute
29
+ [rank1]: args.func(args)
30
+ [rank1]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/_cli/run.py", line 379, in _execute
31
+ [rank1]: results = simple_evaluate(
32
+ [rank1]: ^^^^^^^^^^^^^^^^
33
+ [rank1]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/utils.py", line 498, in _wrapper
34
+ [rank1]: return fn(*args, **kwargs)
35
+ [rank1]: ^^^^^^^^^^^^^^^^^^^
36
+ [rank1]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/evaluator.py", line 239, in simple_evaluate
37
+ [rank1]: lm = lm_eval.api.registry.get_model(model).create_from_arg_obj(
38
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
39
+ [rank1]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/api/model.py", line 180, in create_from_arg_obj
40
+ [rank1]: return cls(**arg_dict, **additional_config)
41
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
42
+ [rank1]: File "/home/matin/convert_dir/CloverLM/lm_eval/eval.py", line 11, in __init__
43
+ [rank1]: super().__init__(**kwargs)
44
+ [rank1]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/models/huggingface.py", line 204, in __init__
45
+ [rank1]: self._create_tokenizer(
46
+ [rank1]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/models/huggingface.py", line 793, in _create_tokenizer
47
+ [rank1]: self.tokenizer = transformers.AutoTokenizer.from_pretrained(
48
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
49
+ [rank1]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/transformers/models/auto/tokenization_auto.py", line 732, in from_pretrained
50
+ [rank1]: tokenizer_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)
51
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
52
+ [rank1]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/transformers/dynamic_module_utils.py", line 567, in get_class_from_dynamic_module
53
+ [rank1]: module_file, class_name = class_reference.split(".")
54
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^
55
+ [rank1]: ValueError: not enough values to unpack (expected 2, got 1)
56
+ [rank0]: Traceback (most recent call last):
57
+ [rank0]: File "/home/matin/convert_dir/CloverLM/lm_eval/eval.py", line 65, in <module>
58
+ [rank0]: cli_evaluate()
59
+ [rank0]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/__main__.py", line 10, in cli_evaluate
60
+ [rank0]: parser.execute(args)
61
+ [rank0]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/_cli/harness.py", line 60, in execute
62
+ [rank0]: args.func(args)
63
+ [rank0]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/_cli/run.py", line 379, in _execute
64
+ [rank0]: results = simple_evaluate(
65
+ [rank0]: ^^^^^^^^^^^^^^^^
66
+ [rank0]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/utils.py", line 498, in _wrapper
67
+ [rank0]: return fn(*args, **kwargs)
68
+ [rank0]: ^^^^^^^^^^^^^^^^^^^
69
+ [rank0]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/evaluator.py", line 239, in simple_evaluate
70
+ [rank0]: lm = lm_eval.api.registry.get_model(model).create_from_arg_obj(
71
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
72
+ [rank0]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/api/model.py", line 180, in create_from_arg_obj
73
+ [rank0]: return cls(**arg_dict, **additional_config)
74
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
75
+ [rank0]: File "/home/matin/convert_dir/CloverLM/lm_eval/eval.py", line 11, in __init__
76
+ [rank0]: super().__init__(**kwargs)
77
+ [rank0]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/models/huggingface.py", line 204, in __init__
78
+ [rank0]: self._create_tokenizer(
79
+ [rank0]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/lm_eval/models/huggingface.py", line 793, in _create_tokenizer
80
+ [rank0]: self.tokenizer = transformers.AutoTokenizer.from_pretrained(
81
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
82
+ [rank0]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/transformers/models/auto/tokenization_auto.py", line 732, in from_pretrained
83
+ [rank0]: tokenizer_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)
84
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
85
+ [rank0]: File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/transformers/dynamic_module_utils.py", line 567, in get_class_from_dynamic_module
86
+ [rank0]: module_file, class_name = class_reference.split(".")
87
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^
88
+ [rank0]: ValueError: not enough values to unpack (expected 2, got 1)
89
+ [rank0]:[W319 16:52:58.069226968 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
90
+ W0319 16:52:59.444000 1490612 torch/distributed/elastic/multiprocessing/api.py:1010] Sending process 1490848 closing signal SIGTERM
91
+ E0319 16:52:59.508000 1490612 torch/distributed/elastic/multiprocessing/api.py:984] failed (exitcode: 1) local_rank: 0 (pid: 1490847) of binary: /home/matin/convert_dir/CloverLM/lm_eval/.venv/bin/python
92
+ Traceback (most recent call last):
93
+ File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/bin/accelerate", line 10, in <module>
94
+ sys.exit(main())
95
+ ^^^^^^
96
+ File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/accelerate/commands/accelerate_cli.py", line 50, in main
97
+ args.func(args)
98
+ File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/accelerate/commands/launch.py", line 1396, in launch_command
99
+ multi_gpu_launcher(args)
100
+ File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/accelerate/commands/launch.py", line 1023, in multi_gpu_launcher
101
+ distrib_run.run(args)
102
+ File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/torch/distributed/run.py", line 982, in run
103
+ elastic_launch(
104
+ File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 170, in __call__
105
+ return launch_agent(self._config, self._entrypoint, list(args))
106
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
107
+ File "/home/matin/convert_dir/CloverLM/lm_eval/.venv/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 317, in launch_agent
108
+ raise ChildFailedError(
109
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
110
+ ============================================================
111
+ eval.py FAILED
112
+ ------------------------------------------------------------
113
+ Failures:
114
+ [1]:
115
+ time : 2026-03-19_16:52:59
116
+ host : b300-eval.datacrunch.io
117
+ rank : 1 (local_rank: 1)
118
+ exitcode : 1 (pid: 1490848)
119
+ error_file: <N/A>
120
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
121
+ ------------------------------------------------------------
122
+ Root Cause (first observed failure):
123
+ [0]:
124
+ time : 2026-03-19_16:52:59
125
+ host : b300-eval.datacrunch.io
126
+ rank : 0 (local_rank: 0)
127
+ exitcode : 1 (pid: 1490847)
128
+ error_file: <N/A>
129
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
130
+ ============================================================
vllm_plugin/SERVING.md ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Serving CloverLM with vLLM (Quartet II NVFP4)
2
+
3
+ ## Prerequisites
4
+
5
+ - NVIDIA Blackwell GPU (B300 / B200 / RTX 5090) for real Quartet II NVFP4 kernels
6
+ - CUDA 13.0+
7
+ - Python 3.11+
8
+ - The Quartet II kernels (`quartet2` package) installed
9
+
10
+ ## 1. Environment Setup
11
+
12
+ ```bash
13
+ # Activate the existing environment
14
+ source .venv/bin/activate
15
+
16
+ # Set CUDA paths
17
+ export CUDA_HOME=/usr/local/cuda-13.0/
18
+ export TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
19
+ export PATH=/usr/local/cuda/bin:$PATH
20
+ export LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-}
21
+ ```
22
+
23
+ ## 2. Install vLLM
24
+
25
+ ```bash
26
+ export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest \
27
+ | jq -r .tag_name | sed 's/^v//')
28
+ export CUDA_VERSION=130
29
+ export CPU_ARCH=$(uname -m)
30
+
31
+ uv pip install \
32
+ "https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux_2_35_${CPU_ARCH}.whl" \
33
+ --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}
34
+ ```
35
+
36
+ ## 3. Serve the Model
37
+
38
+ ### Offline inference (quick test)
39
+
40
+ ```bash
41
+ cd /home/matin/convert_dir/CloverLM/vllm_plugin
42
+ python serve.py
43
+ ```
44
+
45
+ ### OpenAI-compatible API server
46
+
47
+ ```bash
48
+ cd /home/matin/convert_dir/CloverLM/vllm_plugin
49
+ python serve.py --api --port 8000
50
+ ```
51
+
52
+ Then query:
53
+
54
+ ```bash
55
+ curl http://localhost:8000/v1/completions \
56
+ -H "Content-Type: application/json" \
57
+ -d '{
58
+ "model": "/home/matin/convert_dir/CloverLM",
59
+ "prompt": "The capital of France is",
60
+ "max_tokens": 64,
61
+ "temperature": 0.8
62
+ }'
63
+ ```
64
+
65
+ ### Options
66
+
67
+ | Flag | Default | Description |
68
+ |------|---------|-------------|
69
+ | `--model` | `../` (CloverLM dir) | Path to CloverLM model directory |
70
+ | `--api` | off | Start OpenAI-compatible API server |
71
+ | `--port` | 8000 | API server port |
72
+ | `--host` | 0.0.0.0 | API server host |
73
+ | `--tp` | 1 | Tensor parallel size |
74
+ | `--max-model-len` | 1024 | Maximum context length |
75
+ | `--gpu-memory-utilization` | 0.9 | GPU memory fraction to use |
76
+
77
+ ## Architecture
78
+
79
+ The vLLM integration consists of three components:
80
+
81
+ 1. **`quartet2_quant.py`** -- Quartet II quantization plugin registered as `"quartet2"`.
82
+ Wraps the Quartet II on-the-fly FP4 quantization (`quant_fp4` + `flashinfer.mm_fp4`)
83
+ into vLLM's `LinearMethodBase` interface. Weights stay in bf16; quantization happens
84
+ at each forward pass.
85
+
86
+ 2. **`cloverlm_vllm.py`** -- Full vLLM model implementation with paged KV cache.
87
+ Reimplements CloverLM's architecture using vLLM primitives:
88
+ - `ColumnParallelLinear` / `RowParallelLinear` for Q/K/V/O and MLP projections
89
+ - vLLM `Attention` for paged KV caching and efficient attention
90
+ - Custom RoPE (base 1024, repeat_interleave pattern)
91
+ - Sphere normalization on Q/K before attention
92
+ - Per-head learnable scale parameter
93
+ - Squared ReLU activation in MLP
94
+ - Post-sublayer RMSNorm (not pre-norm)
95
+
96
+ 3. **`serve.py`** -- Entry point that registers both the quantization plugin and model,
97
+ then launches vLLM in offline or API mode.
98
+
99
+ ## Known Limitations
100
+
101
+ - **CUDA graphs**: Currently `enforce_eager=True` is required because the Quartet II
102
+ on-the-fly quantization kernels (`quant_fp4` + `mm_fp4`) are not compatible with
103
+ CUDA graph capture. This means slightly higher per-token latency compared to
104
+ CUDA-graph-enabled models. A future update to the Quartet II kernels could remove
105
+ this limitation.
106
+
107
+ ## Troubleshooting
108
+
109
+ **"No module named 'quartet2'"**: Ensure the Quartet II kernels are installed:
110
+ ```bash
111
+ uv pip install "quartet2 @ git+https://github.com/IST-DASLab/Quartet-II.git#subdirectory=kernels"
112
+ ```
113
+
114
+ **CUDA errors**: Make sure `CUDA_HOME` points to CUDA 13.0+ and `TRITON_PTXAS_PATH` is set.
115
+
116
+ **Out of memory**: Reduce `--gpu-memory-utilization` or use `--tp 2` for tensor parallelism.
vllm_plugin/__init__.py ADDED
File without changes
vllm_plugin/cloverlm_vllm.py ADDED
@@ -0,0 +1,370 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from __future__ import annotations
3
+
4
+ from typing import Iterable, Optional
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+ from vllm.config import VllmConfig
11
+ from vllm.model_executor.layers.attention import Attention
12
+ from vllm.model_executor.layers.layernorm import RMSNorm
13
+ from vllm.model_executor.layers.linear import (
14
+ ColumnParallelLinear,
15
+ RowParallelLinear,
16
+ )
17
+ from vllm.model_executor.layers.logits_processor import LogitsProcessor
18
+ from vllm.model_executor.layers.quantization import QuantizationConfig
19
+ from vllm.model_executor.layers.vocab_parallel_embedding import (
20
+ ParallelLMHead,
21
+ VocabParallelEmbedding,
22
+ )
23
+ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
24
+ from vllm.model_executor.models.utils import AutoWeightsLoader, WeightsMapper
25
+
26
+
27
def _build_rope_cos_sin(
    positions: torch.Tensor,
    d_head: int,
    device: torch.device,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Build RoPE cos/sin tables (base 1024) for the given token positions.

    Returns two float32 tensors of shape ``positions.shape + (d_head,)``,
    where each frequency is duplicated so consecutive even/odd channel
    pairs share one rotation angle (matches the interleaved RoPE layout
    used by :func:`_apply_rope`).
    """
    half = d_head // 2
    freq_idx = torch.arange(half, device=device, dtype=torch.float32)
    # theta_j = 1024^(-2j/d_head); base 1024 is CloverLM's RoPE base.
    inv_freq = 1024.0 ** (-2.0 * freq_idx / d_head)
    angles = positions.float().unsqueeze(-1) * inv_freq
    # Duplicate each angle so channels (2j, 2j+1) rotate together.
    angles = angles.repeat_interleave(2, dim=-1)
    return torch.cos(angles), torch.sin(angles)
38
+
39
+
40
def _apply_rope(
    x: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
) -> torch.Tensor:
    """Apply interleaved rotary position embedding to ``x``.

    For each channel pair (x0, x1) the output is
    (x0*cos - x1*sin, x1*cos + x0*sin); ``cos``/``sin`` must already be
    repeat-interleaved to full ``head_dim`` width (see _build_rope_cos_sin).
    """
    even = x[..., 0::2]
    odd = x[..., 1::2]
    # Build the 90°-rotated companion: (-x1, x0) interleaved back in place.
    rotated = torch.stack((-odd, even), dim=-1).flatten(-2)
    return (x * cos + rotated * sin).to(x.dtype)
49
+
50
+
51
+
52
class CloverLMAttention(nn.Module):
    """CloverLM multi-head attention for vLLM.

    Per forward pass: separate Q/K/V projections → interleaved RoPE
    (base 1024) → L2 ("sphere") normalization of Q and K → per-head
    learnable scale applied to Q → vLLM paged attention → output
    projection.

    Args:
        d: Model width (hidden size).
        num_heads: Number of query heads.
        num_kv_heads: Number of key/value heads (GQA when < num_heads).
        head_dim: Per-head dimension.
        cache_config: vLLM KV-cache configuration, forwarded to Attention.
        quant_config: Optional quantization config for the linear layers.
        prefix: Parameter-name prefix used by vLLM weight loading.
    """

    def __init__(
        self,
        d: int,
        num_heads: int,
        num_kv_heads: int,
        head_dim: int,
        cache_config=None,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ):
        super().__init__()
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.head_dim = head_dim
        # Flattened projection widths for Q and for each of K/V.
        self.q_size = num_heads * head_dim
        self.kv_size = num_kv_heads * head_dim

        self.lq = ColumnParallelLinear(
            d, self.q_size, bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.lq",
        )
        self.lk = ColumnParallelLinear(
            d, self.kv_size, bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.lk",
        )
        self.lv = ColumnParallelLinear(
            d, self.kv_size, bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.lv",
        )
        self.lo = RowParallelLinear(
            self.q_size, d, bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.lo",
        )

        # Per-head learnable scale, stored as (1, heads, 1, 1) to match the
        # checkpoint layout; the trailing dim is squeezed at use time in
        # forward(). Frozen for inference (requires_grad=False).
        self.scale = nn.Parameter(
            torch.empty(1, num_heads, 1, 1),
            requires_grad=False,
        )

        # softmax scale fixed to 1.0 — Q is explicitly pre-scaled by the
        # learned per-head scale after sphere normalization.
        self.attn = Attention(
            num_heads=num_heads,
            head_size=head_dim,
            scale=1.0,
            num_kv_heads=num_kv_heads,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.attn",
        )

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        # vLLM linears return a (output, output_bias) tuple; bias is unused.
        q, _ = self.lq(hidden_states)
        k, _ = self.lk(hidden_states)
        v, _ = self.lv(hidden_states)

        cos, sin = _build_rope_cos_sin(
            positions, self.head_dim, hidden_states.device,
        )

        # (tokens, heads, head_dim) layout for per-head RoPE and norms.
        q = q.view(-1, self.num_heads, self.head_dim)
        k = k.view(-1, self.num_kv_heads, self.head_dim)

        # cos/sin are (tokens, head_dim); unsqueeze the head axis to
        # broadcast the same angles across all heads.
        q = _apply_rope(q, cos.unsqueeze(1), sin.unsqueeze(1))
        k = _apply_rope(k, cos.unsqueeze(1), sin.unsqueeze(1))

        # Sphere normalization: project each head vector onto the unit sphere.
        q = F.normalize(q, dim=-1)
        k = F.normalize(k, dim=-1)

        # scale: (1, heads, 1, 1) → squeeze(-1) gives (1, heads, 1),
        # broadcasting over (tokens, heads, head_dim).
        q = q * self.scale.squeeze(-1)

        # Flatten back to vLLM's expected (tokens, heads*head_dim) layout.
        q = q.reshape(-1, self.q_size)
        k = k.reshape(-1, self.kv_size)

        attn_output = self.attn(q, k, v)
        output, _ = self.lo(attn_output)
        return output
140
+
141
+
142
class CloverLMMLP(nn.Module):
    """CloverLM feed-forward network: Linear → squared-ReLU → Linear.

    Hidden width is fixed at 4·d. The first projection is registered under
    prefix ``.l1.0`` because the HF checkpoint stores it inside a
    ``Sequential`` (weight loading strips the index back out).
    """

    def __init__(
        self,
        d: int,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ):
        super().__init__()
        expanded = 4 * d
        self.l1 = ColumnParallelLinear(
            d,
            expanded,
            bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.l1.0",
        )
        self.l2 = RowParallelLinear(
            expanded,
            d,
            bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.l2",
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden, _ = self.l1(x)
        # Squared-ReLU activation: relu(x)^2.
        hidden = F.relu(hidden).square()
        out, _ = self.l2(hidden)
        return out
168
+
169
+
170
class CloverLMBlock(nn.Module):
    """One CloverLM transformer layer.

    Attention and MLP sublayers, each followed by RMSNorm applied to the
    sublayer *output* before the residual add (post-norm, not pre-norm).
    """

    def __init__(
        self,
        d: int,
        num_heads: int,
        num_kv_heads: int,
        head_dim: int,
        cache_config=None,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ):
        super().__init__()
        self.mhsa = CloverLMAttention(
            d,
            num_heads,
            num_kv_heads,
            head_dim,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.mhsa",
        )
        self.out_att_norm = RMSNorm(d)
        self.mlp = CloverLMMLP(
            d,
            quant_config=quant_config,
            prefix=f"{prefix}.mlp",
        )
        self.out_mlp_norm = RMSNorm(d)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        # residual += norm(sublayer(x)) — post-sublayer normalization.
        hidden_states = hidden_states + self.out_att_norm(
            self.mhsa(positions, hidden_states)
        )
        hidden_states = hidden_states + self.out_mlp_norm(
            self.mlp(hidden_states)
        )
        return hidden_states
213
+
214
+
215
class CloverLMModel(nn.Module):
    """CloverLM backbone: token embedding → N blocks → final RMSNorm.

    Model width is derived as ``config.heads * config.d_head``; the number
    of KV heads per block is ``config.heads // config.ratio`` (GQA).
    """

    def __init__(
        self,
        config,
        cache_config=None,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ):
        super().__init__()
        self.config = config
        d = config.heads * config.d_head  # hidden size

        self.emb = VocabParallelEmbedding(
            config.vocab_size, d,
            quant_config=quant_config,
            prefix=f"{prefix}.emb",
        )
        num_kv_heads = config.heads // config.ratio
        self.blocks = nn.ModuleList([
            CloverLMBlock(
                d,
                config.heads,
                num_kv_heads,
                config.d_head,
                cache_config=cache_config,
                quant_config=quant_config,
                prefix=f"{prefix}.blocks.{idx}",
            )
            for idx in range(config.num_blocks)
        ])
        self.out_norm = RMSNorm(d)

    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Look up token embeddings without running the blocks."""
        return self.emb(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors=None,
        inputs_embeds: torch.Tensor | None = None,
    ) -> torch.Tensor:
        # Caller may supply pre-computed embeddings; otherwise embed here.
        hidden = self.emb(input_ids) if inputs_embeds is None else inputs_embeds

        for layer in self.blocks:
            hidden = layer(positions, hidden)

        return self.out_norm(hidden)
266
+
267
+
268
+
269
# HF checkpoint prefix → vLLM module prefix ("transformer." → "model.").
# NOTE(review): load_weights in this file performs the same rename inline
# with str.replace; this mapper appears otherwise unused — confirm before
# removing or switch load_weights to use it.
_HF_TO_VLLM = WeightsMapper(
    orig_to_new_prefix={"transformer.": "model."},
)
272
+
273
+
274
class CloverLMForCausalLM_vLLM(nn.Module):
    """vLLM causal-LM wrapper around CloverLMModel.

    Adds the LM head (optionally weight-tied to the token embedding),
    logits processing, and HF→vLLM checkpoint weight loading.
    """

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config

        # Model width is implied by head count × head dim.
        d = config.heads * config.d_head
        self.config = config

        self.model = CloverLMModel(
            config,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}model",
        )

        self.lm_head = ParallelLMHead(
            config.vocab_size, d, bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}lm_head",
        )
        self.logits_processor = LogitsProcessor(config.vocab_size)

        # Tie the LM head to the embedding matrix unless the config opts out
        # (defaults to tied when the attribute is absent).
        if getattr(config, "weight_tying", True):
            self.lm_head.weight = self.model.emb.weight

    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Embed token ids via the backbone's embedding table."""
        return self.model.embed_input_ids(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors=None,
        inputs_embeds: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Run the backbone; returns final hidden states (not logits)."""
        hidden_states = self.model(
            input_ids, positions, intermediate_tensors, inputs_embeds,
        )
        return hidden_states

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor | None:
        """Project hidden states to vocabulary logits via the LM head."""
        return self.logits_processor(self.lm_head, hidden_states)

    def load_weights(
        self,
        weights: Iterable[tuple[str, torch.Tensor]],
    ) -> set[str]:
        """Load an HF CloverLM checkpoint into this vLLM module.

        Renames "transformer." → "model." and flattens the HF Sequential
        MLP naming. Returns the set of vLLM parameter names that were
        actually loaded; skipped/unmapped keys are logged.
        """
        params_dict = dict(self.named_parameters(remove_duplicate=False))
        loaded: set[str] = set()

        # With weight tying, skip this exact checkpoint key — presumably the
        # HF output head, which the tied embedding already provides.
        # TODO(review): confirm "transformer.linear" is the HF head module.
        skip_prefixes = set()
        if getattr(self.config, "weight_tying", True):
            skip_prefixes.add("transformer.linear.weight")

        skipped = []
        unmapped = []
        for hf_name, loaded_weight in weights:
            if hf_name in skip_prefixes:
                skipped.append(hf_name)
                continue

            # Map HuggingFace names → vLLM names
            vllm_name = hf_name.replace("transformer.", "model.", 1)

            # In HuggingFace model, MLP l1 is Sequential(Linear, ReLU²),
            # so the linear weight is at "mlp.l1.0.weight". In our vLLM
            # model l1 is a flat ColumnParallelLinear → "mlp.l1.weight".
            vllm_name = vllm_name.replace(".mlp.l1.0.", ".mlp.l1.")

            if vllm_name not in params_dict:
                unmapped.append(f"{hf_name} -> {vllm_name}")
                continue

            param = params_dict[vllm_name]
            # Prefer the parameter's own (possibly sharded) loader if set.
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, loaded_weight)
            loaded.add(vllm_name)

        # Summarize the load so silent mismatches are visible in logs.
        not_loaded = set(params_dict.keys()) - loaded
        import logging
        logger = logging.getLogger(__name__)
        logger.info("Loaded %d/%d params, skipped %d, unmapped %d, "
                    "not_loaded %d",
                    len(loaded), len(params_dict), len(skipped),
                    len(unmapped), len(not_loaded))
        if unmapped:
            logger.warning("Unmapped HF keys: %s", unmapped)
        if not_loaded:
            logger.warning("Params not loaded: %s", sorted(not_loaded))

        return loaded
vllm_plugin/quartet2_quant.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from torch.nn import Parameter
5
+
6
+ from vllm.model_executor.layers.quantization import (
7
+ register_quantization_config,
8
+ )
9
+ from vllm.model_executor.layers.quantization.base_config import (
10
+ QuantizationConfig,
11
+ QuantizeMethodBase,
12
+ )
13
+ from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
14
+ from vllm.model_executor.parameter import ModelWeightParameter
15
+
16
+
17
@register_quantization_config("quartet2")
class QuartetIIConfig(QuantizationConfig):
    """vLLM quantization config for Quartet II on-the-fly NVFP4.

    Weights stay in bf16 on disk and in memory; quantization happens per
    forward pass inside QuartetIILinearMethod, so no extra config files or
    checkpoint formats are involved.
    """

    def get_name(self) -> str:
        return "quartet2"

    def get_supported_act_dtypes(self) -> list:
        return [torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        # NVFP4 kernels need Blackwell-class hardware (SM 10.0).
        return 100

    @staticmethod
    def get_config_filenames() -> list[str]:
        # On-the-fly scheme: nothing to read from the checkpoint directory.
        return []

    @classmethod
    def from_config(cls, config: dict) -> "QuartetIIConfig":
        # Stateless config — the dict carries nothing we need.
        return cls()

    def get_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> QuantizeMethodBase | None:
        # Only linear layers are quantized; everything else runs unquantized.
        if not isinstance(layer, LinearBase):
            return None
        return QuartetIILinearMethod(self)
44
+
45
+
46
class QuartetIILinearMethod(LinearMethodBase):
    """Linear method that quantizes both activations and weights to NVFP4
    on the fly at every forward pass; the stored weight stays in the
    checkpoint dtype (bf16).
    """

    def __init__(self, config: QuartetIIConfig):
        self.config = config

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: list[int],
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        """Register a plain (out, in) weight parameter in the model dtype.

        No packed/quantized storage is created — quantization is deferred
        to apply().
        """
        output_size_per_partition = sum(output_partition_sizes)
        weight = ModelWeightParameter(
            data=torch.empty(
                output_size_per_partition,
                input_size_per_partition,
                dtype=params_dtype,
            ),
            # Dims tell vLLM how to shard this tensor for tensor parallelism.
            input_dim=1,
            output_dim=0,
            weight_loader=extra_weight_attrs.get("weight_loader"),
        )
        layer.register_parameter("weight", weight)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Attach auxiliary buffers to the layer after its weight is loaded.

        NOTE(review): `had` (a normalized 128×128 Hadamard matrix) and
        `scratch_amax` are not referenced in apply() below — presumably
        consumed by the quartet2 kernels elsewhere, or dead setup; confirm.
        """
        from scipy.linalg import hadamard as scipy_hadamard
        device = layer.weight.device
        # Orthonormal Hadamard: H * 128^-0.5 so that H @ H.T == I.
        had_np = scipy_hadamard(128) * 128 ** -0.5
        layer.had = torch.tensor(
            had_np, dtype=torch.bfloat16, device=device, requires_grad=False,
        )
        layer.scratch_amax = torch.empty(
            (), dtype=torch.uint32, device=device,
        )

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Compute x @ W.T via NVFP4: quantize both operands, run the FP4
        matmul, then dequantize-by-alpha, un-pad, and add bias.
        """
        from quartet2.quant import quant_fp4, NVFP4QuantMode
        from quartet2.linear import abs_max, _fp4_mm

        weight = layer.weight
        orig_shape = x.shape
        # Collapse all leading dims into rows for the 2-D kernel.
        flat_x = x.reshape(-1, x.shape[-1])

        # Quartet II requires rows to be multiples of 128; pad if needed.
        num_rows = flat_x.shape[0]
        remainder = num_rows % 128
        if remainder != 0:
            pad_rows = 128 - remainder
            # F.pad pads (last dim: 0,0) and (row dim: 0,pad_rows).
            flat_x = F.pad(flat_x, (0, 0, 0, pad_rows))
        else:
            pad_rows = 0

        # Per-tensor abs-max drives the FP4 scale selection.
        input_amax = abs_max(flat_x)
        weight_amax = abs_max(weight)

        mode = NVFP4QuantMode.FOUR_SIX
        scale_override = 1.0

        input_fp4 = quant_fp4(
            flat_x, amax=input_amax,
            scale_override=scale_override, mode=mode,
        )
        weight_fp4 = quant_fp4(
            weight, amax=weight_amax,
            scale_override=scale_override, mode=mode,
        )

        # Combined dequant factor for the product of two scaled tensors.
        alpha = input_fp4.tensor_scale * weight_fp4.tensor_scale
        output = _fp4_mm(
            input_fp4.fp4, weight_fp4.fp4,
            input_fp4.micro_scales, weight_fp4.micro_scales,
            alpha,
        )

        # Drop the padding rows before restoring the caller's shape.
        if pad_rows > 0:
            output = output[:num_rows]

        output = output.reshape(*orig_shape[:-1], output.shape[-1])
        if bias is not None:
            output = output + bias
        return output
vllm_plugin/serve.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ import argparse
4
+ import os
5
+ import sys
6
+
7
# Resolve paths relative to this file: the plugin directory lives inside
# the model repository, one level below the model root.
PLUGIN_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_DIR = os.path.dirname(PLUGIN_DIR)
sys.path.insert(0, PLUGIN_DIR)

# Register the Quartet II quantization plugin before any vLLM imports
import quartet2_quant  # noqa: F401 — triggers @register_quantization_config

from vllm import ModelRegistry
from cloverlm_vllm import CloverLMForCausalLM_vLLM

# Make vLLM resolve the "CloverLMForCausalLM" architecture string from the
# HF config to our vLLM implementation.
ModelRegistry.register_model(
    "CloverLMForCausalLM", CloverLMForCausalLM_vLLM,
)
20
+
21
+
22
def main():
    """Parse CLI flags and launch offline inference or the API server."""
    cli = argparse.ArgumentParser(description="Serve CloverLM with vLLM")
    cli.add_argument(
        "--model",
        default=MODEL_DIR,
        help="Path to CloverLM model directory",
    )
    cli.add_argument("--api", action="store_true", help="Start OpenAI API server")
    cli.add_argument("--port", type=int, default=8000)
    cli.add_argument("--host", default="0.0.0.0")
    cli.add_argument("--tp", type=int, default=1, help="Tensor parallel size")
    cli.add_argument(
        "--max-model-len",
        type=int,
        default=1024,
        help="Maximum context length",
    )
    cli.add_argument("--gpu-memory-utilization", type=float, default=0.9)
    args = cli.parse_args()

    # Both entry points take the parsed namespace; pick by mode.
    runner = _serve_api if args.api else _offline_inference
    runner(args)
45
+
46
+
47
def _offline_inference(args):
    """Run a few sample prompts through the engine and print the results."""
    from vllm import LLM, SamplingParams

    engine = LLM(
        model=args.model,
        quantization="quartet2",
        trust_remote_code=True,
        dtype="bfloat16",
        max_model_len=args.max_model_len,
        tensor_parallel_size=args.tp,
        gpu_memory_utilization=args.gpu_memory_utilization,
        # Quartet II's on-the-fly quant kernels are incompatible with CUDA
        # graph capture, so eager execution is required.
        enforce_eager=True,
    )

    params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
        max_tokens=128,
    )

    prompts = [
        "The capital of France is",
        "Large language models are",
        "In the year 2030,",
    ]

    banner = "=" * 60
    print(banner)
    print(" CloverLM — vLLM Offline Inference (Quartet II NVFP4)")
    print(banner)
    for result in engine.generate(prompts, params):
        print(f"\nPrompt: {result.prompt}")
        print(f"Generated: {result.outputs[0].text}")
82
+
83
+
84
def _serve_api(args):
    """Launch the OpenAI-compatible vLLM API server for CloverLM."""
    # vLLM's CLI parser reads sys.argv, so synthesize the argument vector
    # from our own flags before importing the server entrypoints.
    argv = ["vllm", "--model", args.model]
    argv += ["--quantization", "quartet2"]
    argv += ["--trust-remote-code"]
    argv += ["--dtype", "bfloat16"]
    argv += ["--max-model-len", str(args.max_model_len)]
    argv += ["--tensor-parallel-size", str(args.tp)]
    argv += ["--gpu-memory-utilization", str(args.gpu_memory_utilization)]
    argv += ["--enforce-eager"]
    argv += ["--host", args.host]
    argv += ["--port", str(args.port)]
    sys.argv = argv

    from vllm.utils.argparse_utils import FlexibleArgumentParser
    from vllm.entrypoints.openai.cli_args import make_arg_parser
    from vllm.entrypoints.openai.api_server import run_server
    import asyncio

    server_args = make_arg_parser(FlexibleArgumentParser()).parse_args()
    asyncio.run(run_server(server_args))
106
+
107
+
108
# Script entry point — only runs when executed directly, not on import.
if __name__ == "__main__":
    main()