amirali1985 committed on
Commit
ceebdc3
·
verified ·
1 Parent(s): b832f99

Upload add_sub_baseline_10K_2L1H128d

Browse files
add_sub_baseline_10K_2L1H128d/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SorlModelWrapper"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": null,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 128,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 512,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention"
18
+ ],
19
+ "max_position_embeddings": 128,
20
+ "max_window_layers": 28,
21
+ "model_type": "qwen3",
22
+ "num_attention_heads": 1,
23
+ "num_hidden_layers": 2,
24
+ "num_key_value_heads": 1,
25
+ "pad_token_id": null,
26
+ "rms_norm_eps": 1e-06,
27
+ "rope_parameters": {
28
+ "rope_theta": 10000.0,
29
+ "rope_type": "default"
30
+ },
31
+ "sliding_window": null,
32
+ "tie_word_embeddings": false,
33
+ "transformers_version": "5.5.0",
34
+ "use_cache": true,
35
+ "use_sliding_window": false,
36
+ "vocab_size": 151645
37
+ }
add_sub_baseline_10K_2L1H128d/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "output_attentions": false,
4
+ "output_hidden_states": false,
5
+ "transformers_version": "5.5.0",
6
+ "use_cache": true
7
+ }
add_sub_baseline_10K_2L1H128d/metrics.json ADDED
@@ -0,0 +1,831 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "history": {
3
+ "step": [
4
+ 50,
5
+ 100,
6
+ 150,
7
+ 200,
8
+ 250,
9
+ 300,
10
+ 350,
11
+ 400,
12
+ 450,
13
+ 500,
14
+ 550,
15
+ 600,
16
+ 650,
17
+ 700,
18
+ 750,
19
+ 800,
20
+ 850,
21
+ 900,
22
+ 950,
23
+ 1000,
24
+ 1050,
25
+ 1100,
26
+ 1150,
27
+ 1200,
28
+ 1250,
29
+ 1300,
30
+ 1350,
31
+ 1400,
32
+ 1450,
33
+ 1500,
34
+ 1550,
35
+ 1600,
36
+ 1650,
37
+ 1700,
38
+ 1750,
39
+ 1800,
40
+ 1850,
41
+ 1900,
42
+ 1950,
43
+ 2000,
44
+ 2050,
45
+ 2100,
46
+ 2150,
47
+ 2200,
48
+ 2250,
49
+ 2300,
50
+ 2350,
51
+ 2400,
52
+ 2450,
53
+ 2500,
54
+ 2550,
55
+ 2600,
56
+ 2650,
57
+ 2700,
58
+ 2750,
59
+ 2800,
60
+ 2850,
61
+ 2900,
62
+ 2950,
63
+ 3000,
64
+ 3050,
65
+ 3100
66
+ ],
67
+ "loss": [
68
+ 11.632269859313965,
69
+ 10.746880531311035,
70
+ 10.444467544555664,
71
+ 10.006251335144043,
72
+ 9.801725387573242,
73
+ 9.507078170776367,
74
+ 9.177903175354004,
75
+ 8.873117446899414,
76
+ 8.627289772033691,
77
+ 8.425443649291992,
78
+ 8.176294326782227,
79
+ 7.824686527252197,
80
+ 7.672322750091553,
81
+ 7.39719820022583,
82
+ 7.087851524353027,
83
+ 6.878063678741455,
84
+ 6.657628059387207,
85
+ 6.365849018096924,
86
+ 6.228781223297119,
87
+ 5.940927982330322,
88
+ 5.686161994934082,
89
+ 5.552325248718262,
90
+ 5.286587715148926,
91
+ 5.14711856842041,
92
+ 4.970963001251221,
93
+ 4.826735019683838,
94
+ 4.654284954071045,
95
+ 4.47059965133667,
96
+ 4.298427581787109,
97
+ 4.127253532409668,
98
+ 4.005905628204346,
99
+ 3.8805015087127686,
100
+ 3.801790475845337,
101
+ 3.6226108074188232,
102
+ 3.5523719787597656,
103
+ 3.5552258491516113,
104
+ 3.372868061065674,
105
+ 3.3437774181365967,
106
+ 3.2328782081604004,
107
+ 3.155165910720825,
108
+ 3.211038112640381,
109
+ 3.113257884979248,
110
+ 2.977780818939209,
111
+ 3.0747323036193848,
112
+ 2.98803448677063,
113
+ 2.9421255588531494,
114
+ 2.9898440837860107,
115
+ 2.9196650981903076,
116
+ 2.8956546783447266,
117
+ 2.9016549587249756,
118
+ 2.8742430210113525,
119
+ 2.872373342514038,
120
+ 2.8761067390441895,
121
+ 2.807882070541382,
122
+ 2.8126957416534424,
123
+ 2.813640594482422,
124
+ 2.8270370960235596,
125
+ 2.8576278686523438,
126
+ 2.861673593521118,
127
+ 2.809460401535034,
128
+ 2.736629009246826,
129
+ 2.832703113555908
130
+ ],
131
+ "base_loss": [
132
+ 11.632269859313965,
133
+ 10.746880531311035,
134
+ 10.444467544555664,
135
+ 10.006251335144043,
136
+ 9.801725387573242,
137
+ 9.507078170776367,
138
+ 9.177903175354004,
139
+ 8.873117446899414,
140
+ 8.627289772033691,
141
+ 8.425443649291992,
142
+ 8.176294326782227,
143
+ 7.824686527252197,
144
+ 7.672322750091553,
145
+ 7.39719820022583,
146
+ 7.087851524353027,
147
+ 6.878063678741455,
148
+ 6.657628059387207,
149
+ 6.365849018096924,
150
+ 6.228781223297119,
151
+ 5.940927982330322,
152
+ 5.686161994934082,
153
+ 5.552325248718262,
154
+ 5.286587715148926,
155
+ 5.14711856842041,
156
+ 4.970963001251221,
157
+ 4.826735019683838,
158
+ 4.654284954071045,
159
+ 4.47059965133667,
160
+ 4.298427581787109,
161
+ 4.127253532409668,
162
+ 4.005905628204346,
163
+ 3.8805015087127686,
164
+ 3.801790475845337,
165
+ 3.6226108074188232,
166
+ 3.5523719787597656,
167
+ 3.5552258491516113,
168
+ 3.372868061065674,
169
+ 3.3437774181365967,
170
+ 3.2328782081604004,
171
+ 3.155165910720825,
172
+ 3.211038112640381,
173
+ 3.113257884979248,
174
+ 2.977780818939209,
175
+ 3.0747323036193848,
176
+ 2.98803448677063,
177
+ 2.9421255588531494,
178
+ 2.9898440837860107,
179
+ 2.9196650981903076,
180
+ 2.8956546783447266,
181
+ 2.9016549587249756,
182
+ 2.8742430210113525,
183
+ 2.872373342514038,
184
+ 2.8761067390441895,
185
+ 2.807882070541382,
186
+ 2.8126957416534424,
187
+ 2.813640594482422,
188
+ 2.8270370960235596,
189
+ 2.8576278686523438,
190
+ 2.861673593521118,
191
+ 2.809460401535034,
192
+ 2.736629009246826,
193
+ 2.832703113555908
194
+ ],
195
+ "lr": [
196
+ 9.800000000000001e-06,
197
+ 1.98e-05,
198
+ 1.9987181950611177e-05,
199
+ 1.9947710543876128e-05,
200
+ 1.9881685800704465e-05,
201
+ 1.978928396036359e-05,
202
+ 1.9670751670274858e-05,
203
+ 1.9526405327639747e-05,
204
+ 1.9356630234883628e-05,
205
+ 1.916187957117136e-05,
206
+ 1.8942673182740125e-05,
207
+ 1.8699596195278423e-05,
208
+ 1.8433297452055233e-05,
209
+ 1.814448778196835e-05,
210
+ 1.7833938102135036e-05,
211
+ 1.7502477360089763e-05,
212
+ 1.7150990321081803e-05,
213
+ 1.6780415206379127e-05,
214
+ 1.6391741188882608e-05,
215
+ 1.598600575273548e-05,
216
+ 1.556429192397602e-05,
217
+ 1.5127725379625657e-05,
218
+ 1.4677471442929161e-05,
219
+ 1.4214731972767564e-05,
220
+ 1.3740742155546771e-05,
221
+ 1.3256767208125341e-05,
222
+ 1.2764099000582231e-05,
223
+ 1.2264052607839339e-05,
224
+ 1.1757962799343548e-05,
225
+ 1.124718047617836e-05,
226
+ 1.0733069065115504e-05,
227
+ 1.021700087923181e-05,
228
+ 9.700353454805953e-06,
229
+ 9.184505874272968e-06,
230
+ 8.67083508505161e-06,
231
+ 8.160712224070698e-06,
232
+ 7.655498957805332e-06,
233
+ 7.156543847592537e-06,
234
+ 6.6651787499283185e-06,
235
+ 6.182715261354808e-06,
236
+ 5.710441217427128e-06,
237
+ 5.249617255105255e-06,
238
+ 4.801473447746826e-06,
239
+ 4.3672060216831465e-06,
240
+ 3.947974163142735e-06,
241
+ 3.5448969240456945e-06,
242
+ 3.1590502349282503e-06,
243
+ 2.791464032970812e-06,
244
+ 2.4431195127956985e-06,
245
+ 2.1149465073729436e-06,
246
+ 1.8078210060253153e-06,
247
+ 1.522562816157701e-06,
248
+ 1.2599333749523968e-06,
249
+ 1.0206337168715263e-06,
250
+ 8.05302602391903e-07,
251
+ 6.145148129672918e-07,
252
+ 4.4877961676932255e-07,
253
+ 3.0853940930242963e-07,
254
+ 1.941685325214182e-07,
255
+ 1.05972275603764e-07,
256
+ 4.4186060043894804e-08,
257
+ 8.97481124466415e-09
258
+ ],
259
+ "eval_step": [
260
+ 156,
261
+ 312,
262
+ 468,
263
+ 624,
264
+ 780,
265
+ 936,
266
+ 1092,
267
+ 1248,
268
+ 1404,
269
+ 1560,
270
+ 1716,
271
+ 1872,
272
+ 2028,
273
+ 2184,
274
+ 2340,
275
+ 2496,
276
+ 2652,
277
+ 2808,
278
+ 2964,
279
+ 3120
280
+ ],
281
+ "eval_epoch": [
282
+ 1,
283
+ 2,
284
+ 3,
285
+ 4,
286
+ 5,
287
+ 6,
288
+ 7,
289
+ 8,
290
+ 9,
291
+ 10,
292
+ 11,
293
+ 12,
294
+ 13,
295
+ 14,
296
+ 15,
297
+ 16,
298
+ 17,
299
+ 18,
300
+ 19,
301
+ 20
302
+ ],
303
+ "eval_accuracy": [
304
+ 0.0,
305
+ 0.0,
306
+ 0.0,
307
+ 0.0,
308
+ 0.0,
309
+ 0.0,
310
+ 0.0,
311
+ 0.0,
312
+ 0.0,
313
+ 0.0,
314
+ 0.0,
315
+ 0.0,
316
+ 0.0,
317
+ 0.0,
318
+ 0.0,
319
+ 0.0,
320
+ 0.0,
321
+ 0.0,
322
+ 0.0,
323
+ 0.0
324
+ ]
325
+ },
326
+ "final_accuracy": 0.0,
327
+ "sft_eval": {
328
+ "config": {
329
+ "ops": "add_sub",
330
+ "K": null,
331
+ "mode": "sft",
332
+ "n_digits": 6,
333
+ "n_per_split": 100
334
+ },
335
+ "splits": {
336
+ "add_S0": {
337
+ "full_accuracy": 0.0,
338
+ "n_examples": 100,
339
+ "per_subtask": {
340
+ "SA": {
341
+ "accuracy": 0.17851239669421487,
342
+ "count": 605
343
+ },
344
+ "SS": {
345
+ "accuracy": 0.0,
346
+ "count": 95
347
+ }
348
+ }
349
+ },
350
+ "add_S1": {
351
+ "full_accuracy": 0.0,
352
+ "n_examples": 100,
353
+ "per_subtask": {
354
+ "SA": {
355
+ "accuracy": 0.24509803921568626,
356
+ "count": 204
357
+ },
358
+ "SC": {
359
+ "accuracy": 0.22485207100591717,
360
+ "count": 169
361
+ },
362
+ "SS": {
363
+ "accuracy": 0.0,
364
+ "count": 31
365
+ },
366
+ "UC": {
367
+ "accuracy": 0.006756756756756757,
368
+ "count": 296
369
+ }
370
+ }
371
+ },
372
+ "add_S2": {
373
+ "full_accuracy": 0.0,
374
+ "n_examples": 100,
375
+ "per_subtask": {
376
+ "SA": {
377
+ "accuracy": 0.2331288343558282,
378
+ "count": 163
379
+ },
380
+ "SC": {
381
+ "accuracy": 0.2,
382
+ "count": 130
383
+ },
384
+ "SS": {
385
+ "accuracy": 0.0,
386
+ "count": 87
387
+ },
388
+ "UC": {
389
+ "accuracy": 0.0,
390
+ "count": 203
391
+ },
392
+ "US": {
393
+ "accuracy": 1.0,
394
+ "count": 117
395
+ }
396
+ }
397
+ },
398
+ "add_S3": {
399
+ "full_accuracy": 0.0,
400
+ "n_examples": 100,
401
+ "per_subtask": {
402
+ "SA": {
403
+ "accuracy": 0.2727272727272727,
404
+ "count": 121
405
+ },
406
+ "SC": {
407
+ "accuracy": 0.1652892561983471,
408
+ "count": 121
409
+ },
410
+ "SS": {
411
+ "accuracy": 0.0,
412
+ "count": 49
413
+ },
414
+ "UC": {
415
+ "accuracy": 0.0,
416
+ "count": 186
417
+ },
418
+ "US": {
419
+ "accuracy": 1.0,
420
+ "count": 223
421
+ }
422
+ }
423
+ },
424
+ "add_S4": {
425
+ "full_accuracy": 0.0,
426
+ "n_examples": 100,
427
+ "per_subtask": {
428
+ "SA": {
429
+ "accuracy": 0.2403846153846154,
430
+ "count": 104
431
+ },
432
+ "SC": {
433
+ "accuracy": 0.22641509433962265,
434
+ "count": 106
435
+ },
436
+ "SS": {
437
+ "accuracy": 0.0,
438
+ "count": 23
439
+ },
440
+ "UC": {
441
+ "accuracy": 0.0,
442
+ "count": 160
443
+ },
444
+ "US": {
445
+ "accuracy": 1.0,
446
+ "count": 307
447
+ }
448
+ }
449
+ },
450
+ "add_S5": {
451
+ "full_accuracy": 0.0,
452
+ "n_examples": 100,
453
+ "per_subtask": {
454
+ "SA": {
455
+ "accuracy": 0.5,
456
+ "count": 100
457
+ },
458
+ "SC": {
459
+ "accuracy": 0.28,
460
+ "count": 100
461
+ },
462
+ "UC": {
463
+ "accuracy": 0.0,
464
+ "count": 100
465
+ },
466
+ "US": {
467
+ "accuracy": 1.0,
468
+ "count": 400
469
+ }
470
+ }
471
+ },
472
+ "add_S6": {
473
+ "full_accuracy": 0.0,
474
+ "n_examples": 100,
475
+ "per_subtask": {
476
+ "SC": {
477
+ "accuracy": 0.21,
478
+ "count": 100
479
+ },
480
+ "UC": {
481
+ "accuracy": 0.0,
482
+ "count": 100
483
+ },
484
+ "US": {
485
+ "accuracy": 1.0,
486
+ "count": 500
487
+ }
488
+ }
489
+ },
490
+ "add_random": {
491
+ "full_accuracy": 0.0,
492
+ "n_examples": 200,
493
+ "per_subtask": {
494
+ "SA": {
495
+ "accuracy": 0.2371364653243848,
496
+ "count": 447
497
+ },
498
+ "SC": {
499
+ "accuracy": 0.2125,
500
+ "count": 320
501
+ },
502
+ "SS": {
503
+ "accuracy": 0.0,
504
+ "count": 56
505
+ },
506
+ "UC": {
507
+ "accuracy": 0.007561436672967864,
508
+ "count": 529
509
+ },
510
+ "US": {
511
+ "accuracy": 1.0,
512
+ "count": 48
513
+ }
514
+ }
515
+ },
516
+ "add_C3": {
517
+ "full_accuracy": 0.0,
518
+ "n_examples": 100,
519
+ "per_subtask": {
520
+ "SA": {
521
+ "accuracy": 0.31666666666666665,
522
+ "count": 300
523
+ },
524
+ "SC": {
525
+ "accuracy": 0.26,
526
+ "count": 100
527
+ },
528
+ "UC": {
529
+ "accuracy": 0.0051813471502590676,
530
+ "count": 193
531
+ },
532
+ "US": {
533
+ "accuracy": 1.0,
534
+ "count": 107
535
+ }
536
+ }
537
+ },
538
+ "add_C4": {
539
+ "full_accuracy": 0.0,
540
+ "n_examples": 100,
541
+ "per_subtask": {
542
+ "SA": {
543
+ "accuracy": 0.415,
544
+ "count": 200
545
+ },
546
+ "SC": {
547
+ "accuracy": 0.21,
548
+ "count": 100
549
+ },
550
+ "UC": {
551
+ "accuracy": 0.0,
552
+ "count": 256
553
+ },
554
+ "US": {
555
+ "accuracy": 1.0,
556
+ "count": 144
557
+ }
558
+ }
559
+ },
560
+ "add_C5": {
561
+ "full_accuracy": 0.0,
562
+ "n_examples": 100,
563
+ "per_subtask": {
564
+ "SA": {
565
+ "accuracy": 0.64,
566
+ "count": 100
567
+ },
568
+ "SC": {
569
+ "accuracy": 0.23,
570
+ "count": 100
571
+ },
572
+ "UC": {
573
+ "accuracy": 0.0,
574
+ "count": 306
575
+ },
576
+ "US": {
577
+ "accuracy": 1.0,
578
+ "count": 194
579
+ }
580
+ }
581
+ },
582
+ "add_C6": {
583
+ "full_accuracy": 0.0,
584
+ "n_examples": 100,
585
+ "per_subtask": {
586
+ "SC": {
587
+ "accuracy": 0.24,
588
+ "count": 100
589
+ },
590
+ "UC": {
591
+ "accuracy": 0.0,
592
+ "count": 366
593
+ },
594
+ "US": {
595
+ "accuracy": 1.0,
596
+ "count": 234
597
+ }
598
+ }
599
+ },
600
+ "sub_M0": {
601
+ "full_accuracy": 0.0,
602
+ "n_examples": 100,
603
+ "per_subtask": {
604
+ "MD": {
605
+ "accuracy": 0.20465890183028287,
606
+ "count": 601
607
+ },
608
+ "ME": {
609
+ "accuracy": 1.0,
610
+ "count": 99
611
+ }
612
+ }
613
+ },
614
+ "sub_M1": {
615
+ "full_accuracy": 0.0,
616
+ "n_examples": 100,
617
+ "per_subtask": {
618
+ "MD": {
619
+ "accuracy": 0.3835125448028674,
620
+ "count": 279
621
+ },
622
+ "MB": {
623
+ "accuracy": 0.0,
624
+ "count": 145
625
+ },
626
+ "ME": {
627
+ "accuracy": 1.0,
628
+ "count": 24
629
+ },
630
+ "UB": {
631
+ "accuracy": 0.09523809523809523,
632
+ "count": 252
633
+ }
634
+ }
635
+ },
636
+ "sub_M2": {
637
+ "full_accuracy": 0.0,
638
+ "n_examples": 100,
639
+ "per_subtask": {
640
+ "MD": {
641
+ "accuracy": 0.6150234741784038,
642
+ "count": 213
643
+ },
644
+ "MB": {
645
+ "accuracy": 0.0,
646
+ "count": 113
647
+ },
648
+ "ME": {
649
+ "accuracy": 1.0,
650
+ "count": 85
651
+ },
652
+ "UB": {
653
+ "accuracy": 0.16574585635359115,
654
+ "count": 181
655
+ },
656
+ "UD": {
657
+ "accuracy": 0.0,
658
+ "count": 108
659
+ }
660
+ }
661
+ },
662
+ "sub_M3": {
663
+ "full_accuracy": 0.0,
664
+ "n_examples": 100,
665
+ "per_subtask": {
666
+ "MD": {
667
+ "accuracy": 0.7597765363128491,
668
+ "count": 179
669
+ },
670
+ "MB": {
671
+ "accuracy": 0.0,
672
+ "count": 103
673
+ },
674
+ "ME": {
675
+ "accuracy": 1.0,
676
+ "count": 56
677
+ },
678
+ "UB": {
679
+ "accuracy": 0.12080536912751678,
680
+ "count": 149
681
+ },
682
+ "UD": {
683
+ "accuracy": 0.0,
684
+ "count": 213
685
+ }
686
+ }
687
+ },
688
+ "sub_M4": {
689
+ "full_accuracy": 0.0,
690
+ "n_examples": 100,
691
+ "per_subtask": {
692
+ "MD": {
693
+ "accuracy": 0.5,
694
+ "count": 200
695
+ },
696
+ "MB": {
697
+ "accuracy": 0.0,
698
+ "count": 100
699
+ },
700
+ "UB": {
701
+ "accuracy": 0.3,
702
+ "count": 100
703
+ },
704
+ "UD": {
705
+ "accuracy": 0.0,
706
+ "count": 300
707
+ }
708
+ }
709
+ },
710
+ "sub_M5": {
711
+ "full_accuracy": 0.0,
712
+ "n_examples": 100,
713
+ "per_subtask": {
714
+ "MD": {
715
+ "accuracy": 1.0,
716
+ "count": 100
717
+ },
718
+ "MB": {
719
+ "accuracy": 0.0,
720
+ "count": 100
721
+ },
722
+ "UB": {
723
+ "accuracy": 0.31,
724
+ "count": 100
725
+ },
726
+ "UD": {
727
+ "accuracy": 0.0,
728
+ "count": 400
729
+ }
730
+ }
731
+ },
732
+ "sub_random": {
733
+ "full_accuracy": 0.0,
734
+ "n_examples": 200,
735
+ "per_subtask": {
736
+ "MD": {
737
+ "accuracy": 0.3616666666666667,
738
+ "count": 600
739
+ },
740
+ "MB": {
741
+ "accuracy": 0.0,
742
+ "count": 267
743
+ },
744
+ "ME": {
745
+ "accuracy": 1.0,
746
+ "count": 53
747
+ },
748
+ "UB": {
749
+ "accuracy": 0.12072892938496584,
750
+ "count": 439
751
+ },
752
+ "UD": {
753
+ "accuracy": 0.0,
754
+ "count": 41
755
+ }
756
+ }
757
+ },
758
+ "sub_B3": {
759
+ "full_accuracy": 0.0,
760
+ "n_examples": 100,
761
+ "per_subtask": {
762
+ "MD": {
763
+ "accuracy": 0.3333333333333333,
764
+ "count": 300
765
+ },
766
+ "MB": {
767
+ "accuracy": 0.0,
768
+ "count": 100
769
+ },
770
+ "UB": {
771
+ "accuracy": 0.17766497461928935,
772
+ "count": 197
773
+ },
774
+ "UD": {
775
+ "accuracy": 0.0,
776
+ "count": 103
777
+ }
778
+ }
779
+ },
780
+ "sub_B4": {
781
+ "full_accuracy": 0.0,
782
+ "n_examples": 100,
783
+ "per_subtask": {
784
+ "MD": {
785
+ "accuracy": 0.5,
786
+ "count": 200
787
+ },
788
+ "MB": {
789
+ "accuracy": 0.0,
790
+ "count": 100
791
+ },
792
+ "UB": {
793
+ "accuracy": 0.145748987854251,
794
+ "count": 247
795
+ },
796
+ "UD": {
797
+ "accuracy": 0.0,
798
+ "count": 153
799
+ }
800
+ }
801
+ },
802
+ "sub_B5": {
803
+ "full_accuracy": 0.0,
804
+ "n_examples": 100,
805
+ "per_subtask": {
806
+ "MD": {
807
+ "accuracy": 1.0,
808
+ "count": 100
809
+ },
810
+ "MB": {
811
+ "accuracy": 0.0,
812
+ "count": 100
813
+ },
814
+ "UB": {
815
+ "accuracy": 0.11073825503355705,
816
+ "count": 298
817
+ },
818
+ "UD": {
819
+ "accuracy": 0.0,
820
+ "count": 202
821
+ }
822
+ }
823
+ }
824
+ },
825
+ "summary": {
826
+ "overall_accuracy": 0.0,
827
+ "total_examples": 2400,
828
+ "n_splits": 22
829
+ }
830
+ }
831
+ }
add_sub_baseline_10K_2L1H128d/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ed798fb07cbccdc55dcd6c1c4e872d058f4b5c532f1cd70f6a60ef685417772
3
+ size 157692826
add_sub_baseline_10K_2L1H128d/train_config.json ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_rollouts": 4,
3
+ "K": 4,
4
+ "max_iterations": 2,
5
+ "memory_span_abs": 1792,
6
+ "memory_span_traj": 1792,
7
+ "temperature": 1.0,
8
+ "ar_search": false,
9
+ "response_only_abs": false,
10
+ "alpha_info_gain": 10.0,
11
+ "alpha_abs": 0.1,
12
+ "alpha_soft_zipf": 1.0,
13
+ "alpha_ortho": 0.0,
14
+ "alpha_anchor": 0.0,
15
+ "alpha_jacobi": 0.0,
16
+ "decay": 0.8,
17
+ "target_vocab_util": 0.8,
18
+ "min_abs_ppl": 0.0,
19
+ "zipf_alpha": 1.0,
20
+ "lr": 2e-05,
21
+ "emb_lr_mult": 1.0,
22
+ "weight_decay": 0.01,
23
+ "warmup_steps": 100,
24
+ "cooldown_frac": 0.4,
25
+ "max_grad_norm": 1.0,
26
+ "vq_abs_pretrain_steps": 0,
27
+ "vq_abs_pretrain_lr": 0.001,
28
+ "vq_abs_pretrain_layer": -1,
29
+ "vq_abs_pretrain_batch_size": 256,
30
+ "vq_abs_pretrain_target_vectors": 20000,
31
+ "batch_size": 64,
32
+ "gradient_accumulation_steps": 1,
33
+ "num_epochs": 20,
34
+ "emb_warmup_steps": 0,
35
+ "log_every": 50,
36
+ "eval_every": 156,
37
+ "save_every": 999999,
38
+ "eval_samples": 100,
39
+ "output_dir": "ckpt/sweep/as_baseline_10K_2L1H128d",
40
+ "eval_K": 4,
41
+ "alpha_traj": 0.0,
42
+ "corrupt_method": "shuffle",
43
+ "corrupt_ratio": 0.3,
44
+ "alpha_contrastive": 1.0,
45
+ "gamma_contrastive": 0.5,
46
+ "alpha_masked_traj": 0.0,
47
+ "mask_nl_ratio": 0.3,
48
+ "mask_nl_mode": "fixed",
49
+ "mask_nl_fixed_id": 0,
50
+ "use_ste": true,
51
+ "n_inner": 1,
52
+ "random_K": null,
53
+ "strip_suffix": null,
54
+ "compress_prefix": null,
55
+ "random_mem_span": null,
56
+ "warmup_ratio": 0.03,
57
+ "beta2": 0.999,
58
+ "seed": 42,
59
+ "n_digits": 6,
60
+ "n_layer": 2,
61
+ "n_head": 1,
62
+ "n_embd": 128,
63
+ "ops": "add_sub",
64
+ "abs_vocab": 0,
65
+ "dataset_size": 10000,
66
+ "mode": "baseline",
67
+ "device": "cuda",
68
+ "push_to_hub": true,
69
+ "no_wandb": false,
70
+ "n_params": 39346560,
71
+ "run_name": "add_sub_baseline_10K_2L1H128d",
72
+ "git_commit": "7c8a203ce79a277a1e41a3ec0648cb73d5b2b760",
73
+ "timestamp": "2026-04-14T01:48:04.187677+00:00",
74
+ "tokenizer": "Qwen/Qwen3-0.6B",
75
+ "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
+ "dataset_config": "add_sub_6digit",
77
+ "model_repo": "thoughtworks/arithmetic-sorl",
78
+ "trainer_version": "sft",
79
+ "wandb_run_id": "mxgukq0x",
80
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/mxgukq0x",
81
+ "final_accuracy": 0.0,
82
+ "sft_accuracy": 0.0,
83
+ "eval_method": "ArithmeticEvaluator"
84
+ }