amirali1985 commited on
Commit
9229b8d
·
verified ·
1 Parent(s): da0cbcd

Upload add_sub_baseline_10K_2L1H128d

Browse files
add_sub_baseline_10K_2L1H128d/metrics.json CHANGED
@@ -65,132 +65,132 @@
65
  3100
66
  ],
67
  "loss": [
68
- 11.632269859313965,
69
- 10.746880531311035,
70
- 10.444467544555664,
71
- 10.006251335144043,
72
- 9.801725387573242,
73
- 9.507078170776367,
74
- 9.177903175354004,
75
- 8.873117446899414,
76
- 8.627289772033691,
77
- 8.425443649291992,
78
- 8.176294326782227,
79
- 7.824686527252197,
80
- 7.672322750091553,
81
- 7.39719820022583,
82
- 7.087851524353027,
83
- 6.878063678741455,
84
- 6.657628059387207,
85
- 6.365849018096924,
86
- 6.228781223297119,
87
- 5.940927982330322,
88
- 5.686161994934082,
89
- 5.552325248718262,
90
- 5.286587715148926,
91
- 5.14711856842041,
92
- 4.970963001251221,
93
- 4.826735019683838,
94
- 4.654284954071045,
95
- 4.47059965133667,
96
- 4.298427581787109,
97
- 4.127253532409668,
98
- 4.005905628204346,
99
- 3.8805015087127686,
100
- 3.801790475845337,
101
- 3.6226108074188232,
102
- 3.5523719787597656,
103
- 3.5552258491516113,
104
- 3.372868061065674,
105
- 3.3437774181365967,
106
- 3.2328782081604004,
107
- 3.155165910720825,
108
- 3.211038112640381,
109
- 3.113257884979248,
110
- 2.977780818939209,
111
- 3.0747323036193848,
112
- 2.98803448677063,
113
- 2.9421255588531494,
114
- 2.9898440837860107,
115
- 2.9196650981903076,
116
- 2.8956546783447266,
117
- 2.9016549587249756,
118
- 2.8742430210113525,
119
- 2.872373342514038,
120
- 2.8761067390441895,
121
- 2.807882070541382,
122
- 2.8126957416534424,
123
- 2.813640594482422,
124
- 2.8270370960235596,
125
- 2.8576278686523438,
126
- 2.861673593521118,
127
- 2.809460401535034,
128
- 2.736629009246826,
129
- 2.832703113555908
130
  ],
131
  "base_loss": [
132
- 11.632269859313965,
133
- 10.746880531311035,
134
- 10.444467544555664,
135
- 10.006251335144043,
136
- 9.801725387573242,
137
- 9.507078170776367,
138
- 9.177903175354004,
139
- 8.873117446899414,
140
- 8.627289772033691,
141
- 8.425443649291992,
142
- 8.176294326782227,
143
- 7.824686527252197,
144
- 7.672322750091553,
145
- 7.39719820022583,
146
- 7.087851524353027,
147
- 6.878063678741455,
148
- 6.657628059387207,
149
- 6.365849018096924,
150
- 6.228781223297119,
151
- 5.940927982330322,
152
- 5.686161994934082,
153
- 5.552325248718262,
154
- 5.286587715148926,
155
- 5.14711856842041,
156
- 4.970963001251221,
157
- 4.826735019683838,
158
- 4.654284954071045,
159
- 4.47059965133667,
160
- 4.298427581787109,
161
- 4.127253532409668,
162
- 4.005905628204346,
163
- 3.8805015087127686,
164
- 3.801790475845337,
165
- 3.6226108074188232,
166
- 3.5523719787597656,
167
- 3.5552258491516113,
168
- 3.372868061065674,
169
- 3.3437774181365967,
170
- 3.2328782081604004,
171
- 3.155165910720825,
172
- 3.211038112640381,
173
- 3.113257884979248,
174
- 2.977780818939209,
175
- 3.0747323036193848,
176
- 2.98803448677063,
177
- 2.9421255588531494,
178
- 2.9898440837860107,
179
- 2.9196650981903076,
180
- 2.8956546783447266,
181
- 2.9016549587249756,
182
- 2.8742430210113525,
183
- 2.872373342514038,
184
- 2.8761067390441895,
185
- 2.807882070541382,
186
- 2.8126957416534424,
187
- 2.813640594482422,
188
- 2.8270370960235596,
189
- 2.8576278686523438,
190
- 2.861673593521118,
191
- 2.809460401535034,
192
- 2.736629009246826,
193
- 2.832703113555908
194
  ],
195
  "lr": [
196
  9.800000000000001e-06,
@@ -330,167 +330,167 @@
330
  "K": null,
331
  "mode": "sft",
332
  "n_digits": 6,
333
- "n_per_split": 50
334
  },
335
  "splits": {
336
  "add_S0": {
337
  "full_accuracy": 0.0,
338
- "digit_accuracy": 0.14857142857142858,
339
- "n_examples": 50,
340
  "per_subtask": {
341
  "SA": {
342
- "accuracy": 0.17627118644067796,
343
- "count": 295
344
  },
345
  "SS": {
346
  "accuracy": 0.0,
347
- "count": 55
348
  }
349
  }
350
  },
351
  "add_S1": {
352
  "full_accuracy": 0.0,
353
- "digit_accuracy": 0.1657142857142857,
354
- "n_examples": 50,
355
  "per_subtask": {
356
  "SA": {
357
- "accuracy": 0.30158730158730157,
358
- "count": 126
359
  },
360
  "SC": {
361
- "accuracy": 0.25316455696202533,
362
- "count": 79
363
  },
364
  "SS": {
365
  "accuracy": 0.0,
366
- "count": 21
367
  },
368
  "UC": {
369
- "accuracy": 0.0,
370
- "count": 124
371
  }
372
  }
373
  },
374
  "add_S2": {
375
  "full_accuracy": 0.0,
376
- "digit_accuracy": 0.28285714285714286,
377
- "n_examples": 50,
378
  "per_subtask": {
379
  "SA": {
380
- "accuracy": 0.24,
381
- "count": 75
382
  },
383
  "SC": {
384
- "accuracy": 0.2903225806451613,
385
- "count": 62
386
  },
387
  "SS": {
388
  "accuracy": 0.0,
389
- "count": 39
390
  },
391
  "UC": {
392
- "accuracy": 0.0,
393
- "count": 111
394
  },
395
  "US": {
396
  "accuracy": 1.0,
397
- "count": 63
398
  }
399
  }
400
  },
401
  "add_S3": {
402
  "full_accuracy": 0.0,
403
  "digit_accuracy": 0.3942857142857143,
404
- "n_examples": 50,
405
  "per_subtask": {
406
  "SA": {
407
- "accuracy": 0.26666666666666666,
408
- "count": 60
409
  },
410
  "SC": {
411
- "accuracy": 0.21052631578947367,
412
- "count": 57
413
  },
414
  "SS": {
415
  "accuracy": 0.0,
416
- "count": 19
417
  },
418
  "UC": {
419
  "accuracy": 0.0,
420
- "count": 104
421
  },
422
  "US": {
423
  "accuracy": 1.0,
424
- "count": 110
425
  }
426
  }
427
  },
428
  "add_S4": {
429
  "full_accuracy": 0.0,
430
  "digit_accuracy": 0.5085714285714286,
431
- "n_examples": 50,
432
  "per_subtask": {
433
  "SA": {
434
- "accuracy": 0.20833333333333334,
435
- "count": 48
436
  },
437
  "SC": {
438
- "accuracy": 0.2692307692307692,
439
- "count": 52
440
  },
441
  "SS": {
442
  "accuracy": 0.0,
443
- "count": 7
444
  },
445
  "UC": {
446
  "accuracy": 0.0,
447
- "count": 89
448
  },
449
  "US": {
450
  "accuracy": 1.0,
451
- "count": 154
452
  }
453
  }
454
  },
455
  "add_S5": {
456
  "full_accuracy": 0.0,
457
- "digit_accuracy": 0.7,
458
- "n_examples": 50,
459
  "per_subtask": {
460
  "SA": {
461
- "accuracy": 0.54,
462
- "count": 50
463
  },
464
  "SC": {
465
- "accuracy": 0.36,
466
- "count": 50
467
  },
468
  "UC": {
469
  "accuracy": 0.0,
470
- "count": 50
471
  },
472
  "US": {
473
  "accuracy": 1.0,
474
- "count": 200
475
  }
476
  }
477
  },
478
  "add_S6": {
479
  "full_accuracy": 0.0,
480
- "digit_accuracy": 0.7628571428571429,
481
- "n_examples": 50,
482
  "per_subtask": {
483
  "SC": {
484
- "accuracy": 0.34,
485
- "count": 50
486
  },
487
  "UC": {
488
  "accuracy": 0.0,
489
- "count": 50
490
  },
491
  "US": {
492
  "accuracy": 1.0,
493
- "count": 250
494
  }
495
  }
496
  },
@@ -500,292 +500,292 @@
500
  "n_examples": 200,
501
  "per_subtask": {
502
  "SA": {
503
- "accuracy": 0.23665893271461716,
504
- "count": 431
505
  },
506
  "SC": {
507
- "accuracy": 0.1962025316455696,
508
- "count": 316
509
  },
510
  "SS": {
511
  "accuracy": 0.0,
512
- "count": 39
513
  },
514
  "UC": {
515
- "accuracy": 0.007142857142857143,
516
- "count": 560
517
  },
518
  "US": {
519
  "accuracy": 1.0,
520
- "count": 54
521
  }
522
  }
523
  },
524
  "add_C1": {
525
  "full_accuracy": 0.0,
526
- "digit_accuracy": 0.22857142857142856,
527
- "n_examples": 50,
528
  "per_subtask": {
529
  "SA": {
530
- "accuracy": 0.272,
531
- "count": 250
532
  },
533
  "SC": {
534
- "accuracy": 0.24,
535
- "count": 50
536
  },
537
  "UC": {
538
  "accuracy": 0.0,
539
- "count": 50
540
  }
541
  }
542
  },
543
  "add_C2": {
544
  "full_accuracy": 0.0,
545
- "digit_accuracy": 0.26571428571428574,
546
- "n_examples": 50,
547
  "per_subtask": {
548
  "SA": {
549
- "accuracy": 0.32,
550
- "count": 200
551
  },
552
  "SC": {
553
- "accuracy": 0.22,
554
- "count": 50
555
  },
556
  "UC": {
557
- "accuracy": 0.012048192771084338,
558
- "count": 83
559
  },
560
  "US": {
561
  "accuracy": 1.0,
562
- "count": 17
563
  }
564
  }
565
  },
566
  "add_C3": {
567
  "full_accuracy": 0.0,
568
- "digit_accuracy": 0.34285714285714286,
569
- "n_examples": 50,
570
  "per_subtask": {
571
  "SA": {
572
- "accuracy": 0.37333333333333335,
573
- "count": 150
574
  },
575
  "SC": {
576
- "accuracy": 0.28,
577
- "count": 50
578
  },
579
  "UC": {
580
- "accuracy": 0.0,
581
- "count": 100
582
  },
583
  "US": {
584
  "accuracy": 1.0,
585
- "count": 50
586
  }
587
  }
588
  },
589
  "add_C4": {
590
  "full_accuracy": 0.0,
591
- "digit_accuracy": 0.3457142857142857,
592
- "n_examples": 50,
593
  "per_subtask": {
594
  "SA": {
595
- "accuracy": 0.44,
596
- "count": 100
597
  },
598
  "SC": {
599
- "accuracy": 0.18,
600
- "count": 50
601
  },
602
  "UC": {
603
  "accuracy": 0.0,
604
- "count": 132
605
  },
606
  "US": {
607
  "accuracy": 1.0,
608
- "count": 68
609
  }
610
  }
611
  },
612
  "add_C5": {
613
  "full_accuracy": 0.0,
614
  "digit_accuracy": 0.38857142857142857,
615
- "n_examples": 50,
616
  "per_subtask": {
617
  "SA": {
618
- "accuracy": 0.48,
619
- "count": 50
620
  },
621
  "SC": {
622
- "accuracy": 0.16,
623
- "count": 50
624
  },
625
  "UC": {
626
- "accuracy": 0.0,
627
- "count": 146
628
  },
629
  "US": {
630
  "accuracy": 1.0,
631
- "count": 104
632
  }
633
  }
634
  },
635
  "add_C6": {
636
  "full_accuracy": 0.0,
637
- "digit_accuracy": 0.3514285714285714,
638
- "n_examples": 50,
639
  "per_subtask": {
640
  "SC": {
641
- "accuracy": 0.22,
642
- "count": 50
643
  },
644
  "UC": {
645
- "accuracy": 0.005291005291005291,
646
- "count": 189
647
  },
648
  "US": {
649
  "accuracy": 1.0,
650
- "count": 111
651
  }
652
  }
653
  },
654
  "sub_M0": {
655
  "full_accuracy": 0.0,
656
- "digit_accuracy": 0.3057142857142857,
657
- "n_examples": 50,
658
  "per_subtask": {
659
  "MD": {
660
- "accuracy": 0.19801980198019803,
661
- "count": 303
662
  },
663
  "ME": {
664
  "accuracy": 1.0,
665
- "count": 47
666
  }
667
  }
668
  },
669
  "sub_M1": {
670
  "full_accuracy": 0.0,
671
- "digit_accuracy": 0.26285714285714284,
672
- "n_examples": 50,
673
  "per_subtask": {
674
  "MD": {
675
- "accuracy": 0.3900709219858156,
676
- "count": 141
677
  },
678
  "MB": {
679
  "accuracy": 0.0,
680
- "count": 72
681
  },
682
  "ME": {
683
  "accuracy": 1.0,
684
- "count": 18
685
  },
686
  "UB": {
687
- "accuracy": 0.15966386554621848,
688
- "count": 119
689
  }
690
  }
691
  },
692
  "sub_M2": {
693
  "full_accuracy": 0.0,
694
- "digit_accuracy": 0.38285714285714284,
695
- "n_examples": 50,
696
  "per_subtask": {
697
  "MD": {
698
- "accuracy": 0.6428571428571429,
699
- "count": 112
700
  },
701
  "MB": {
702
  "accuracy": 0.0,
703
- "count": 53
704
  },
705
  "ME": {
706
  "accuracy": 1.0,
707
- "count": 47
708
  },
709
  "UB": {
710
- "accuracy": 0.17647058823529413,
711
- "count": 85
712
  },
713
  "UD": {
714
  "accuracy": 0.0,
715
- "count": 53
716
  }
717
  }
718
  },
719
  "sub_M3": {
720
  "full_accuracy": 0.0,
721
- "digit_accuracy": 0.28,
722
- "n_examples": 50,
723
  "per_subtask": {
724
  "MD": {
725
- "accuracy": 0.6494845360824743,
726
- "count": 97
727
  },
728
  "MB": {
729
  "accuracy": 0.0,
730
- "count": 51
731
  },
732
  "ME": {
733
  "accuracy": 1.0,
734
- "count": 27
735
  },
736
  "UB": {
737
- "accuracy": 0.10810810810810811,
738
- "count": 74
739
  },
740
  "UD": {
741
  "accuracy": 0.0,
742
- "count": 101
743
  }
744
  }
745
  },
746
  "sub_M4": {
747
  "full_accuracy": 0.0,
748
- "digit_accuracy": 0.21142857142857144,
749
- "n_examples": 50,
750
  "per_subtask": {
751
  "MD": {
752
  "accuracy": 0.5,
753
- "count": 100
754
  },
755
  "MB": {
756
  "accuracy": 0.0,
757
- "count": 50
758
  },
759
  "UB": {
760
- "accuracy": 0.48,
761
- "count": 50
762
  },
763
  "UD": {
764
  "accuracy": 0.0,
765
- "count": 150
766
  }
767
  }
768
  },
769
  "sub_M5": {
770
  "full_accuracy": 0.0,
771
- "digit_accuracy": 0.18285714285714286,
772
- "n_examples": 50,
773
  "per_subtask": {
774
  "MD": {
775
  "accuracy": 1.0,
776
- "count": 50
777
  },
778
  "MB": {
779
  "accuracy": 0.0,
780
- "count": 50
781
  },
782
  "UB": {
783
- "accuracy": 0.28,
784
- "count": 50
785
  },
786
  "UD": {
787
  "accuracy": 0.0,
788
- "count": 200
789
  }
790
  }
791
  },
@@ -795,101 +795,101 @@
795
  "n_examples": 200,
796
  "per_subtask": {
797
  "MD": {
798
- "accuracy": 0.37719298245614036,
799
- "count": 570
800
  },
801
  "MB": {
802
  "accuracy": 0.0,
803
- "count": 277
804
  },
805
  "ME": {
806
  "accuracy": 1.0,
807
  "count": 53
808
  },
809
  "UB": {
810
- "accuracy": 0.11677282377919321,
811
- "count": 471
812
  },
813
  "UD": {
814
  "accuracy": 0.0,
815
- "count": 29
816
  }
817
  }
818
  },
819
  "sub_B3": {
820
  "full_accuracy": 0.0,
821
- "digit_accuracy": 0.18857142857142858,
822
- "n_examples": 50,
823
  "per_subtask": {
824
  "MD": {
825
  "accuracy": 0.3333333333333333,
826
- "count": 150
827
  },
828
  "MB": {
829
  "accuracy": 0.0,
830
- "count": 50
831
  },
832
  "UB": {
833
- "accuracy": 0.15841584158415842,
834
- "count": 101
835
  },
836
  "UD": {
837
  "accuracy": 0.0,
838
- "count": 49
839
  }
840
  }
841
  },
842
  "sub_B4": {
843
  "full_accuracy": 0.0,
844
- "digit_accuracy": 0.17714285714285713,
845
- "n_examples": 50,
846
  "per_subtask": {
847
  "MD": {
848
  "accuracy": 0.5,
849
- "count": 100
850
  },
851
  "MB": {
852
  "accuracy": 0.0,
853
- "count": 50
854
  },
855
  "UB": {
856
- "accuracy": 0.09917355371900827,
857
- "count": 121
858
  },
859
  "UD": {
860
  "accuracy": 0.0,
861
- "count": 79
862
  }
863
  }
864
  },
865
  "sub_B5": {
866
  "full_accuracy": 0.0,
867
- "digit_accuracy": 0.18,
868
- "n_examples": 50,
869
  "per_subtask": {
870
  "MD": {
871
  "accuracy": 1.0,
872
- "count": 50
873
  },
874
  "MB": {
875
  "accuracy": 0.0,
876
- "count": 50
877
  },
878
  "UB": {
879
- "accuracy": 0.08552631578947369,
880
- "count": 152
881
  },
882
  "UD": {
883
  "accuracy": 0.0,
884
- "count": 98
885
  }
886
  }
887
  }
888
  },
889
  "summary": {
890
  "overall_accuracy": 0.0,
891
- "digit_accuracy": 0.2867619047619048,
892
- "total_examples": 1500,
893
  "n_splits": 24
894
  }
895
  }
 
65
  3100
66
  ],
67
  "loss": [
68
+ 11.642115592956543,
69
+ 10.80324649810791,
70
+ 10.34417724609375,
71
+ 10.04995059967041,
72
+ 9.737959861755371,
73
+ 9.45477294921875,
74
+ 9.285505294799805,
75
+ 8.961015701293945,
76
+ 8.579405784606934,
77
+ 8.42731761932373,
78
+ 8.175429344177246,
79
+ 7.936665058135986,
80
+ 7.630360126495361,
81
+ 7.336586952209473,
82
+ 7.065025329589844,
83
+ 6.900625705718994,
84
+ 6.648900508880615,
85
+ 6.47622013092041,
86
+ 6.186995029449463,
87
+ 5.953526020050049,
88
+ 5.7461323738098145,
89
+ 5.58115291595459,
90
+ 5.248443603515625,
91
+ 5.121757507324219,
92
+ 4.968733787536621,
93
+ 4.763223171234131,
94
+ 4.608892917633057,
95
+ 4.488630771636963,
96
+ 4.2728352546691895,
97
+ 4.094915866851807,
98
+ 4.042762756347656,
99
+ 3.8673317432403564,
100
+ 3.705319881439209,
101
+ 3.728566884994507,
102
+ 3.5413622856140137,
103
+ 3.5115506649017334,
104
+ 3.3972582817077637,
105
+ 3.352463483810425,
106
+ 3.3156659603118896,
107
+ 3.187150478363037,
108
+ 3.145035982131958,
109
+ 3.1074206829071045,
110
+ 3.0765254497528076,
111
+ 3.0211308002471924,
112
+ 2.9677698612213135,
113
+ 2.983156204223633,
114
+ 2.895720958709717,
115
+ 2.862177610397339,
116
+ 2.9088594913482666,
117
+ 2.9126336574554443,
118
+ 2.8361401557922363,
119
+ 2.851780414581299,
120
+ 2.870859384536743,
121
+ 2.7742607593536377,
122
+ 2.7949109077453613,
123
+ 2.836515426635742,
124
+ 2.8090367317199707,
125
+ 2.846834421157837,
126
+ 2.775794506072998,
127
+ 2.7425057888031006,
128
+ 2.8071517944335938,
129
+ 2.822736978530884
130
  ],
131
  "base_loss": [
132
+ 11.642115592956543,
133
+ 10.80324649810791,
134
+ 10.34417724609375,
135
+ 10.04995059967041,
136
+ 9.737959861755371,
137
+ 9.45477294921875,
138
+ 9.285505294799805,
139
+ 8.961015701293945,
140
+ 8.579405784606934,
141
+ 8.42731761932373,
142
+ 8.175429344177246,
143
+ 7.936665058135986,
144
+ 7.630360126495361,
145
+ 7.336586952209473,
146
+ 7.065025329589844,
147
+ 6.900625705718994,
148
+ 6.648900508880615,
149
+ 6.47622013092041,
150
+ 6.186995029449463,
151
+ 5.953526020050049,
152
+ 5.7461323738098145,
153
+ 5.58115291595459,
154
+ 5.248443603515625,
155
+ 5.121757507324219,
156
+ 4.968733787536621,
157
+ 4.763223171234131,
158
+ 4.608892917633057,
159
+ 4.488630771636963,
160
+ 4.2728352546691895,
161
+ 4.094915866851807,
162
+ 4.042762756347656,
163
+ 3.8673317432403564,
164
+ 3.705319881439209,
165
+ 3.728566884994507,
166
+ 3.5413622856140137,
167
+ 3.5115506649017334,
168
+ 3.3972582817077637,
169
+ 3.352463483810425,
170
+ 3.3156659603118896,
171
+ 3.187150478363037,
172
+ 3.145035982131958,
173
+ 3.1074206829071045,
174
+ 3.0765254497528076,
175
+ 3.0211308002471924,
176
+ 2.9677698612213135,
177
+ 2.983156204223633,
178
+ 2.895720958709717,
179
+ 2.862177610397339,
180
+ 2.9088594913482666,
181
+ 2.9126336574554443,
182
+ 2.8361401557922363,
183
+ 2.851780414581299,
184
+ 2.870859384536743,
185
+ 2.7742607593536377,
186
+ 2.7949109077453613,
187
+ 2.836515426635742,
188
+ 2.8090367317199707,
189
+ 2.846834421157837,
190
+ 2.775794506072998,
191
+ 2.7425057888031006,
192
+ 2.8071517944335938,
193
+ 2.822736978530884
194
  ],
195
  "lr": [
196
  9.800000000000001e-06,
 
330
  "K": null,
331
  "mode": "sft",
332
  "n_digits": 6,
333
+ "n_per_split": 100
334
  },
335
  "splits": {
336
  "add_S0": {
337
  "full_accuracy": 0.0,
338
+ "digit_accuracy": 0.15428571428571428,
339
+ "n_examples": 100,
340
  "per_subtask": {
341
  "SA": {
342
+ "accuracy": 0.17851239669421487,
343
+ "count": 605
344
  },
345
  "SS": {
346
  "accuracy": 0.0,
347
+ "count": 95
348
  }
349
  }
350
  },
351
  "add_S1": {
352
  "full_accuracy": 0.0,
353
+ "digit_accuracy": 0.12857142857142856,
354
+ "n_examples": 100,
355
  "per_subtask": {
356
  "SA": {
357
+ "accuracy": 0.24509803921568626,
358
+ "count": 204
359
  },
360
  "SC": {
361
+ "accuracy": 0.22485207100591717,
362
+ "count": 169
363
  },
364
  "SS": {
365
  "accuracy": 0.0,
366
+ "count": 31
367
  },
368
  "UC": {
369
+ "accuracy": 0.006756756756756757,
370
+ "count": 296
371
  }
372
  }
373
  },
374
  "add_S2": {
375
  "full_accuracy": 0.0,
376
+ "digit_accuracy": 0.26142857142857145,
377
+ "n_examples": 100,
378
  "per_subtask": {
379
  "SA": {
380
+ "accuracy": 0.2331288343558282,
381
+ "count": 163
382
  },
383
  "SC": {
384
+ "accuracy": 0.2,
385
+ "count": 130
386
  },
387
  "SS": {
388
  "accuracy": 0.0,
389
+ "count": 87
390
  },
391
  "UC": {
392
+ "accuracy": 0.009852216748768473,
393
+ "count": 203
394
  },
395
  "US": {
396
  "accuracy": 1.0,
397
+ "count": 117
398
  }
399
  }
400
  },
401
  "add_S3": {
402
  "full_accuracy": 0.0,
403
  "digit_accuracy": 0.3942857142857143,
404
+ "n_examples": 100,
405
  "per_subtask": {
406
  "SA": {
407
+ "accuracy": 0.2727272727272727,
408
+ "count": 121
409
  },
410
  "SC": {
411
+ "accuracy": 0.1652892561983471,
412
+ "count": 121
413
  },
414
  "SS": {
415
  "accuracy": 0.0,
416
+ "count": 49
417
  },
418
  "UC": {
419
  "accuracy": 0.0,
420
+ "count": 186
421
  },
422
  "US": {
423
  "accuracy": 1.0,
424
+ "count": 223
425
  }
426
  }
427
  },
428
  "add_S4": {
429
  "full_accuracy": 0.0,
430
  "digit_accuracy": 0.5085714285714286,
431
+ "n_examples": 100,
432
  "per_subtask": {
433
  "SA": {
434
+ "accuracy": 0.2403846153846154,
435
+ "count": 104
436
  },
437
  "SC": {
438
+ "accuracy": 0.22641509433962265,
439
+ "count": 106
440
  },
441
  "SS": {
442
  "accuracy": 0.0,
443
+ "count": 23
444
  },
445
  "UC": {
446
  "accuracy": 0.0,
447
+ "count": 160
448
  },
449
  "US": {
450
  "accuracy": 1.0,
451
+ "count": 307
452
  }
453
  }
454
  },
455
  "add_S5": {
456
  "full_accuracy": 0.0,
457
+ "digit_accuracy": 0.6828571428571428,
458
+ "n_examples": 100,
459
  "per_subtask": {
460
  "SA": {
461
+ "accuracy": 0.5,
462
+ "count": 100
463
  },
464
  "SC": {
465
+ "accuracy": 0.28,
466
+ "count": 100
467
  },
468
  "UC": {
469
  "accuracy": 0.0,
470
+ "count": 100
471
  },
472
  "US": {
473
  "accuracy": 1.0,
474
+ "count": 400
475
  }
476
  }
477
  },
478
  "add_S6": {
479
  "full_accuracy": 0.0,
480
+ "digit_accuracy": 0.7442857142857143,
481
+ "n_examples": 100,
482
  "per_subtask": {
483
  "SC": {
484
+ "accuracy": 0.21,
485
+ "count": 100
486
  },
487
  "UC": {
488
  "accuracy": 0.0,
489
+ "count": 100
490
  },
491
  "US": {
492
  "accuracy": 1.0,
493
+ "count": 500
494
  }
495
  }
496
  },
 
500
  "n_examples": 200,
501
  "per_subtask": {
502
  "SA": {
503
+ "accuracy": 0.2371364653243848,
504
+ "count": 447
505
  },
506
  "SC": {
507
+ "accuracy": 0.2125,
508
+ "count": 320
509
  },
510
  "SS": {
511
  "accuracy": 0.0,
512
+ "count": 56
513
  },
514
  "UC": {
515
+ "accuracy": 0.0,
516
+ "count": 529
517
  },
518
  "US": {
519
  "accuracy": 1.0,
520
+ "count": 48
521
  }
522
  }
523
  },
524
  "add_C1": {
525
  "full_accuracy": 0.0,
526
+ "digit_accuracy": 0.22428571428571428,
527
+ "n_examples": 100,
528
  "per_subtask": {
529
  "SA": {
530
+ "accuracy": 0.262,
531
+ "count": 500
532
  },
533
  "SC": {
534
+ "accuracy": 0.26,
535
+ "count": 100
536
  },
537
  "UC": {
538
  "accuracy": 0.0,
539
+ "count": 100
540
  }
541
  }
542
  },
543
  "add_C2": {
544
  "full_accuracy": 0.0,
545
+ "digit_accuracy": 0.26142857142857145,
546
+ "n_examples": 100,
547
  "per_subtask": {
548
  "SA": {
549
+ "accuracy": 0.3025,
550
+ "count": 400
551
  },
552
  "SC": {
553
+ "accuracy": 0.18,
554
+ "count": 100
555
  },
556
  "UC": {
557
+ "accuracy": 0.0,
558
+ "count": 156
559
  },
560
  "US": {
561
  "accuracy": 1.0,
562
+ "count": 44
563
  }
564
  }
565
  },
566
  "add_C3": {
567
  "full_accuracy": 0.0,
568
+ "digit_accuracy": 0.32571428571428573,
569
+ "n_examples": 100,
570
  "per_subtask": {
571
  "SA": {
572
+ "accuracy": 0.3333333333333333,
573
+ "count": 300
574
  },
575
  "SC": {
576
+ "accuracy": 0.26,
577
+ "count": 100
578
  },
579
  "UC": {
580
+ "accuracy": 0.005025125628140704,
581
+ "count": 199
582
  },
583
  "US": {
584
  "accuracy": 1.0,
585
+ "count": 101
586
  }
587
  }
588
  },
589
  "add_C4": {
590
  "full_accuracy": 0.0,
591
+ "digit_accuracy": 0.3485714285714286,
592
+ "n_examples": 100,
593
  "per_subtask": {
594
  "SA": {
595
+ "accuracy": 0.42,
596
+ "count": 200
597
  },
598
  "SC": {
599
+ "accuracy": 0.24,
600
+ "count": 100
601
  },
602
  "UC": {
603
  "accuracy": 0.0,
604
+ "count": 264
605
  },
606
  "US": {
607
  "accuracy": 1.0,
608
+ "count": 136
609
  }
610
  }
611
  },
612
  "add_C5": {
613
  "full_accuracy": 0.0,
614
  "digit_accuracy": 0.38857142857142857,
615
+ "n_examples": 100,
616
  "per_subtask": {
617
  "SA": {
618
+ "accuracy": 0.59,
619
+ "count": 100
620
  },
621
  "SC": {
622
+ "accuracy": 0.22,
623
+ "count": 100
624
  },
625
  "UC": {
626
+ "accuracy": 0.0032258064516129032,
627
+ "count": 310
628
  },
629
  "US": {
630
  "accuracy": 1.0,
631
+ "count": 190
632
  }
633
  }
634
  },
635
  "add_C6": {
636
  "full_accuracy": 0.0,
637
+ "digit_accuracy": 0.3585714285714286,
638
+ "n_examples": 100,
639
  "per_subtask": {
640
  "SC": {
641
+ "accuracy": 0.2,
642
+ "count": 100
643
  },
644
  "UC": {
645
+ "accuracy": 0.002702702702702703,
646
+ "count": 370
647
  },
648
  "US": {
649
  "accuracy": 1.0,
650
+ "count": 230
651
  }
652
  }
653
  },
654
  "sub_M0": {
655
  "full_accuracy": 0.0,
656
+ "digit_accuracy": 0.29285714285714287,
657
+ "n_examples": 100,
658
  "per_subtask": {
659
  "MD": {
660
+ "accuracy": 0.1951219512195122,
661
+ "count": 615
662
  },
663
  "ME": {
664
  "accuracy": 1.0,
665
+ "count": 85
666
  }
667
  }
668
  },
669
  "sub_M1": {
670
  "full_accuracy": 0.0,
671
+ "digit_accuracy": 0.22428571428571428,
672
+ "n_examples": 100,
673
  "per_subtask": {
674
  "MD": {
675
+ "accuracy": 0.3698630136986301,
676
+ "count": 292
677
  },
678
  "MB": {
679
  "accuracy": 0.0,
680
+ "count": 144
681
  },
682
  "ME": {
683
  "accuracy": 1.0,
684
+ "count": 25
685
  },
686
  "UB": {
687
+ "accuracy": 0.100418410041841,
688
+ "count": 239
689
  }
690
  }
691
  },
692
  "sub_M2": {
693
  "full_accuracy": 0.0,
694
+ "digit_accuracy": 0.35428571428571426,
695
+ "n_examples": 100,
696
  "per_subtask": {
697
  "MD": {
698
+ "accuracy": 0.6208530805687204,
699
+ "count": 211
700
  },
701
  "MB": {
702
  "accuracy": 0.0,
703
+ "count": 115
704
  },
705
  "ME": {
706
  "accuracy": 1.0,
707
+ "count": 85
708
  },
709
  "UB": {
710
+ "accuracy": 0.17679558011049723,
711
+ "count": 181
712
  },
713
  "UD": {
714
  "accuracy": 0.0,
715
+ "count": 108
716
  }
717
  }
718
  },
719
  "sub_M3": {
720
  "full_accuracy": 0.0,
721
+ "digit_accuracy": 0.3,
722
+ "n_examples": 100,
723
  "per_subtask": {
724
  "MD": {
725
+ "accuracy": 0.7597765363128491,
726
+ "count": 179
727
  },
728
  "MB": {
729
  "accuracy": 0.0,
730
+ "count": 103
731
  },
732
  "ME": {
733
  "accuracy": 1.0,
734
+ "count": 56
735
  },
736
  "UB": {
737
+ "accuracy": 0.12080536912751678,
738
+ "count": 149
739
  },
740
  "UD": {
741
  "accuracy": 0.0,
742
+ "count": 213
743
  }
744
  }
745
  },
746
  "sub_M4": {
747
  "full_accuracy": 0.0,
748
+ "digit_accuracy": 0.18571428571428572,
749
+ "n_examples": 100,
750
  "per_subtask": {
751
  "MD": {
752
  "accuracy": 0.5,
753
+ "count": 200
754
  },
755
  "MB": {
756
  "accuracy": 0.0,
757
+ "count": 100
758
  },
759
  "UB": {
760
+ "accuracy": 0.3,
761
+ "count": 100
762
  },
763
  "UD": {
764
  "accuracy": 0.0,
765
+ "count": 300
766
  }
767
  }
768
  },
769
  "sub_M5": {
770
  "full_accuracy": 0.0,
771
+ "digit_accuracy": 0.18714285714285714,
772
+ "n_examples": 100,
773
  "per_subtask": {
774
  "MD": {
775
  "accuracy": 1.0,
776
+ "count": 100
777
  },
778
  "MB": {
779
  "accuracy": 0.0,
780
+ "count": 100
781
  },
782
  "UB": {
783
+ "accuracy": 0.31,
784
+ "count": 100
785
  },
786
  "UD": {
787
  "accuracy": 0.0,
788
+ "count": 400
789
  }
790
  }
791
  },
 
795
  "n_examples": 200,
796
  "per_subtask": {
797
  "MD": {
798
+ "accuracy": 0.3616666666666667,
799
+ "count": 600
800
  },
801
  "MB": {
802
  "accuracy": 0.0,
803
+ "count": 267
804
  },
805
  "ME": {
806
  "accuracy": 1.0,
807
  "count": 53
808
  },
809
  "UB": {
810
+ "accuracy": 0.12072892938496584,
811
+ "count": 439
812
  },
813
  "UD": {
814
  "accuracy": 0.0,
815
+ "count": 41
816
  }
817
  }
818
  },
819
  "sub_B3": {
820
  "full_accuracy": 0.0,
821
+ "digit_accuracy": 0.19285714285714287,
822
+ "n_examples": 100,
823
  "per_subtask": {
824
  "MD": {
825
  "accuracy": 0.3333333333333333,
826
+ "count": 300
827
  },
828
  "MB": {
829
  "accuracy": 0.0,
830
+ "count": 100
831
  },
832
  "UB": {
833
+ "accuracy": 0.17766497461928935,
834
+ "count": 197
835
  },
836
  "UD": {
837
  "accuracy": 0.0,
838
+ "count": 103
839
  }
840
  }
841
  },
842
  "sub_B4": {
843
  "full_accuracy": 0.0,
844
+ "digit_accuracy": 0.19428571428571428,
845
+ "n_examples": 100,
846
  "per_subtask": {
847
  "MD": {
848
  "accuracy": 0.5,
849
+ "count": 200
850
  },
851
  "MB": {
852
  "accuracy": 0.0,
853
+ "count": 100
854
  },
855
  "UB": {
856
+ "accuracy": 0.145748987854251,
857
+ "count": 247
858
  },
859
  "UD": {
860
  "accuracy": 0.0,
861
+ "count": 153
862
  }
863
  }
864
  },
865
  "sub_B5": {
866
  "full_accuracy": 0.0,
867
+ "digit_accuracy": 0.19,
868
+ "n_examples": 100,
869
  "per_subtask": {
870
  "MD": {
871
  "accuracy": 1.0,
872
+ "count": 100
873
  },
874
  "MB": {
875
  "accuracy": 0.0,
876
+ "count": 100
877
  },
878
  "UB": {
879
+ "accuracy": 0.11073825503355705,
880
+ "count": 298
881
  },
882
  "UD": {
883
  "accuracy": 0.0,
884
+ "count": 202
885
  }
886
  }
887
  }
888
  },
889
  "summary": {
890
  "overall_accuracy": 0.0,
891
+ "digit_accuracy": 0.29516483516483516,
892
+ "total_examples": 2600,
893
  "n_splits": 24
894
  }
895
  }
add_sub_baseline_10K_2L1H128d/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7ed798fb07cbccdc55dcd6c1c4e872d058f4b5c532f1cd70f6a60ef685417772
3
  size 157692826
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a71d76648b9a847738fd64980f8c9b4abe52d0c748af9c58ccaa59d116373e7
3
  size 157692826
add_sub_baseline_10K_2L1H128d/train_config.json CHANGED
@@ -69,15 +69,19 @@
69
  "no_wandb": false,
70
  "n_params": 39346560,
71
  "run_name": "add_sub_baseline_10K_2L1H128d",
72
- "git_commit": "17e935f460a7f9595b705c1d614101a6b0e520f7",
73
- "timestamp": "2026-04-14T06:22:14.025912+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
 
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "sft",
79
- "wandb_run_id": "lhp7f41c",
80
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/lhp7f41c",
 
 
 
81
  "final_accuracy": 0.0,
82
  "sft_accuracy": 0.0,
83
  "eval_method": "ArithmeticEvaluator"
 
69
  "no_wandb": false,
70
  "n_params": 39346560,
71
  "run_name": "add_sub_baseline_10K_2L1H128d",
72
+ "git_commit": "f835493c19eb98267697007042c9d440cad2afbb",
73
+ "timestamp": "2026-04-15T11:36:11.554711+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
+ "train_dataset": "fixed_train/train_10K_seed42.pt",
78
  "model_repo": "thoughtworks/arithmetic-sorl",
79
  "trainer_version": "sft",
80
+ "wandb_run_id": "zshwdb6w",
81
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/zshwdb6w",
82
+ "eval_final_dataset": "eval_sets/eval_add_sub_6d_N100_seed42.json",
83
+ "eval_epoch_dataset": "eval_sets/eval_add_sub_6d_N25_seed42.json",
84
+ "eval_hf_repo": "thoughtworks/arithmetic-sorl-data",
85
  "final_accuracy": 0.0,
86
  "sft_accuracy": 0.0,
87
  "eval_method": "ArithmeticEvaluator"