amirali1985 committed on
Commit
8940757
·
verified ·
1 Parent(s): b77d387

Upload add_sub_sorl_v6_abs30_K1_25K

Browse files
add_sub_sorl_v6_abs30_K1_25K/metrics.json CHANGED
@@ -1717,6 +1717,7 @@
1717
  "splits": {
1718
  "add_S0": {
1719
  "full_accuracy": 0.0,
 
1720
  "n_examples": 100,
1721
  "per_subtask": {
1722
  "SA": {
@@ -1731,6 +1732,7 @@
1731
  },
1732
  "add_S1": {
1733
  "full_accuracy": 0.0,
 
1734
  "n_examples": 100,
1735
  "per_subtask": {
1736
  "SA": {
@@ -1753,6 +1755,7 @@
1753
  },
1754
  "add_S2": {
1755
  "full_accuracy": 0.0,
 
1756
  "n_examples": 100,
1757
  "per_subtask": {
1758
  "SA": {
@@ -1779,6 +1782,7 @@
1779
  },
1780
  "add_S3": {
1781
  "full_accuracy": 0.0,
 
1782
  "n_examples": 100,
1783
  "per_subtask": {
1784
  "SA": {
@@ -1805,6 +1809,7 @@
1805
  },
1806
  "add_S4": {
1807
  "full_accuracy": 0.0,
 
1808
  "n_examples": 100,
1809
  "per_subtask": {
1810
  "SA": {
@@ -1831,6 +1836,7 @@
1831
  },
1832
  "add_S5": {
1833
  "full_accuracy": 0.0,
 
1834
  "n_examples": 100,
1835
  "per_subtask": {
1836
  "SA": {
@@ -1853,6 +1859,7 @@
1853
  },
1854
  "add_S6": {
1855
  "full_accuracy": 0.0,
 
1856
  "n_examples": 100,
1857
  "per_subtask": {
1858
  "SC": {
@@ -1871,6 +1878,7 @@
1871
  },
1872
  "add_random": {
1873
  "full_accuracy": 0.0,
 
1874
  "n_examples": 200,
1875
  "per_subtask": {
1876
  "SA": {
@@ -1897,6 +1905,7 @@
1897
  },
1898
  "add_C3": {
1899
  "full_accuracy": 0.0,
 
1900
  "n_examples": 100,
1901
  "per_subtask": {
1902
  "SA": {
@@ -1919,6 +1928,7 @@
1919
  },
1920
  "add_C4": {
1921
  "full_accuracy": 0.0,
 
1922
  "n_examples": 100,
1923
  "per_subtask": {
1924
  "SA": {
@@ -1941,6 +1951,7 @@
1941
  },
1942
  "add_C5": {
1943
  "full_accuracy": 0.0,
 
1944
  "n_examples": 100,
1945
  "per_subtask": {
1946
  "SA": {
@@ -1963,6 +1974,7 @@
1963
  },
1964
  "add_C6": {
1965
  "full_accuracy": 0.0,
 
1966
  "n_examples": 100,
1967
  "per_subtask": {
1968
  "SC": {
@@ -1981,6 +1993,7 @@
1981
  },
1982
  "sub_M0": {
1983
  "full_accuracy": 0.0,
 
1984
  "n_examples": 100,
1985
  "per_subtask": {
1986
  "MD": {
@@ -1995,6 +2008,7 @@
1995
  },
1996
  "sub_M1": {
1997
  "full_accuracy": 0.0,
 
1998
  "n_examples": 100,
1999
  "per_subtask": {
2000
  "MD": {
@@ -2017,6 +2031,7 @@
2017
  },
2018
  "sub_M2": {
2019
  "full_accuracy": 0.0,
 
2020
  "n_examples": 100,
2021
  "per_subtask": {
2022
  "MD": {
@@ -2043,6 +2058,7 @@
2043
  },
2044
  "sub_M3": {
2045
  "full_accuracy": 0.0,
 
2046
  "n_examples": 100,
2047
  "per_subtask": {
2048
  "MD": {
@@ -2069,6 +2085,7 @@
2069
  },
2070
  "sub_M4": {
2071
  "full_accuracy": 0.0,
 
2072
  "n_examples": 100,
2073
  "per_subtask": {
2074
  "MD": {
@@ -2091,6 +2108,7 @@
2091
  },
2092
  "sub_M5": {
2093
  "full_accuracy": 0.0,
 
2094
  "n_examples": 100,
2095
  "per_subtask": {
2096
  "MD": {
@@ -2113,6 +2131,7 @@
2113
  },
2114
  "sub_random": {
2115
  "full_accuracy": 0.0,
 
2116
  "n_examples": 200,
2117
  "per_subtask": {
2118
  "MD": {
@@ -2139,6 +2158,7 @@
2139
  },
2140
  "sub_B3": {
2141
  "full_accuracy": 0.0,
 
2142
  "n_examples": 100,
2143
  "per_subtask": {
2144
  "MD": {
@@ -2161,6 +2181,7 @@
2161
  },
2162
  "sub_B4": {
2163
  "full_accuracy": 0.0,
 
2164
  "n_examples": 100,
2165
  "per_subtask": {
2166
  "MD": {
@@ -2183,6 +2204,7 @@
2183
  },
2184
  "sub_B5": {
2185
  "full_accuracy": 0.0,
 
2186
  "n_examples": 100,
2187
  "per_subtask": {
2188
  "MD": {
@@ -2206,6 +2228,7 @@
2206
  },
2207
  "summary": {
2208
  "overall_accuracy": 0.0,
 
2209
  "total_examples": 2400,
2210
  "n_splits": 22
2211
  }
@@ -2221,6 +2244,7 @@
2221
  "splits": {
2222
  "add_S0": {
2223
  "full_accuracy": 0.99,
 
2224
  "n_examples": 100,
2225
  "per_subtask": {
2226
  "SA": {
@@ -2235,6 +2259,7 @@
2235
  },
2236
  "add_S1": {
2237
  "full_accuracy": 0.98,
 
2238
  "n_examples": 100,
2239
  "per_subtask": {
2240
  "SA": {
@@ -2257,6 +2282,7 @@
2257
  },
2258
  "add_S2": {
2259
  "full_accuracy": 0.96,
 
2260
  "n_examples": 100,
2261
  "per_subtask": {
2262
  "SA": {
@@ -2283,6 +2309,7 @@
2283
  },
2284
  "add_S3": {
2285
  "full_accuracy": 0.86,
 
2286
  "n_examples": 100,
2287
  "per_subtask": {
2288
  "SA": {
@@ -2309,6 +2336,7 @@
2309
  },
2310
  "add_S4": {
2311
  "full_accuracy": 0.68,
 
2312
  "n_examples": 100,
2313
  "per_subtask": {
2314
  "SA": {
@@ -2335,6 +2363,7 @@
2335
  },
2336
  "add_S5": {
2337
  "full_accuracy": 0.4,
 
2338
  "n_examples": 100,
2339
  "per_subtask": {
2340
  "SA": {
@@ -2357,6 +2386,7 @@
2357
  },
2358
  "add_S6": {
2359
  "full_accuracy": 0.61,
 
2360
  "n_examples": 100,
2361
  "per_subtask": {
2362
  "SC": {
@@ -2375,6 +2405,7 @@
2375
  },
2376
  "add_random": {
2377
  "full_accuracy": 0.97,
 
2378
  "n_examples": 200,
2379
  "per_subtask": {
2380
  "SA": {
@@ -2401,6 +2432,7 @@
2401
  },
2402
  "add_C3": {
2403
  "full_accuracy": 0.86,
 
2404
  "n_examples": 100,
2405
  "per_subtask": {
2406
  "SA": {
@@ -2423,6 +2455,7 @@
2423
  },
2424
  "add_C4": {
2425
  "full_accuracy": 0.85,
 
2426
  "n_examples": 100,
2427
  "per_subtask": {
2428
  "SA": {
@@ -2445,6 +2478,7 @@
2445
  },
2446
  "add_C5": {
2447
  "full_accuracy": 0.85,
 
2448
  "n_examples": 100,
2449
  "per_subtask": {
2450
  "SA": {
@@ -2467,6 +2501,7 @@
2467
  },
2468
  "add_C6": {
2469
  "full_accuracy": 0.9,
 
2470
  "n_examples": 100,
2471
  "per_subtask": {
2472
  "SC": {
@@ -2485,6 +2520,7 @@
2485
  },
2486
  "sub_M0": {
2487
  "full_accuracy": 1.0,
 
2488
  "n_examples": 100,
2489
  "per_subtask": {
2490
  "MD": {
@@ -2499,6 +2535,7 @@
2499
  },
2500
  "sub_M1": {
2501
  "full_accuracy": 1.0,
 
2502
  "n_examples": 100,
2503
  "per_subtask": {
2504
  "MD": {
@@ -2521,6 +2558,7 @@
2521
  },
2522
  "sub_M2": {
2523
  "full_accuracy": 1.0,
 
2524
  "n_examples": 100,
2525
  "per_subtask": {
2526
  "MD": {
@@ -2547,6 +2585,7 @@
2547
  },
2548
  "sub_M3": {
2549
  "full_accuracy": 0.76,
 
2550
  "n_examples": 100,
2551
  "per_subtask": {
2552
  "MD": {
@@ -2573,6 +2612,7 @@
2573
  },
2574
  "sub_M4": {
2575
  "full_accuracy": 0.43,
 
2576
  "n_examples": 100,
2577
  "per_subtask": {
2578
  "MD": {
@@ -2595,6 +2635,7 @@
2595
  },
2596
  "sub_M5": {
2597
  "full_accuracy": 0.07,
 
2598
  "n_examples": 100,
2599
  "per_subtask": {
2600
  "MD": {
@@ -2617,6 +2658,7 @@
2617
  },
2618
  "sub_random": {
2619
  "full_accuracy": 0.985,
 
2620
  "n_examples": 200,
2621
  "per_subtask": {
2622
  "MD": {
@@ -2643,6 +2685,7 @@
2643
  },
2644
  "sub_B3": {
2645
  "full_accuracy": 0.85,
 
2646
  "n_examples": 100,
2647
  "per_subtask": {
2648
  "MD": {
@@ -2665,6 +2708,7 @@
2665
  },
2666
  "sub_B4": {
2667
  "full_accuracy": 0.8,
 
2668
  "n_examples": 100,
2669
  "per_subtask": {
2670
  "MD": {
@@ -2687,6 +2731,7 @@
2687
  },
2688
  "sub_B5": {
2689
  "full_accuracy": 0.8,
 
2690
  "n_examples": 100,
2691
  "per_subtask": {
2692
  "MD": {
@@ -2710,6 +2755,7 @@
2710
  },
2711
  "summary": {
2712
  "overall_accuracy": 0.815,
 
2713
  "total_examples": 2400,
2714
  "n_splits": 22
2715
  }
 
1717
  "splits": {
1718
  "add_S0": {
1719
  "full_accuracy": 0.0,
1720
+ "digit_accuracy": 0.1442857142857143,
1721
  "n_examples": 100,
1722
  "per_subtask": {
1723
  "SA": {
 
1732
  },
1733
  "add_S1": {
1734
  "full_accuracy": 0.0,
1735
+ "digit_accuracy": 0.11571428571428571,
1736
  "n_examples": 100,
1737
  "per_subtask": {
1738
  "SA": {
 
1755
  },
1756
  "add_S2": {
1757
  "full_accuracy": 0.0,
1758
+ "digit_accuracy": 0.12142857142857143,
1759
  "n_examples": 100,
1760
  "per_subtask": {
1761
  "SA": {
 
1782
  },
1783
  "add_S3": {
1784
  "full_accuracy": 0.0,
1785
+ "digit_accuracy": 0.10428571428571429,
1786
  "n_examples": 100,
1787
  "per_subtask": {
1788
  "SA": {
 
1809
  },
1810
  "add_S4": {
1811
  "full_accuracy": 0.0,
1812
+ "digit_accuracy": 0.13142857142857142,
1813
  "n_examples": 100,
1814
  "per_subtask": {
1815
  "SA": {
 
1836
  },
1837
  "add_S5": {
1838
  "full_accuracy": 0.0,
1839
+ "digit_accuracy": 0.09857142857142857,
1840
  "n_examples": 100,
1841
  "per_subtask": {
1842
  "SA": {
 
1859
  },
1860
  "add_S6": {
1861
  "full_accuracy": 0.0,
1862
+ "digit_accuracy": 0.11571428571428571,
1863
  "n_examples": 100,
1864
  "per_subtask": {
1865
  "SC": {
 
1878
  },
1879
  "add_random": {
1880
  "full_accuracy": 0.0,
1881
+ "digit_accuracy": 0.13214285714285715,
1882
  "n_examples": 200,
1883
  "per_subtask": {
1884
  "SA": {
 
1905
  },
1906
  "add_C3": {
1907
  "full_accuracy": 0.0,
1908
+ "digit_accuracy": 0.15142857142857144,
1909
  "n_examples": 100,
1910
  "per_subtask": {
1911
  "SA": {
 
1928
  },
1929
  "add_C4": {
1930
  "full_accuracy": 0.0,
1931
+ "digit_accuracy": 0.15285714285714286,
1932
  "n_examples": 100,
1933
  "per_subtask": {
1934
  "SA": {
 
1951
  },
1952
  "add_C5": {
1953
  "full_accuracy": 0.0,
1954
+ "digit_accuracy": 0.11571428571428571,
1955
  "n_examples": 100,
1956
  "per_subtask": {
1957
  "SA": {
 
1974
  },
1975
  "add_C6": {
1976
  "full_accuracy": 0.0,
1977
+ "digit_accuracy": 0.11285714285714285,
1978
  "n_examples": 100,
1979
  "per_subtask": {
1980
  "SC": {
 
1993
  },
1994
  "sub_M0": {
1995
  "full_accuracy": 0.0,
1996
+ "digit_accuracy": 0.12857142857142856,
1997
  "n_examples": 100,
1998
  "per_subtask": {
1999
  "MD": {
 
2008
  },
2009
  "sub_M1": {
2010
  "full_accuracy": 0.0,
2011
+ "digit_accuracy": 0.13142857142857142,
2012
  "n_examples": 100,
2013
  "per_subtask": {
2014
  "MD": {
 
2031
  },
2032
  "sub_M2": {
2033
  "full_accuracy": 0.0,
2034
+ "digit_accuracy": 0.12,
2035
  "n_examples": 100,
2036
  "per_subtask": {
2037
  "MD": {
 
2058
  },
2059
  "sub_M3": {
2060
  "full_accuracy": 0.0,
2061
+ "digit_accuracy": 0.13714285714285715,
2062
  "n_examples": 100,
2063
  "per_subtask": {
2064
  "MD": {
 
2085
  },
2086
  "sub_M4": {
2087
  "full_accuracy": 0.0,
2088
+ "digit_accuracy": 0.12571428571428572,
2089
  "n_examples": 100,
2090
  "per_subtask": {
2091
  "MD": {
 
2108
  },
2109
  "sub_M5": {
2110
  "full_accuracy": 0.0,
2111
+ "digit_accuracy": 0.14142857142857143,
2112
  "n_examples": 100,
2113
  "per_subtask": {
2114
  "MD": {
 
2131
  },
2132
  "sub_random": {
2133
  "full_accuracy": 0.0,
2134
+ "digit_accuracy": 0.13714285714285715,
2135
  "n_examples": 200,
2136
  "per_subtask": {
2137
  "MD": {
 
2158
  },
2159
  "sub_B3": {
2160
  "full_accuracy": 0.0,
2161
+ "digit_accuracy": 0.14,
2162
  "n_examples": 100,
2163
  "per_subtask": {
2164
  "MD": {
 
2181
  },
2182
  "sub_B4": {
2183
  "full_accuracy": 0.0,
2184
+ "digit_accuracy": 0.13714285714285715,
2185
  "n_examples": 100,
2186
  "per_subtask": {
2187
  "MD": {
 
2204
  },
2205
  "sub_B5": {
2206
  "full_accuracy": 0.0,
2207
+ "digit_accuracy": 0.16428571428571428,
2208
  "n_examples": 100,
2209
  "per_subtask": {
2210
  "MD": {
 
2228
  },
2229
  "summary": {
2230
  "overall_accuracy": 0.0,
2231
+ "digit_accuracy": 0.13023809523809524,
2232
  "total_examples": 2400,
2233
  "n_splits": 22
2234
  }
 
2244
  "splits": {
2245
  "add_S0": {
2246
  "full_accuracy": 0.99,
2247
+ "digit_accuracy": 0.9985714285714286,
2248
  "n_examples": 100,
2249
  "per_subtask": {
2250
  "SA": {
 
2259
  },
2260
  "add_S1": {
2261
  "full_accuracy": 0.98,
2262
+ "digit_accuracy": 0.9971428571428571,
2263
  "n_examples": 100,
2264
  "per_subtask": {
2265
  "SA": {
 
2282
  },
2283
  "add_S2": {
2284
  "full_accuracy": 0.96,
2285
+ "digit_accuracy": 0.9942857142857143,
2286
  "n_examples": 100,
2287
  "per_subtask": {
2288
  "SA": {
 
2309
  },
2310
  "add_S3": {
2311
  "full_accuracy": 0.86,
2312
+ "digit_accuracy": 0.9757142857142858,
2313
  "n_examples": 100,
2314
  "per_subtask": {
2315
  "SA": {
 
2336
  },
2337
  "add_S4": {
2338
  "full_accuracy": 0.68,
2339
+ "digit_accuracy": 0.9328571428571428,
2340
  "n_examples": 100,
2341
  "per_subtask": {
2342
  "SA": {
 
2363
  },
2364
  "add_S5": {
2365
  "full_accuracy": 0.4,
2366
+ "digit_accuracy": 0.8,
2367
  "n_examples": 100,
2368
  "per_subtask": {
2369
  "SA": {
 
2386
  },
2387
  "add_S6": {
2388
  "full_accuracy": 0.61,
2389
+ "digit_accuracy": 0.8271428571428572,
2390
  "n_examples": 100,
2391
  "per_subtask": {
2392
  "SC": {
 
2405
  },
2406
  "add_random": {
2407
  "full_accuracy": 0.97,
2408
+ "digit_accuracy": 0.9957142857142857,
2409
  "n_examples": 200,
2410
  "per_subtask": {
2411
  "SA": {
 
2432
  },
2433
  "add_C3": {
2434
  "full_accuracy": 0.86,
2435
+ "digit_accuracy": 0.9742857142857143,
2436
  "n_examples": 100,
2437
  "per_subtask": {
2438
  "SA": {
 
2455
  },
2456
  "add_C4": {
2457
  "full_accuracy": 0.85,
2458
+ "digit_accuracy": 0.9714285714285714,
2459
  "n_examples": 100,
2460
  "per_subtask": {
2461
  "SA": {
 
2478
  },
2479
  "add_C5": {
2480
  "full_accuracy": 0.85,
2481
+ "digit_accuracy": 0.9685714285714285,
2482
  "n_examples": 100,
2483
  "per_subtask": {
2484
  "SA": {
 
2501
  },
2502
  "add_C6": {
2503
  "full_accuracy": 0.9,
2504
+ "digit_accuracy": 0.9757142857142858,
2505
  "n_examples": 100,
2506
  "per_subtask": {
2507
  "SC": {
 
2520
  },
2521
  "sub_M0": {
2522
  "full_accuracy": 1.0,
2523
+ "digit_accuracy": 1.0,
2524
  "n_examples": 100,
2525
  "per_subtask": {
2526
  "MD": {
 
2535
  },
2536
  "sub_M1": {
2537
  "full_accuracy": 1.0,
2538
+ "digit_accuracy": 1.0,
2539
  "n_examples": 100,
2540
  "per_subtask": {
2541
  "MD": {
 
2558
  },
2559
  "sub_M2": {
2560
  "full_accuracy": 1.0,
2561
+ "digit_accuracy": 1.0,
2562
  "n_examples": 100,
2563
  "per_subtask": {
2564
  "MD": {
 
2585
  },
2586
  "sub_M3": {
2587
  "full_accuracy": 0.76,
2588
+ "digit_accuracy": 0.9614285714285714,
2589
  "n_examples": 100,
2590
  "per_subtask": {
2591
  "MD": {
 
2612
  },
2613
  "sub_M4": {
2614
  "full_accuracy": 0.43,
2615
+ "digit_accuracy": 0.87,
2616
  "n_examples": 100,
2617
  "per_subtask": {
2618
  "MD": {
 
2635
  },
2636
  "sub_M5": {
2637
  "full_accuracy": 0.07,
2638
+ "digit_accuracy": 0.7042857142857143,
2639
  "n_examples": 100,
2640
  "per_subtask": {
2641
  "MD": {
 
2658
  },
2659
  "sub_random": {
2660
  "full_accuracy": 0.985,
2661
+ "digit_accuracy": 0.9971428571428571,
2662
  "n_examples": 200,
2663
  "per_subtask": {
2664
  "MD": {
 
2685
  },
2686
  "sub_B3": {
2687
  "full_accuracy": 0.85,
2688
+ "digit_accuracy": 0.9785714285714285,
2689
  "n_examples": 100,
2690
  "per_subtask": {
2691
  "MD": {
 
2708
  },
2709
  "sub_B4": {
2710
  "full_accuracy": 0.8,
2711
+ "digit_accuracy": 0.9657142857142857,
2712
  "n_examples": 100,
2713
  "per_subtask": {
2714
  "MD": {
 
2731
  },
2732
  "sub_B5": {
2733
  "full_accuracy": 0.8,
2734
+ "digit_accuracy": 0.9614285714285714,
2735
  "n_examples": 100,
2736
  "per_subtask": {
2737
  "MD": {
 
2755
  },
2756
  "summary": {
2757
  "overall_accuracy": 0.815,
2758
+ "digit_accuracy": 0.9516666666666667,
2759
  "total_examples": 2400,
2760
  "n_splits": 22
2761
  }
add_sub_sorl_v6_abs30_K1_25K/train_config.json CHANGED
@@ -69,15 +69,15 @@
69
  "no_wandb": false,
70
  "n_params": 162519662,
71
  "run_name": "add_sub_sorl_v6_abs30_K1_25K",
72
- "git_commit": "17e935f460a7f9595b705c1d614101a6b0e520f7",
73
- "timestamp": "2026-04-14T10:04:30.519578+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "v6",
79
- "wandb_run_id": "0l1r06x6",
80
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/0l1r06x6",
81
  "final_accuracy": 0.815,
82
  "sft_accuracy": 0.0,
83
  "eval_method": "ArithmeticEvaluator"
 
69
  "no_wandb": false,
70
  "n_params": 162519662,
71
  "run_name": "add_sub_sorl_v6_abs30_K1_25K",
72
+ "git_commit": "ec82dee57b1073e52f06ffb0efedc7bce16fff21",
73
+ "timestamp": "2026-04-14T10:38:56.263883+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "v6",
79
+ "wandb_run_id": "55qc1hsd",
80
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/55qc1hsd",
81
  "final_accuracy": 0.815,
82
  "sft_accuracy": 0.0,
83
  "eval_method": "ArithmeticEvaluator"