| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.2, |
| "eval_steps": 500, |
| "global_step": 2000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "grad_norm": 0.8453039526939392, |
| "learning_rate": 3.6e-06, |
| "loss": 1.2287, |
| "step": 10 |
| }, |
| { |
| "grad_norm": 0.22706136107444763, |
| "learning_rate": 7.6e-06, |
| "loss": 1.2158, |
| "step": 20 |
| }, |
| { |
| "grad_norm": 0.22833316028118134, |
| "learning_rate": 1.16e-05, |
| "loss": 1.2064, |
| "step": 30 |
| }, |
| { |
| "grad_norm": 0.2991141974925995, |
| "learning_rate": 1.56e-05, |
| "loss": 1.1477, |
| "step": 40 |
| }, |
| { |
| "grad_norm": 0.2874332666397095, |
| "learning_rate": 1.9600000000000002e-05, |
| "loss": 1.1045, |
| "step": 50 |
| }, |
| { |
| "grad_norm": 0.2624056935310364, |
| "learning_rate": 2.36e-05, |
| "loss": 1.0811, |
| "step": 60 |
| }, |
| { |
| "grad_norm": 0.2788711190223694, |
| "learning_rate": 2.7600000000000003e-05, |
| "loss": 1.057, |
| "step": 70 |
| }, |
| { |
| "grad_norm": 1.4561371803283691, |
| "learning_rate": 3.16e-05, |
| "loss": 0.9902, |
| "step": 80 |
| }, |
| { |
| "grad_norm": 0.8279716372489929, |
| "learning_rate": 3.56e-05, |
| "loss": 0.9047, |
| "step": 90 |
| }, |
| { |
| "grad_norm": 1.0444140434265137, |
| "learning_rate": 3.960000000000001e-05, |
| "loss": 0.8148, |
| "step": 100 |
| }, |
| { |
| "grad_norm": 1.9998027086257935, |
| "learning_rate": 4.36e-05, |
| "loss": 0.6477, |
| "step": 110 |
| }, |
| { |
| "grad_norm": 2.0924854278564453, |
| "learning_rate": 4.76e-05, |
| "loss": 0.4322, |
| "step": 120 |
| }, |
| { |
| "grad_norm": 1.3061543703079224, |
| "learning_rate": 5.16e-05, |
| "loss": 0.2847, |
| "step": 130 |
| }, |
| { |
| "grad_norm": 1.3199659585952759, |
| "learning_rate": 5.560000000000001e-05, |
| "loss": 0.2087, |
| "step": 140 |
| }, |
| { |
| "grad_norm": 0.7965296506881714, |
| "learning_rate": 5.96e-05, |
| "loss": 0.1827, |
| "step": 150 |
| }, |
| { |
| "grad_norm": 0.9607883095741272, |
| "learning_rate": 6.36e-05, |
| "loss": 0.1609, |
| "step": 160 |
| }, |
| { |
| "grad_norm": 0.8130837082862854, |
| "learning_rate": 6.76e-05, |
| "loss": 0.1486, |
| "step": 170 |
| }, |
| { |
| "grad_norm": 1.5720325708389282, |
| "learning_rate": 7.16e-05, |
| "loss": 0.1425, |
| "step": 180 |
| }, |
| { |
| "grad_norm": 1.010038137435913, |
| "learning_rate": 7.560000000000001e-05, |
| "loss": 0.1395, |
| "step": 190 |
| }, |
| { |
| "grad_norm": 1.1851860284805298, |
| "learning_rate": 7.960000000000001e-05, |
| "loss": 0.1346, |
| "step": 200 |
| }, |
| { |
| "grad_norm": 0.951931893825531, |
| "learning_rate": 8.36e-05, |
| "loss": 0.1284, |
| "step": 210 |
| }, |
| { |
| "grad_norm": 0.8772835731506348, |
| "learning_rate": 8.76e-05, |
| "loss": 0.1156, |
| "step": 220 |
| }, |
| { |
| "grad_norm": 0.7618266344070435, |
| "learning_rate": 9.16e-05, |
| "loss": 0.1112, |
| "step": 230 |
| }, |
| { |
| "grad_norm": 0.7764372229576111, |
| "learning_rate": 9.56e-05, |
| "loss": 0.112, |
| "step": 240 |
| }, |
| { |
| "grad_norm": 0.6424781680107117, |
| "learning_rate": 9.960000000000001e-05, |
| "loss": 0.1063, |
| "step": 250 |
| }, |
| { |
| "grad_norm": 0.6300824284553528, |
| "learning_rate": 0.00010360000000000001, |
| "loss": 0.106, |
| "step": 260 |
| }, |
| { |
| "grad_norm": 0.7829186916351318, |
| "learning_rate": 0.00010760000000000001, |
| "loss": 0.1049, |
| "step": 270 |
| }, |
| { |
| "grad_norm": 0.6146113276481628, |
| "learning_rate": 0.00011160000000000002, |
| "loss": 0.0996, |
| "step": 280 |
| }, |
| { |
| "grad_norm": 0.9386025071144104, |
| "learning_rate": 0.00011559999999999999, |
| "loss": 0.0994, |
| "step": 290 |
| }, |
| { |
| "grad_norm": 0.7355742454528809, |
| "learning_rate": 0.00011960000000000001, |
| "loss": 0.1023, |
| "step": 300 |
| }, |
| { |
| "grad_norm": 0.7597235441207886, |
| "learning_rate": 0.0001236, |
| "loss": 0.1022, |
| "step": 310 |
| }, |
| { |
| "grad_norm": 0.7152740955352783, |
| "learning_rate": 0.0001276, |
| "loss": 0.0997, |
| "step": 320 |
| }, |
| { |
| "grad_norm": 0.6420713067054749, |
| "learning_rate": 0.0001316, |
| "loss": 0.0942, |
| "step": 330 |
| }, |
| { |
| "grad_norm": 0.7762007117271423, |
| "learning_rate": 0.00013560000000000002, |
| "loss": 0.0961, |
| "step": 340 |
| }, |
| { |
| "grad_norm": 0.8098063468933105, |
| "learning_rate": 0.0001396, |
| "loss": 0.0972, |
| "step": 350 |
| }, |
| { |
| "grad_norm": 0.47828200459480286, |
| "learning_rate": 0.0001436, |
| "loss": 0.0972, |
| "step": 360 |
| }, |
| { |
| "grad_norm": 0.6147670149803162, |
| "learning_rate": 0.0001476, |
| "loss": 0.0921, |
| "step": 370 |
| }, |
| { |
| "grad_norm": 0.630636990070343, |
| "learning_rate": 0.0001516, |
| "loss": 0.0932, |
| "step": 380 |
| }, |
| { |
| "grad_norm": 0.7780483961105347, |
| "learning_rate": 0.00015560000000000001, |
| "loss": 0.0903, |
| "step": 390 |
| }, |
| { |
| "grad_norm": 0.549488365650177, |
| "learning_rate": 0.0001596, |
| "loss": 0.0911, |
| "step": 400 |
| }, |
| { |
| "grad_norm": 0.537593424320221, |
| "learning_rate": 0.0001636, |
| "loss": 0.0923, |
| "step": 410 |
| }, |
| { |
| "grad_norm": 0.35757097601890564, |
| "learning_rate": 0.0001676, |
| "loss": 0.0901, |
| "step": 420 |
| }, |
| { |
| "grad_norm": 0.5301028490066528, |
| "learning_rate": 0.0001716, |
| "loss": 0.0873, |
| "step": 430 |
| }, |
| { |
| "grad_norm": 0.5196937322616577, |
| "learning_rate": 0.0001756, |
| "loss": 0.0868, |
| "step": 440 |
| }, |
| { |
| "grad_norm": 0.5398963093757629, |
| "learning_rate": 0.0001796, |
| "loss": 0.0872, |
| "step": 450 |
| }, |
| { |
| "grad_norm": 0.39748314023017883, |
| "learning_rate": 0.00018360000000000002, |
| "loss": 0.0823, |
| "step": 460 |
| }, |
| { |
| "grad_norm": 0.47773298621177673, |
| "learning_rate": 0.0001876, |
| "loss": 0.0847, |
| "step": 470 |
| }, |
| { |
| "grad_norm": 0.6321383714675903, |
| "learning_rate": 0.0001916, |
| "loss": 0.0846, |
| "step": 480 |
| }, |
| { |
| "grad_norm": 0.43730923533439636, |
| "learning_rate": 0.0001956, |
| "loss": 0.0845, |
| "step": 490 |
| }, |
| { |
| "grad_norm": 0.3578919470310211, |
| "learning_rate": 0.0001996, |
| "loss": 0.0842, |
| "step": 500 |
| }, |
| { |
| "grad_norm": 0.5095338821411133, |
| "learning_rate": 0.000199999557098412, |
| "loss": 0.0819, |
| "step": 510 |
| }, |
| { |
| "grad_norm": 0.5493738651275635, |
| "learning_rate": 0.0001999980260856137, |
| "loss": 0.08, |
| "step": 520 |
| }, |
| { |
| "grad_norm": 0.42955926060676575, |
| "learning_rate": 0.00019999540151042328, |
| "loss": 0.0803, |
| "step": 530 |
| }, |
| { |
| "grad_norm": 0.42419669032096863, |
| "learning_rate": 0.0001999916834015426, |
| "loss": 0.0768, |
| "step": 540 |
| }, |
| { |
| "grad_norm": 0.3114933967590332, |
| "learning_rate": 0.0001999868717996323, |
| "loss": 0.0746, |
| "step": 550 |
| }, |
| { |
| "grad_norm": 0.4427943825721741, |
| "learning_rate": 0.00019998096675731135, |
| "loss": 0.0729, |
| "step": 560 |
| }, |
| { |
| "grad_norm": 0.2511034309864044, |
| "learning_rate": 0.0001999739683391563, |
| "loss": 0.0728, |
| "step": 570 |
| }, |
| { |
| "grad_norm": 0.30617302656173706, |
| "learning_rate": 0.00019996587662170074, |
| "loss": 0.0762, |
| "step": 580 |
| }, |
| { |
| "grad_norm": 0.3502658009529114, |
| "learning_rate": 0.00019995669169343438, |
| "loss": 0.0751, |
| "step": 590 |
| }, |
| { |
| "grad_norm": 0.3593088984489441, |
| "learning_rate": 0.00019994641365480214, |
| "loss": 0.0724, |
| "step": 600 |
| }, |
| { |
| "grad_norm": 0.3959539234638214, |
| "learning_rate": 0.00019993504261820298, |
| "loss": 0.0689, |
| "step": 610 |
| }, |
| { |
| "grad_norm": 0.40369659662246704, |
| "learning_rate": 0.0001999225787079888, |
| "loss": 0.0717, |
| "step": 620 |
| }, |
| { |
| "grad_norm": 0.25377678871154785, |
| "learning_rate": 0.00019990902206046287, |
| "loss": 0.07, |
| "step": 630 |
| }, |
| { |
| "grad_norm": 0.453866571187973, |
| "learning_rate": 0.00019989437282387856, |
| "loss": 0.0685, |
| "step": 640 |
| }, |
| { |
| "grad_norm": 0.4460633099079132, |
| "learning_rate": 0.00019987863115843748, |
| "loss": 0.0704, |
| "step": 650 |
| }, |
| { |
| "grad_norm": 0.35564717650413513, |
| "learning_rate": 0.00019986179723628804, |
| "loss": 0.0708, |
| "step": 660 |
| }, |
| { |
| "grad_norm": 0.256737619638443, |
| "learning_rate": 0.00019984387124152332, |
| "loss": 0.0682, |
| "step": 670 |
| }, |
| { |
| "grad_norm": 0.3440409302711487, |
| "learning_rate": 0.00019982485337017908, |
| "loss": 0.0672, |
| "step": 680 |
| }, |
| { |
| "grad_norm": 0.285584419965744, |
| "learning_rate": 0.00019980474383023174, |
| "loss": 0.0658, |
| "step": 690 |
| }, |
| { |
| "grad_norm": 0.2572714388370514, |
| "learning_rate": 0.00019978354284159604, |
| "loss": 0.065, |
| "step": 700 |
| }, |
| { |
| "grad_norm": 0.234798863530159, |
| "learning_rate": 0.00019976125063612252, |
| "loss": 0.0638, |
| "step": 710 |
| }, |
| { |
| "grad_norm": 0.30621594190597534, |
| "learning_rate": 0.00019973786745759525, |
| "loss": 0.0655, |
| "step": 720 |
| }, |
| { |
| "grad_norm": 0.3442394733428955, |
| "learning_rate": 0.00019971339356172885, |
| "loss": 0.0639, |
| "step": 730 |
| }, |
| { |
| "grad_norm": 0.3672217130661011, |
| "learning_rate": 0.00019968782921616596, |
| "loss": 0.0649, |
| "step": 740 |
| }, |
| { |
| "grad_norm": 0.2763133943080902, |
| "learning_rate": 0.00019966117470047418, |
| "loss": 0.0624, |
| "step": 750 |
| }, |
| { |
| "grad_norm": 0.3460654616355896, |
| "learning_rate": 0.000199633430306143, |
| "loss": 0.0614, |
| "step": 760 |
| }, |
| { |
| "grad_norm": 0.3609948456287384, |
| "learning_rate": 0.00019960459633658068, |
| "loss": 0.0612, |
| "step": 770 |
| }, |
| { |
| "grad_norm": 0.3908577263355255, |
| "learning_rate": 0.00019957467310711087, |
| "loss": 0.0596, |
| "step": 780 |
| }, |
| { |
| "grad_norm": 0.25239288806915283, |
| "learning_rate": 0.00019954366094496927, |
| "loss": 0.0612, |
| "step": 790 |
| }, |
| { |
| "grad_norm": 0.1670553982257843, |
| "learning_rate": 0.00019951156018929985, |
| "loss": 0.0645, |
| "step": 800 |
| }, |
| { |
| "grad_norm": 0.3140500485897064, |
| "learning_rate": 0.0001994783711911514, |
| "loss": 0.0582, |
| "step": 810 |
| }, |
| { |
| "grad_norm": 0.2637600898742676, |
| "learning_rate": 0.00019944409431347338, |
| "loss": 0.0608, |
| "step": 820 |
| }, |
| { |
| "grad_norm": 0.35186314582824707, |
| "learning_rate": 0.00019940872993111234, |
| "loss": 0.0602, |
| "step": 830 |
| }, |
| { |
| "grad_norm": 0.24156039953231812, |
| "learning_rate": 0.00019937227843080745, |
| "loss": 0.0597, |
| "step": 840 |
| }, |
| { |
| "grad_norm": 0.3593619465827942, |
| "learning_rate": 0.00019933474021118652, |
| "loss": 0.0582, |
| "step": 850 |
| }, |
| { |
| "grad_norm": 0.23397046327590942, |
| "learning_rate": 0.00019929611568276145, |
| "loss": 0.0583, |
| "step": 860 |
| }, |
| { |
| "grad_norm": 0.312755823135376, |
| "learning_rate": 0.0001992564052679239, |
| "loss": 0.0575, |
| "step": 870 |
| }, |
| { |
| "grad_norm": 0.3655824065208435, |
| "learning_rate": 0.00019921560940094066, |
| "loss": 0.0583, |
| "step": 880 |
| }, |
| { |
| "grad_norm": 0.46782079339027405, |
| "learning_rate": 0.00019917372852794874, |
| "loss": 0.0581, |
| "step": 890 |
| }, |
| { |
| "grad_norm": 0.25634610652923584, |
| "learning_rate": 0.00019913076310695068, |
| "loss": 0.0565, |
| "step": 900 |
| }, |
| { |
| "grad_norm": 0.4292739927768707, |
| "learning_rate": 0.0001990867136078094, |
| "loss": 0.054, |
| "step": 910 |
| }, |
| { |
| "grad_norm": 0.3035515546798706, |
| "learning_rate": 0.00019904158051224324, |
| "loss": 0.0564, |
| "step": 920 |
| }, |
| { |
| "grad_norm": 0.261796772480011, |
| "learning_rate": 0.00019899536431382045, |
| "loss": 0.0552, |
| "step": 930 |
| }, |
| { |
| "grad_norm": 0.24954535067081451, |
| "learning_rate": 0.000198948065517954, |
| "loss": 0.0551, |
| "step": 940 |
| }, |
| { |
| "grad_norm": 0.3103758990764618, |
| "learning_rate": 0.00019889968464189588, |
| "loss": 0.0558, |
| "step": 950 |
| }, |
| { |
| "grad_norm": 0.24562659859657288, |
| "learning_rate": 0.00019885022221473168, |
| "loss": 0.0555, |
| "step": 960 |
| }, |
| { |
| "grad_norm": 0.19912436604499817, |
| "learning_rate": 0.00019879967877737452, |
| "loss": 0.0528, |
| "step": 970 |
| }, |
| { |
| "grad_norm": 0.27990421652793884, |
| "learning_rate": 0.0001987480548825594, |
| "loss": 0.0519, |
| "step": 980 |
| }, |
| { |
| "grad_norm": 0.2675786018371582, |
| "learning_rate": 0.0001986953510948369, |
| "loss": 0.054, |
| "step": 990 |
| }, |
| { |
| "grad_norm": 0.26553845405578613, |
| "learning_rate": 0.00019864156799056723, |
| "loss": 0.0547, |
| "step": 1000 |
| }, |
| { |
| "grad_norm": 0.27253258228302, |
| "learning_rate": 0.00019858670615791377, |
| "loss": 0.0544, |
| "step": 1010 |
| }, |
| { |
| "grad_norm": 0.2688347399234772, |
| "learning_rate": 0.00019853076619683678, |
| "loss": 0.0518, |
| "step": 1020 |
| }, |
| { |
| "grad_norm": 0.2039746195077896, |
| "learning_rate": 0.00019847374871908668, |
| "loss": 0.0516, |
| "step": 1030 |
| }, |
| { |
| "grad_norm": 0.3064139783382416, |
| "learning_rate": 0.00019841565434819747, |
| "loss": 0.0506, |
| "step": 1040 |
| }, |
| { |
| "grad_norm": 0.24837756156921387, |
| "learning_rate": 0.00019835648371947987, |
| "loss": 0.0499, |
| "step": 1050 |
| }, |
| { |
| "grad_norm": 0.4394986629486084, |
| "learning_rate": 0.00019829623748001445, |
| "loss": 0.0502, |
| "step": 1060 |
| }, |
| { |
| "grad_norm": 0.1843259632587433, |
| "learning_rate": 0.00019823491628864436, |
| "loss": 0.0489, |
| "step": 1070 |
| }, |
| { |
| "grad_norm": 0.19856247305870056, |
| "learning_rate": 0.0001981725208159684, |
| "loss": 0.0482, |
| "step": 1080 |
| }, |
| { |
| "grad_norm": 0.24292084574699402, |
| "learning_rate": 0.0001981090517443334, |
| "loss": 0.0481, |
| "step": 1090 |
| }, |
| { |
| "grad_norm": 0.168592169880867, |
| "learning_rate": 0.000198044509767827, |
| "loss": 0.0488, |
| "step": 1100 |
| }, |
| { |
| "grad_norm": 0.21119219064712524, |
| "learning_rate": 0.0001979788955922699, |
| "loss": 0.0483, |
| "step": 1110 |
| }, |
| { |
| "grad_norm": 0.2026098072528839, |
| "learning_rate": 0.0001979122099352082, |
| "loss": 0.0506, |
| "step": 1120 |
| }, |
| { |
| "grad_norm": 0.3283786177635193, |
| "learning_rate": 0.0001978444535259056, |
| "loss": 0.0487, |
| "step": 1130 |
| }, |
| { |
| "grad_norm": 0.28264889121055603, |
| "learning_rate": 0.00019777562710533527, |
| "loss": 0.048, |
| "step": 1140 |
| }, |
| { |
| "grad_norm": 0.15788021683692932, |
| "learning_rate": 0.00019770573142617197, |
| "loss": 0.0485, |
| "step": 1150 |
| }, |
| { |
| "grad_norm": 0.12264436483383179, |
| "learning_rate": 0.00019763476725278363, |
| "loss": 0.0453, |
| "step": 1160 |
| }, |
| { |
| "grad_norm": 0.2884804606437683, |
| "learning_rate": 0.0001975627353612231, |
| "loss": 0.046, |
| "step": 1170 |
| }, |
| { |
| "grad_norm": 0.2817215919494629, |
| "learning_rate": 0.00019748963653921957, |
| "loss": 0.0455, |
| "step": 1180 |
| }, |
| { |
| "grad_norm": 0.27066612243652344, |
| "learning_rate": 0.00019741547158617006, |
| "loss": 0.0472, |
| "step": 1190 |
| }, |
| { |
| "grad_norm": 0.2489967942237854, |
| "learning_rate": 0.00019734024131313067, |
| "loss": 0.0448, |
| "step": 1200 |
| }, |
| { |
| "grad_norm": 0.3080730140209198, |
| "learning_rate": 0.00019726394654280752, |
| "loss": 0.0465, |
| "step": 1210 |
| }, |
| { |
| "grad_norm": 0.29908615350723267, |
| "learning_rate": 0.00019718658810954806, |
| "loss": 0.046, |
| "step": 1220 |
| }, |
| { |
| "grad_norm": 0.266740083694458, |
| "learning_rate": 0.0001971081668593317, |
| "loss": 0.0462, |
| "step": 1230 |
| }, |
| { |
| "grad_norm": 0.1910102367401123, |
| "learning_rate": 0.00019702868364976066, |
| "loss": 0.044, |
| "step": 1240 |
| }, |
| { |
| "grad_norm": 0.18283650279045105, |
| "learning_rate": 0.0001969481393500506, |
| "loss": 0.0456, |
| "step": 1250 |
| }, |
| { |
| "grad_norm": 0.2236376255750656, |
| "learning_rate": 0.0001968665348410211, |
| "loss": 0.0438, |
| "step": 1260 |
| }, |
| { |
| "grad_norm": 0.22641244530677795, |
| "learning_rate": 0.00019678387101508595, |
| "loss": 0.046, |
| "step": 1270 |
| }, |
| { |
| "grad_norm": 0.24328957498073578, |
| "learning_rate": 0.00019670014877624353, |
| "loss": 0.0449, |
| "step": 1280 |
| }, |
| { |
| "grad_norm": 0.13703007996082306, |
| "learning_rate": 0.00019661536904006682, |
| "loss": 0.044, |
| "step": 1290 |
| }, |
| { |
| "grad_norm": 0.23394407331943512, |
| "learning_rate": 0.00019652953273369342, |
| "loss": 0.0434, |
| "step": 1300 |
| }, |
| { |
| "grad_norm": 0.1836947351694107, |
| "learning_rate": 0.00019644264079581546, |
| "loss": 0.0426, |
| "step": 1310 |
| }, |
| { |
| "grad_norm": 0.2846074104309082, |
| "learning_rate": 0.00019635469417666923, |
| "loss": 0.0418, |
| "step": 1320 |
| }, |
| { |
| "grad_norm": 0.2773747742176056, |
| "learning_rate": 0.00019626569383802487, |
| "loss": 0.0469, |
| "step": 1330 |
| }, |
| { |
| "grad_norm": 0.21178005635738373, |
| "learning_rate": 0.00019617564075317584, |
| "loss": 0.0418, |
| "step": 1340 |
| }, |
| { |
| "grad_norm": 0.1820414513349533, |
| "learning_rate": 0.00019608453590692822, |
| "loss": 0.044, |
| "step": 1350 |
| }, |
| { |
| "grad_norm": 0.21951481699943542, |
| "learning_rate": 0.00019599238029559005, |
| "loss": 0.0422, |
| "step": 1360 |
| }, |
| { |
| "grad_norm": 0.24982789158821106, |
| "learning_rate": 0.00019589917492696026, |
| "loss": 0.043, |
| "step": 1370 |
| }, |
| { |
| "grad_norm": 0.1591809093952179, |
| "learning_rate": 0.00019580492082031792, |
| "loss": 0.042, |
| "step": 1380 |
| }, |
| { |
| "grad_norm": 0.23038744926452637, |
| "learning_rate": 0.00019570961900641076, |
| "loss": 0.0427, |
| "step": 1390 |
| }, |
| { |
| "grad_norm": 0.31900501251220703, |
| "learning_rate": 0.0001956132705274442, |
| "loss": 0.0415, |
| "step": 1400 |
| }, |
| { |
| "grad_norm": 0.3238053619861603, |
| "learning_rate": 0.00019551587643706976, |
| "loss": 0.0413, |
| "step": 1410 |
| }, |
| { |
| "grad_norm": 0.2005499303340912, |
| "learning_rate": 0.0001954174378003736, |
| "loss": 0.0412, |
| "step": 1420 |
| }, |
| { |
| "grad_norm": 0.19018618762493134, |
| "learning_rate": 0.00019531795569386487, |
| "loss": 0.0393, |
| "step": 1430 |
| }, |
| { |
| "grad_norm": 0.30942776799201965, |
| "learning_rate": 0.00019521743120546394, |
| "loss": 0.0419, |
| "step": 1440 |
| }, |
| { |
| "grad_norm": 0.2342258095741272, |
| "learning_rate": 0.00019511586543449053, |
| "loss": 0.0409, |
| "step": 1450 |
| }, |
| { |
| "grad_norm": 0.18405261635780334, |
| "learning_rate": 0.00019501325949165167, |
| "loss": 0.0407, |
| "step": 1460 |
| }, |
| { |
| "grad_norm": 0.2921549379825592, |
| "learning_rate": 0.00019490961449902946, |
| "loss": 0.041, |
| "step": 1470 |
| }, |
| { |
| "grad_norm": 0.18793050944805145, |
| "learning_rate": 0.00019480493159006895, |
| "loss": 0.0434, |
| "step": 1480 |
| }, |
| { |
| "grad_norm": 0.21637773513793945, |
| "learning_rate": 0.0001946992119095657, |
| "loss": 0.0407, |
| "step": 1490 |
| }, |
| { |
| "grad_norm": 0.2454444169998169, |
| "learning_rate": 0.00019459245661365313, |
| "loss": 0.0399, |
| "step": 1500 |
| }, |
| { |
| "grad_norm": 0.14031817018985748, |
| "learning_rate": 0.00019448466686979008, |
| "loss": 0.0374, |
| "step": 1510 |
| }, |
| { |
| "grad_norm": 0.26323285698890686, |
| "learning_rate": 0.00019437584385674793, |
| "loss": 0.0397, |
| "step": 1520 |
| }, |
| { |
| "grad_norm": 0.20086735486984253, |
| "learning_rate": 0.00019426598876459773, |
| "loss": 0.0395, |
| "step": 1530 |
| }, |
| { |
| "grad_norm": 0.2568920850753784, |
| "learning_rate": 0.0001941551027946971, |
| "loss": 0.0422, |
| "step": 1540 |
| }, |
| { |
| "grad_norm": 0.11995324492454529, |
| "learning_rate": 0.00019404318715967732, |
| "loss": 0.0398, |
| "step": 1550 |
| }, |
| { |
| "grad_norm": 0.17859072983264923, |
| "learning_rate": 0.00019393024308342983, |
| "loss": 0.039, |
| "step": 1560 |
| }, |
| { |
| "grad_norm": 0.2049851417541504, |
| "learning_rate": 0.0001938162718010929, |
| "loss": 0.0371, |
| "step": 1570 |
| }, |
| { |
| "grad_norm": 0.1736784279346466, |
| "learning_rate": 0.00019370127455903827, |
| "loss": 0.0393, |
| "step": 1580 |
| }, |
| { |
| "grad_norm": 0.16930192708969116, |
| "learning_rate": 0.0001935852526148573, |
| "loss": 0.0382, |
| "step": 1590 |
| }, |
| { |
| "grad_norm": 0.23014754056930542, |
| "learning_rate": 0.00019346820723734745, |
| "loss": 0.0401, |
| "step": 1600 |
| }, |
| { |
| "grad_norm": 0.27679961919784546, |
| "learning_rate": 0.00019335013970649817, |
| "loss": 0.0377, |
| "step": 1610 |
| }, |
| { |
| "grad_norm": 0.3353395462036133, |
| "learning_rate": 0.0001932310513134771, |
| "loss": 0.0411, |
| "step": 1620 |
| }, |
| { |
| "grad_norm": 0.2344261258840561, |
| "learning_rate": 0.00019311094336061578, |
| "loss": 0.0396, |
| "step": 1630 |
| }, |
| { |
| "grad_norm": 0.24594134092330933, |
| "learning_rate": 0.00019298981716139553, |
| "loss": 0.0395, |
| "step": 1640 |
| }, |
| { |
| "grad_norm": 0.16762520372867584, |
| "learning_rate": 0.00019286767404043316, |
| "loss": 0.0366, |
| "step": 1650 |
| }, |
| { |
| "grad_norm": 0.24015094339847565, |
| "learning_rate": 0.00019274451533346615, |
| "loss": 0.0358, |
| "step": 1660 |
| }, |
| { |
| "grad_norm": 0.1919541358947754, |
| "learning_rate": 0.00019262034238733843, |
| "loss": 0.0367, |
| "step": 1670 |
| }, |
| { |
| "grad_norm": 0.13164806365966797, |
| "learning_rate": 0.00019249515655998545, |
| "loss": 0.0348, |
| "step": 1680 |
| }, |
| { |
| "grad_norm": 0.23270496726036072, |
| "learning_rate": 0.00019236895922041943, |
| "loss": 0.0383, |
| "step": 1690 |
| }, |
| { |
| "grad_norm": 0.16496695578098297, |
| "learning_rate": 0.00019224175174871415, |
| "loss": 0.0383, |
| "step": 1700 |
| }, |
| { |
| "grad_norm": 0.17377837002277374, |
| "learning_rate": 0.00019211353553599017, |
| "loss": 0.0371, |
| "step": 1710 |
| }, |
| { |
| "grad_norm": 0.1682979315519333, |
| "learning_rate": 0.00019198431198439947, |
| "loss": 0.0383, |
| "step": 1720 |
| }, |
| { |
| "grad_norm": 0.2559478282928467, |
| "learning_rate": 0.0001918540825071101, |
| "loss": 0.0374, |
| "step": 1730 |
| }, |
| { |
| "grad_norm": 0.16148176789283752, |
| "learning_rate": 0.00019172284852829075, |
| "loss": 0.0346, |
| "step": 1740 |
| }, |
| { |
| "grad_norm": 0.21018460392951965, |
| "learning_rate": 0.0001915906114830952, |
| "loss": 0.0354, |
| "step": 1750 |
| }, |
| { |
| "grad_norm": 0.1702902913093567, |
| "learning_rate": 0.00019145737281764657, |
| "loss": 0.0365, |
| "step": 1760 |
| }, |
| { |
| "grad_norm": 0.1441521942615509, |
| "learning_rate": 0.0001913231339890216, |
| "loss": 0.0366, |
| "step": 1770 |
| }, |
| { |
| "grad_norm": 0.17645713686943054, |
| "learning_rate": 0.0001911878964652346, |
| "loss": 0.0365, |
| "step": 1780 |
| }, |
| { |
| "grad_norm": 0.2643643021583557, |
| "learning_rate": 0.00019105166172522138, |
| "loss": 0.0387, |
| "step": 1790 |
| }, |
| { |
| "grad_norm": 0.17437313497066498, |
| "learning_rate": 0.00019091443125882337, |
| "loss": 0.0391, |
| "step": 1800 |
| }, |
| { |
| "grad_norm": 0.17275704443454742, |
| "learning_rate": 0.00019077620656677085, |
| "loss": 0.036, |
| "step": 1810 |
| }, |
| { |
| "grad_norm": 0.17451389133930206, |
| "learning_rate": 0.00019063698916066697, |
| "loss": 0.0345, |
| "step": 1820 |
| }, |
| { |
| "grad_norm": 0.19119150936603546, |
| "learning_rate": 0.00019049678056297094, |
| "loss": 0.0352, |
| "step": 1830 |
| }, |
| { |
| "grad_norm": 0.17107735574245453, |
| "learning_rate": 0.00019035558230698154, |
| "loss": 0.035, |
| "step": 1840 |
| }, |
| { |
| "grad_norm": 0.19179688394069672, |
| "learning_rate": 0.00019021339593682028, |
| "loss": 0.0351, |
| "step": 1850 |
| }, |
| { |
| "grad_norm": 0.18375959992408752, |
| "learning_rate": 0.00019007022300741454, |
| "loss": 0.0374, |
| "step": 1860 |
| }, |
| { |
| "grad_norm": 0.242448091506958, |
| "learning_rate": 0.00018992606508448049, |
| "loss": 0.0356, |
| "step": 1870 |
| }, |
| { |
| "grad_norm": 0.20058102905750275, |
| "learning_rate": 0.00018978092374450611, |
| "loss": 0.0345, |
| "step": 1880 |
| }, |
| { |
| "grad_norm": 0.14428074657917023, |
| "learning_rate": 0.00018963480057473383, |
| "loss": 0.0343, |
| "step": 1890 |
| }, |
| { |
| "grad_norm": 0.18150261044502258, |
| "learning_rate": 0.00018948769717314328, |
| "loss": 0.0341, |
| "step": 1900 |
| }, |
| { |
| "grad_norm": 0.1219368577003479, |
| "learning_rate": 0.00018933961514843359, |
| "loss": 0.0327, |
| "step": 1910 |
| }, |
| { |
| "grad_norm": 0.24976667761802673, |
| "learning_rate": 0.0001891905561200061, |
| "loss": 0.0338, |
| "step": 1920 |
| }, |
| { |
| "grad_norm": 0.2597788870334625, |
| "learning_rate": 0.0001890405217179465, |
| "loss": 0.0349, |
| "step": 1930 |
| }, |
| { |
| "grad_norm": 0.18376731872558594, |
| "learning_rate": 0.00018888951358300696, |
| "loss": 0.0335, |
| "step": 1940 |
| }, |
| { |
| "grad_norm": 0.20351895689964294, |
| "learning_rate": 0.00018873753336658822, |
| "loss": 0.0344, |
| "step": 1950 |
| }, |
| { |
| "grad_norm": 0.2000509649515152, |
| "learning_rate": 0.00018858458273072158, |
| "loss": 0.0341, |
| "step": 1960 |
| }, |
| { |
| "grad_norm": 0.19135110080242157, |
| "learning_rate": 0.00018843066334805068, |
| "loss": 0.0322, |
| "step": 1970 |
| }, |
| { |
| "grad_norm": 0.1353762149810791, |
| "learning_rate": 0.00018827577690181332, |
| "loss": 0.0338, |
| "step": 1980 |
| }, |
| { |
| "grad_norm": 0.21709775924682617, |
| "learning_rate": 0.00018811992508582272, |
| "loss": 0.0349, |
| "step": 1990 |
| }, |
| { |
| "grad_norm": 0.21062973141670227, |
| "learning_rate": 0.0001879631096044495, |
| "loss": 0.0327, |
| "step": 2000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 10000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 320, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|