{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9981916817359855,
  "eval_steps": 500,
  "global_step": 138,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.007233273056057866,
      "grad_norm": 5.463847855317702,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.0902,
      "step": 1
    },
    {
      "epoch": 0.014466546112115732,
      "grad_norm": 4.681007032225575,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.9968,
      "step": 2
    },
    {
      "epoch": 0.0216998191681736,
      "grad_norm": 4.218229671072367,
      "learning_rate": 1.2e-05,
      "loss": 0.9226,
      "step": 3
    },
    {
      "epoch": 0.028933092224231464,
      "grad_norm": 9.650732793859843,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 0.896,
      "step": 4
    },
    {
      "epoch": 0.03616636528028933,
      "grad_norm": 5.578584837598941,
      "learning_rate": 2e-05,
      "loss": 0.7791,
      "step": 5
    },
    {
      "epoch": 0.0433996383363472,
      "grad_norm": 4.789830044233945,
      "learning_rate": 1.9997210372120276e-05,
      "loss": 0.6718,
      "step": 6
    },
    {
      "epoch": 0.05063291139240506,
      "grad_norm": 4.141804977350294,
      "learning_rate": 1.998884304488584e-05,
      "loss": 0.6076,
      "step": 7
    },
    {
      "epoch": 0.05786618444846293,
      "grad_norm": 3.5515048983104056,
      "learning_rate": 1.997490268664256e-05,
      "loss": 0.4989,
      "step": 8
    },
    {
      "epoch": 0.0650994575045208,
      "grad_norm": 3.153711457430971,
      "learning_rate": 1.995539707507284e-05,
      "loss": 0.4152,
      "step": 9
    },
    {
      "epoch": 0.07233273056057866,
      "grad_norm": 2.9236266977118777,
      "learning_rate": 1.9930337092856243e-05,
      "loss": 0.4002,
      "step": 10
    },
    {
      "epoch": 0.07956600361663653,
      "grad_norm": 2.876849628749717,
      "learning_rate": 1.9899736721597787e-05,
      "loss": 0.395,
      "step": 11
    },
    {
      "epoch": 0.0867992766726944,
      "grad_norm": 2.4107246576130668,
      "learning_rate": 1.9863613034027224e-05,
      "loss": 0.3166,
      "step": 12
    },
    {
      "epoch": 0.09403254972875226,
      "grad_norm": 2.2339458170958726,
      "learning_rate": 1.9821986184473757e-05,
      "loss": 0.3097,
      "step": 13
    },
    {
      "epoch": 0.10126582278481013,
      "grad_norm": 2.2197484192147043,
      "learning_rate": 1.9774879397621387e-05,
      "loss": 0.3013,
      "step": 14
    },
    {
      "epoch": 0.10849909584086799,
      "grad_norm": 1.9826673969406083,
      "learning_rate": 1.9722318955551307e-05,
      "loss": 0.3094,
      "step": 15
    },
    {
      "epoch": 0.11573236889692586,
      "grad_norm": 2.984906540777772,
      "learning_rate": 1.966433418307843e-05,
      "loss": 0.3331,
      "step": 16
    },
    {
      "epoch": 0.12296564195298372,
      "grad_norm": 2.2412636771179346,
      "learning_rate": 1.960095743139033e-05,
      "loss": 0.3285,
      "step": 17
    },
    {
      "epoch": 0.1301989150090416,
      "grad_norm": 1.857679521391161,
      "learning_rate": 1.9532224059997693e-05,
      "loss": 0.2672,
      "step": 18
    },
    {
      "epoch": 0.13743218806509946,
      "grad_norm": 2.0139486786788434,
      "learning_rate": 1.9458172417006347e-05,
      "loss": 0.2523,
      "step": 19
    },
    {
      "epoch": 0.14466546112115733,
      "grad_norm": 1.7175026487258431,
      "learning_rate": 1.9378843817721856e-05,
      "loss": 0.2444,
      "step": 20
    },
    {
      "epoch": 0.1518987341772152,
      "grad_norm": 1.855948744613208,
      "learning_rate": 1.929428252159866e-05,
      "loss": 0.2489,
      "step": 21
    },
    {
      "epoch": 0.15913200723327306,
      "grad_norm": 2.2399762935756637,
      "learning_rate": 1.9204535707546602e-05,
      "loss": 0.347,
      "step": 22
    },
    {
      "epoch": 0.16636528028933092,
      "grad_norm": 1.8979316016809038,
      "learning_rate": 1.9109653447608607e-05,
      "loss": 0.2244,
      "step": 23
    },
    {
      "epoch": 0.1735985533453888,
      "grad_norm": 1.8508155059344538,
      "learning_rate": 1.900968867902419e-05,
      "loss": 0.2151,
      "step": 24
    },
    {
      "epoch": 0.18083182640144665,
      "grad_norm": 1.8054619356574957,
      "learning_rate": 1.8904697174694447e-05,
      "loss": 0.2414,
      "step": 25
    },
    {
      "epoch": 0.18806509945750452,
      "grad_norm": 1.633010553246676,
      "learning_rate": 1.879473751206489e-05,
      "loss": 0.2211,
      "step": 26
    },
    {
      "epoch": 0.19529837251356238,
      "grad_norm": 1.774632206684753,
      "learning_rate": 1.8679871040443632e-05,
      "loss": 0.2194,
      "step": 27
    },
    {
      "epoch": 0.20253164556962025,
      "grad_norm": 1.48448462933083,
      "learning_rate": 1.8560161846773002e-05,
      "loss": 0.1607,
      "step": 28
    },
    {
      "epoch": 0.20976491862567812,
      "grad_norm": 1.950497681891261,
      "learning_rate": 1.8435676719873828e-05,
      "loss": 0.1848,
      "step": 29
    },
    {
      "epoch": 0.21699819168173598,
      "grad_norm": 1.646269811867694,
      "learning_rate": 1.830648511318223e-05,
      "loss": 0.1726,
      "step": 30
    },
    {
      "epoch": 0.22423146473779385,
      "grad_norm": 1.3277544792281775,
      "learning_rate": 1.817265910599978e-05,
      "loss": 0.1492,
      "step": 31
    },
    {
      "epoch": 0.2314647377938517,
      "grad_norm": 1.52978658678254,
      "learning_rate": 1.8034273363278615e-05,
      "loss": 0.1964,
      "step": 32
    },
    {
      "epoch": 0.23869801084990958,
      "grad_norm": 1.8231843149019955,
      "learning_rate": 1.789140509396394e-05,
      "loss": 0.2086,
      "step": 33
    },
    {
      "epoch": 0.24593128390596744,
      "grad_norm": 1.8149511918444887,
      "learning_rate": 1.7744134007917195e-05,
      "loss": 0.2206,
      "step": 34
    },
    {
      "epoch": 0.25316455696202533,
      "grad_norm": 1.7081708732378678,
      "learning_rate": 1.7592542271443888e-05,
      "loss": 0.2302,
      "step": 35
    },
    {
      "epoch": 0.2603978300180832,
      "grad_norm": 1.4494531226173395,
      "learning_rate": 1.74367144614509e-05,
      "loss": 0.1969,
      "step": 36
    },
    {
      "epoch": 0.26763110307414106,
      "grad_norm": 1.4435352798148773,
      "learning_rate": 1.7276737518258865e-05,
      "loss": 0.1662,
      "step": 37
    },
    {
      "epoch": 0.27486437613019893,
      "grad_norm": 1.5398089087755158,
      "learning_rate": 1.7112700697095955e-05,
      "loss": 0.1961,
      "step": 38
    },
    {
      "epoch": 0.2820976491862568,
      "grad_norm": 1.5371251862624924,
      "learning_rate": 1.6944695518300087e-05,
      "loss": 0.1824,
      "step": 39
    },
    {
      "epoch": 0.28933092224231466,
      "grad_norm": 1.2387761223387177,
      "learning_rate": 1.6772815716257414e-05,
      "loss": 0.1691,
      "step": 40
    },
    {
      "epoch": 0.2965641952983725,
      "grad_norm": 1.4819991639273267,
      "learning_rate": 1.6597157187105475e-05,
      "loss": 0.1817,
      "step": 41
    },
    {
      "epoch": 0.3037974683544304,
      "grad_norm": 1.1328162832418003,
      "learning_rate": 1.6417817935230318e-05,
      "loss": 0.1418,
      "step": 42
    },
    {
      "epoch": 0.31103074141048825,
      "grad_norm": 1.6609923134511753,
      "learning_rate": 1.6234898018587336e-05,
      "loss": 0.2262,
      "step": 43
    },
    {
      "epoch": 0.3182640144665461,
      "grad_norm": 1.101183253773407,
      "learning_rate": 1.6048499492876378e-05,
      "loss": 0.1134,
      "step": 44
    },
    {
      "epoch": 0.325497287522604,
      "grad_norm": 1.735418027034885,
      "learning_rate": 1.5858726354602248e-05,
      "loss": 0.175,
      "step": 45
    },
    {
      "epoch": 0.33273056057866185,
      "grad_norm": 1.1925277558480014,
      "learning_rate": 1.5665684483052425e-05,
      "loss": 0.1415,
      "step": 46
    },
    {
      "epoch": 0.3399638336347197,
      "grad_norm": 1.7164775271513815,
      "learning_rate": 1.5469481581224274e-05,
      "loss": 0.2229,
      "step": 47
    },
    {
      "epoch": 0.3471971066907776,
      "grad_norm": 1.54944673239779,
      "learning_rate": 1.527022711573479e-05,
      "loss": 0.2009,
      "step": 48
    },
    {
      "epoch": 0.35443037974683544,
      "grad_norm": 1.052952493457072,
      "learning_rate": 1.50680322557464e-05,
      "loss": 0.1244,
      "step": 49
    },
    {
      "epoch": 0.3616636528028933,
      "grad_norm": 1.2955834096511674,
      "learning_rate": 1.4863009810942814e-05,
      "loss": 0.1426,
      "step": 50
    },
    {
      "epoch": 0.3688969258589512,
      "grad_norm": 1.0647479266524689,
      "learning_rate": 1.4655274168589635e-05,
      "loss": 0.1171,
      "step": 51
    },
    {
      "epoch": 0.37613019891500904,
      "grad_norm": 1.3531117302986653,
      "learning_rate": 1.444494122971476e-05,
      "loss": 0.144,
      "step": 52
    },
    {
      "epoch": 0.3833634719710669,
      "grad_norm": 1.5396377899251914,
      "learning_rate": 1.4232128344444251e-05,
      "loss": 0.1426,
      "step": 53
    },
    {
      "epoch": 0.39059674502712477,
      "grad_norm": 1.6556906631060258,
      "learning_rate": 1.4016954246529697e-05,
      "loss": 0.1834,
      "step": 54
    },
    {
      "epoch": 0.39783001808318263,
      "grad_norm": 1.4971003634614435,
      "learning_rate": 1.37995389871036e-05,
      "loss": 0.1696,
      "step": 55
    },
    {
      "epoch": 0.4050632911392405,
      "grad_norm": 1.3952013105051067,
      "learning_rate": 1.3580003867699801e-05,
      "loss": 0.148,
      "step": 56
    },
    {
      "epoch": 0.41229656419529837,
      "grad_norm": 1.146527628567494,
      "learning_rate": 1.3358471372576229e-05,
      "loss": 0.1225,
      "step": 57
    },
    {
      "epoch": 0.41952983725135623,
      "grad_norm": 1.1979689508816695,
      "learning_rate": 1.3135065100377816e-05,
      "loss": 0.1629,
      "step": 58
    },
    {
      "epoch": 0.4267631103074141,
      "grad_norm": 1.361570996632967,
      "learning_rate": 1.2909909695177647e-05,
      "loss": 0.1762,
      "step": 59
    },
    {
      "epoch": 0.43399638336347196,
      "grad_norm": 1.459216322827295,
      "learning_rate": 1.268313077693485e-05,
      "loss": 0.1823,
      "step": 60
    },
    {
      "epoch": 0.4412296564195298,
      "grad_norm": 0.9032128659262652,
      "learning_rate": 1.2454854871407993e-05,
      "loss": 0.1004,
      "step": 61
    },
    {
      "epoch": 0.4484629294755877,
      "grad_norm": 1.2429585892566153,
      "learning_rate": 1.2225209339563144e-05,
      "loss": 0.1431,
      "step": 62
    },
    {
      "epoch": 0.45569620253164556,
      "grad_norm": 1.2516084012000934,
      "learning_rate": 1.1994322306515926e-05,
      "loss": 0.1623,
      "step": 63
    },
    {
      "epoch": 0.4629294755877034,
      "grad_norm": 1.196502381130855,
      "learning_rate": 1.176232259004722e-05,
      "loss": 0.1329,
      "step": 64
    },
    {
      "epoch": 0.4701627486437613,
      "grad_norm": 1.3204205518835002,
      "learning_rate": 1.1529339628732462e-05,
      "loss": 0.1617,
      "step": 65
    },
    {
      "epoch": 0.47739602169981915,
      "grad_norm": 1.033121940148869,
      "learning_rate": 1.1295503409724526e-05,
      "loss": 0.1311,
      "step": 66
    },
    {
      "epoch": 0.484629294755877,
      "grad_norm": 1.169007325201895,
      "learning_rate": 1.1060944396230583e-05,
      "loss": 0.1597,
      "step": 67
    },
    {
      "epoch": 0.4918625678119349,
      "grad_norm": 1.256316293799304,
      "learning_rate": 1.0825793454723325e-05,
      "loss": 0.1558,
      "step": 68
    },
    {
      "epoch": 0.49909584086799275,
      "grad_norm": 0.9373040893556883,
      "learning_rate": 1.0590181781927229e-05,
      "loss": 0.1246,
      "step": 69
    },
    {
      "epoch": 0.5063291139240507,
      "grad_norm": 1.193004996300962,
      "learning_rate": 1.0354240831620542e-05,
      "loss": 0.1626,
      "step": 70
    },
    {
      "epoch": 0.5135623869801085,
      "grad_norm": 1.2648937590042812,
      "learning_rate": 1.0118102241293848e-05,
      "loss": 0.1252,
      "step": 71
    },
    {
      "epoch": 0.5207956600361664,
      "grad_norm": 1.2964585274228753,
      "learning_rate": 9.881897758706155e-06,
      "loss": 0.1794,
      "step": 72
    },
    {
      "epoch": 0.5280289330922242,
      "grad_norm": 1.0311616305251576,
      "learning_rate": 9.645759168379463e-06,
      "loss": 0.1029,
      "step": 73
    },
    {
      "epoch": 0.5352622061482821,
      "grad_norm": 1.4796928099723654,
      "learning_rate": 9.409818218072774e-06,
      "loss": 0.1932,
      "step": 74
    },
    {
      "epoch": 0.5424954792043399,
      "grad_norm": 1.422580485613523,
      "learning_rate": 9.174206545276678e-06,
      "loss": 0.138,
      "step": 75
    },
    {
      "epoch": 0.5497287522603979,
      "grad_norm": 1.2903416870852855,
      "learning_rate": 8.93905560376942e-06,
      "loss": 0.1405,
      "step": 76
    },
    {
      "epoch": 0.5569620253164557,
      "grad_norm": 0.8368005903323104,
      "learning_rate": 8.704496590275479e-06,
      "loss": 0.0839,
      "step": 77
    },
    {
      "epoch": 0.5641952983725136,
      "grad_norm": 1.462710303908404,
      "learning_rate": 8.47066037126754e-06,
      "loss": 0.1234,
      "step": 78
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 1.5099630306756744,
      "learning_rate": 8.237677409952784e-06,
      "loss": 0.1632,
      "step": 79
    },
    {
      "epoch": 0.5786618444846293,
      "grad_norm": 1.1554759264435244,
      "learning_rate": 8.005677693484077e-06,
      "loss": 0.1189,
      "step": 80
    },
    {
      "epoch": 0.5858951175406871,
      "grad_norm": 0.9519115511119977,
      "learning_rate": 7.774790660436857e-06,
      "loss": 0.1184,
      "step": 81
    },
    {
      "epoch": 0.593128390596745,
      "grad_norm": 1.1753813368542916,
      "learning_rate": 7.545145128592009e-06,
      "loss": 0.1469,
      "step": 82
    },
    {
      "epoch": 0.6003616636528029,
      "grad_norm": 0.8852628329465493,
      "learning_rate": 7.316869223065156e-06,
      "loss": 0.1052,
      "step": 83
    },
    {
      "epoch": 0.6075949367088608,
      "grad_norm": 1.2158386219830792,
      "learning_rate": 7.090090304822356e-06,
      "loss": 0.1615,
      "step": 84
    },
    {
      "epoch": 0.6148282097649186,
      "grad_norm": 1.2467724590478952,
      "learning_rate": 6.864934899622191e-06,
      "loss": 0.1818,
      "step": 85
    },
    {
      "epoch": 0.6220614828209765,
      "grad_norm": 1.3636775085007256,
      "learning_rate": 6.6415286274237744e-06,
      "loss": 0.1395,
      "step": 86
    },
    {
      "epoch": 0.6292947558770343,
      "grad_norm": 1.282395006899603,
      "learning_rate": 6.419996132300203e-06,
      "loss": 0.1565,
      "step": 87
    },
    {
      "epoch": 0.6365280289330922,
      "grad_norm": 1.0471222095326378,
      "learning_rate": 6.200461012896401e-06,
      "loss": 0.1234,
      "step": 88
    },
    {
      "epoch": 0.64376130198915,
      "grad_norm": 1.0937728472618042,
      "learning_rate": 5.983045753470308e-06,
      "loss": 0.1249,
      "step": 89
    },
    {
      "epoch": 0.650994575045208,
      "grad_norm": 0.9151031662804701,
      "learning_rate": 5.7678716555557515e-06,
      "loss": 0.1127,
      "step": 90
    },
    {
      "epoch": 0.6582278481012658,
      "grad_norm": 1.0701635271377732,
      "learning_rate": 5.5550587702852465e-06,
      "loss": 0.1147,
      "step": 91
    },
    {
      "epoch": 0.6654611211573237,
      "grad_norm": 1.4324884227966928,
      "learning_rate": 5.344725831410369e-06,
      "loss": 0.1539,
      "step": 92
    },
    {
      "epoch": 0.6726943942133815,
      "grad_norm": 1.1833796524874158,
      "learning_rate": 5.136990189057187e-06,
      "loss": 0.1473,
      "step": 93
    },
    {
      "epoch": 0.6799276672694394,
      "grad_norm": 1.3412949266346563,
      "learning_rate": 4.931967744253601e-06,
      "loss": 0.1825,
      "step": 94
    },
    {
      "epoch": 0.6871609403254972,
      "grad_norm": 1.1919515749555813,
      "learning_rate": 4.729772884265212e-06,
      "loss": 0.1645,
      "step": 95
    },
    {
      "epoch": 0.6943942133815552,
      "grad_norm": 1.121963526401964,
      "learning_rate": 4.530518418775734e-06,
      "loss": 0.1669,
      "step": 96
    },
    {
      "epoch": 0.701627486437613,
      "grad_norm": 1.106624227413996,
      "learning_rate": 4.33431551694758e-06,
      "loss": 0.1636,
      "step": 97
    },
    {
      "epoch": 0.7088607594936709,
      "grad_norm": 1.1060273425542941,
      "learning_rate": 4.1412736453977545e-06,
      "loss": 0.1634,
      "step": 98
    },
    {
      "epoch": 0.7160940325497287,
      "grad_norm": 1.0995104844138277,
      "learning_rate": 3.9515005071236274e-06,
      "loss": 0.1342,
      "step": 99
    },
    {
      "epoch": 0.7233273056057866,
      "grad_norm": 1.3721525349668775,
      "learning_rate": 3.7651019814126656e-06,
      "loss": 0.1339,
      "step": 100
    },
    {
      "epoch": 0.7305605786618445,
      "grad_norm": 1.3147221154801705,
      "learning_rate": 3.582182064769687e-06,
      "loss": 0.1577,
      "step": 101
    },
    {
      "epoch": 0.7377938517179023,
      "grad_norm": 1.1356847248978508,
      "learning_rate": 3.402842812894529e-06,
      "loss": 0.1131,
      "step": 102
    },
    {
      "epoch": 0.7450271247739603,
      "grad_norm": 1.0016288001929678,
      "learning_rate": 3.2271842837425917e-06,
      "loss": 0.1225,
      "step": 103
    },
    {
      "epoch": 0.7522603978300181,
      "grad_norm": 1.3052781470084382,
      "learning_rate": 3.0553044816999133e-06,
      "loss": 0.1927,
      "step": 104
    },
    {
      "epoch": 0.759493670886076,
      "grad_norm": 1.3624560033988535,
      "learning_rate": 2.8872993029040506e-06,
      "loss": 0.1553,
      "step": 105
    },
    {
      "epoch": 0.7667269439421338,
      "grad_norm": 1.2446145740957406,
      "learning_rate": 2.723262481741138e-06,
      "loss": 0.1378,
      "step": 106
    },
    {
      "epoch": 0.7739602169981917,
      "grad_norm": 1.5537240215768289,
      "learning_rate": 2.563285538549104e-06,
      "loss": 0.1908,
      "step": 107
    },
    {
      "epoch": 0.7811934900542495,
      "grad_norm": 1.2200310855744627,
      "learning_rate": 2.407457728556115e-06,
      "loss": 0.1441,
      "step": 108
    },
    {
      "epoch": 0.7884267631103075,
      "grad_norm": 1.3767110614355733,
      "learning_rate": 2.2558659920828095e-06,
      "loss": 0.1794,
      "step": 109
    },
    {
      "epoch": 0.7956600361663653,
      "grad_norm": 1.3646317595886468,
      "learning_rate": 2.1085949060360654e-06,
      "loss": 0.1761,
      "step": 110
    },
    {
      "epoch": 0.8028933092224232,
      "grad_norm": 0.7397893286023046,
      "learning_rate": 1.96572663672139e-06,
      "loss": 0.1045,
      "step": 111
    },
    {
      "epoch": 0.810126582278481,
      "grad_norm": 1.0092396426988175,
      "learning_rate": 1.8273408940002202e-06,
      "loss": 0.1162,
      "step": 112
    },
    {
      "epoch": 0.8173598553345389,
      "grad_norm": 1.668265792932794,
      "learning_rate": 1.693514886817772e-06,
      "loss": 0.1777,
      "step": 113
    },
    {
      "epoch": 0.8245931283905967,
      "grad_norm": 1.3589428752926316,
      "learning_rate": 1.5643232801261731e-06,
      "loss": 0.1687,
      "step": 114
    },
    {
      "epoch": 0.8318264014466547,
      "grad_norm": 0.8841209658700555,
      "learning_rate": 1.4398381532270001e-06,
      "loss": 0.1079,
      "step": 115
    },
    {
      "epoch": 0.8390596745027125,
      "grad_norm": 0.8642771280686085,
      "learning_rate": 1.3201289595563693e-06,
      "loss": 0.1247,
      "step": 116
    },
    {
      "epoch": 0.8462929475587704,
      "grad_norm": 1.1725446043997025,
      "learning_rate": 1.2052624879351105e-06,
      "loss": 0.1118,
      "step": 117
    },
    {
      "epoch": 0.8535262206148282,
      "grad_norm": 0.8430507728068868,
      "learning_rate": 1.0953028253055541e-06,
      "loss": 0.1154,
      "step": 118
    },
    {
      "epoch": 0.8607594936708861,
      "grad_norm": 1.1514692393692905,
      "learning_rate": 9.903113209758098e-07,
      "loss": 0.1643,
      "step": 119
    },
    {
      "epoch": 0.8679927667269439,
      "grad_norm": 1.0531754579482937,
      "learning_rate": 8.903465523913957e-07,
      "loss": 0.1195,
      "step": 120
    },
    {
      "epoch": 0.8752260397830018,
      "grad_norm": 1.1752942657410133,
      "learning_rate": 7.954642924533995e-07,
      "loss": 0.1576,
      "step": 121
    },
    {
      "epoch": 0.8824593128390597,
      "grad_norm": 1.5113189404876857,
      "learning_rate": 7.057174784013432e-07,
      "loss": 0.1954,
      "step": 122
    },
    {
      "epoch": 0.8896925858951176,
      "grad_norm": 0.9763145279668668,
      "learning_rate": 6.211561822781476e-07,
      "loss": 0.1358,
      "step": 123
    },
    {
      "epoch": 0.8969258589511754,
      "grad_norm": 1.0884981959132525,
      "learning_rate": 5.418275829936537e-07,
      "loss": 0.1455,
      "step": 124
    },
    {
      "epoch": 0.9041591320072333,
      "grad_norm": 1.1897400189233847,
      "learning_rate": 4.6777594000230855e-07,
      "loss": 0.1669,
      "step": 125
    },
    {
      "epoch": 0.9113924050632911,
      "grad_norm": 1.225645214005749,
      "learning_rate": 3.9904256860967436e-07,
      "loss": 0.149,
      "step": 126
    },
    {
      "epoch": 0.918625678119349,
      "grad_norm": 1.465044305909909,
      "learning_rate": 3.356658169215743e-07,
      "loss": 0.1481,
      "step": 127
    },
    {
      "epoch": 0.9258589511754068,
      "grad_norm": 1.2424255190165467,
      "learning_rate": 2.776810444486944e-07,
      "loss": 0.1579,
      "step": 128
    },
    {
      "epoch": 0.9330922242314648,
      "grad_norm": 1.2063094148288678,
      "learning_rate": 2.2512060237861455e-07,
      "loss": 0.1326,
      "step": 129
    },
    {
      "epoch": 0.9403254972875226,
      "grad_norm": 1.0826240342833202,
      "learning_rate": 1.7801381552624565e-07,
      "loss": 0.1157,
      "step": 130
    },
    {
      "epoch": 0.9475587703435805,
      "grad_norm": 1.2012754222257493,
      "learning_rate": 1.3638696597277678e-07,
      "loss": 0.1554,
      "step": 131
    },
    {
      "epoch": 0.9547920433996383,
      "grad_norm": 1.0701124216474254,
      "learning_rate": 1.0026327840221728e-07,
      "loss": 0.1543,
      "step": 132
    },
    {
      "epoch": 0.9620253164556962,
      "grad_norm": 1.0800203595933195,
      "learning_rate": 6.966290714375934e-08,
      "loss": 0.1484,
      "step": 133
    },
    {
      "epoch": 0.969258589511754,
      "grad_norm": 1.0589626716764733,
      "learning_rate": 4.460292492716512e-08,
      "loss": 0.1635,
      "step": 134
    },
    {
      "epoch": 0.976491862567812,
      "grad_norm": 0.7207109463474409,
      "learning_rate": 2.509731335744281e-08,
      "loss": 0.0853,
      "step": 135
    },
    {
      "epoch": 0.9837251356238698,
      "grad_norm": 1.3772819365603755,
      "learning_rate": 1.1156955114162149e-08,
      "loss": 0.1777,
      "step": 136
    },
    {
      "epoch": 0.9909584086799277,
      "grad_norm": 1.4747350400123909,
      "learning_rate": 2.7896278797256983e-09,
      "loss": 0.1516,
      "step": 137
    },
    {
      "epoch": 0.9981916817359855,
      "grad_norm": 1.3893837186428997,
      "learning_rate": 0.0,
      "loss": 0.1907,
      "step": 138
    },
    {
      "epoch": 0.9981916817359855,
      "step": 138,
      "total_flos": 283290642284544.0,
      "train_loss": 0.2102620561701664,
      "train_runtime": 1187.4984,
      "train_samples_per_second": 14.899,
      "train_steps_per_second": 0.116
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 138,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 283290642284544.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}