{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.983277591973244,
  "eval_steps": 37,
  "global_step": 222,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.013377926421404682,
      "grad_norm": 7.5698137283325195,
      "learning_rate": 1.3333333333333332e-06,
      "loss": 2.1804,
      "step": 1
    },
    {
      "epoch": 0.026755852842809364,
      "grad_norm": 7.3767266273498535,
      "learning_rate": 2.6666666666666664e-06,
      "loss": 2.1927,
      "step": 2
    },
    {
      "epoch": 0.04013377926421405,
      "grad_norm": 7.829778671264648,
      "learning_rate": 4e-06,
      "loss": 2.3279,
      "step": 3
    },
    {
      "epoch": 0.05351170568561873,
      "grad_norm": 2.793825626373291,
      "learning_rate": 5.333333333333333e-06,
      "loss": 1.9777,
      "step": 4
    },
    {
      "epoch": 0.06688963210702341,
      "grad_norm": 1.4661837816238403,
      "learning_rate": 6.666666666666667e-06,
      "loss": 1.8485,
      "step": 5
    },
    {
      "epoch": 0.0802675585284281,
      "grad_norm": 1.2292248010635376,
      "learning_rate": 8e-06,
      "loss": 1.9523,
      "step": 6
    },
    {
      "epoch": 0.09364548494983277,
      "grad_norm": 1.240803599357605,
      "learning_rate": 7.99957692770843e-06,
      "loss": 1.9104,
      "step": 7
    },
    {
      "epoch": 0.10702341137123746,
      "grad_norm": 0.8672861456871033,
      "learning_rate": 7.998307800328803e-06,
      "loss": 1.9006,
      "step": 8
    },
    {
      "epoch": 0.12040133779264214,
      "grad_norm": 0.7724849581718445,
      "learning_rate": 7.996192886327432e-06,
      "loss": 1.8721,
      "step": 9
    },
    {
      "epoch": 0.13377926421404682,
      "grad_norm": 0.6626549959182739,
      "learning_rate": 7.993232633085074e-06,
      "loss": 1.8403,
      "step": 10
    },
    {
      "epoch": 0.14715719063545152,
      "grad_norm": 0.7850075364112854,
      "learning_rate": 7.989427666802289e-06,
      "loss": 1.8972,
      "step": 11
    },
    {
      "epoch": 0.1605351170568562,
      "grad_norm": 0.6829317808151245,
      "learning_rate": 7.984778792366982e-06,
      "loss": 1.815,
      "step": 12
    },
    {
      "epoch": 0.17391304347826086,
      "grad_norm": 0.5757900476455688,
      "learning_rate": 7.979286993184132e-06,
      "loss": 1.7474,
      "step": 13
    },
    {
      "epoch": 0.18729096989966554,
      "grad_norm": 0.5840671062469482,
      "learning_rate": 7.972953430967771e-06,
      "loss": 1.872,
      "step": 14
    },
    {
      "epoch": 0.20066889632107024,
      "grad_norm": 0.6052594780921936,
      "learning_rate": 7.965779445495242e-06,
      "loss": 1.7793,
      "step": 15
    },
    {
      "epoch": 0.2140468227424749,
      "grad_norm": 0.5719857215881348,
      "learning_rate": 7.957766554323777e-06,
      "loss": 1.8001,
      "step": 16
    },
    {
      "epoch": 0.22742474916387959,
      "grad_norm": 0.6494969129562378,
      "learning_rate": 7.948916452469496e-06,
      "loss": 1.8784,
      "step": 17
    },
    {
      "epoch": 0.2408026755852843,
      "grad_norm": 0.5779961347579956,
      "learning_rate": 7.939231012048832e-06,
      "loss": 1.8493,
      "step": 18
    },
    {
      "epoch": 0.25418060200668896,
      "grad_norm": 0.519511342048645,
      "learning_rate": 7.928712281882523e-06,
      "loss": 1.8679,
      "step": 19
    },
    {
      "epoch": 0.26755852842809363,
      "grad_norm": 0.6307725310325623,
      "learning_rate": 7.917362487062206e-06,
      "loss": 1.8664,
      "step": 20
    },
    {
      "epoch": 0.2809364548494983,
      "grad_norm": 0.521139919757843,
      "learning_rate": 7.905184028479733e-06,
      "loss": 1.7756,
      "step": 21
    },
    {
      "epoch": 0.29431438127090304,
      "grad_norm": 0.5131444931030273,
      "learning_rate": 7.892179482319294e-06,
      "loss": 1.6563,
      "step": 22
    },
    {
      "epoch": 0.3076923076923077,
      "grad_norm": 0.563713014125824,
      "learning_rate": 7.878351599512464e-06,
      "loss": 1.852,
      "step": 23
    },
    {
      "epoch": 0.3210702341137124,
      "grad_norm": 0.5473136901855469,
      "learning_rate": 7.863703305156273e-06,
      "loss": 1.8271,
      "step": 24
    },
    {
      "epoch": 0.33444816053511706,
      "grad_norm": 0.49893084168434143,
      "learning_rate": 7.848237697894452e-06,
      "loss": 1.7639,
      "step": 25
    },
    {
      "epoch": 0.34782608695652173,
      "grad_norm": 0.47465410828590393,
      "learning_rate": 7.831958049261955e-06,
      "loss": 1.8612,
      "step": 26
    },
    {
      "epoch": 0.3612040133779264,
      "grad_norm": 0.5295856595039368,
      "learning_rate": 7.814867802992907e-06,
      "loss": 1.819,
      "step": 27
    },
    {
      "epoch": 0.3745819397993311,
      "grad_norm": 0.4863497316837311,
      "learning_rate": 7.796970574292136e-06,
      "loss": 1.7617,
      "step": 28
    },
    {
      "epoch": 0.3879598662207358,
      "grad_norm": 0.5433112978935242,
      "learning_rate": 7.778270149070419e-06,
      "loss": 1.7289,
      "step": 29
    },
    {
      "epoch": 0.4013377926421405,
      "grad_norm": 0.5282914638519287,
      "learning_rate": 7.758770483143633e-06,
      "loss": 1.8131,
      "step": 30
    },
    {
      "epoch": 0.41471571906354515,
      "grad_norm": 0.5243386030197144,
      "learning_rate": 7.738475701395954e-06,
      "loss": 1.8339,
      "step": 31
    },
    {
      "epoch": 0.4280936454849498,
      "grad_norm": 0.49295875430107117,
      "learning_rate": 7.717390096907289e-06,
      "loss": 1.8133,
      "step": 32
    },
    {
      "epoch": 0.4414715719063545,
      "grad_norm": 0.5231158137321472,
      "learning_rate": 7.695518130045147e-06,
      "loss": 1.8031,
      "step": 33
    },
    {
      "epoch": 0.45484949832775917,
      "grad_norm": 0.5049412250518799,
      "learning_rate": 7.672864427521097e-06,
      "loss": 1.7918,
      "step": 34
    },
    {
      "epoch": 0.4682274247491639,
      "grad_norm": 0.5039061903953552,
      "learning_rate": 7.649433781412057e-06,
      "loss": 1.741,
      "step": 35
    },
    {
      "epoch": 0.4816053511705686,
      "grad_norm": 0.5041850805282593,
      "learning_rate": 7.6252311481465996e-06,
      "loss": 1.7254,
      "step": 36
    },
    {
      "epoch": 0.49498327759197325,
      "grad_norm": 0.4633885622024536,
      "learning_rate": 7.600261647456484e-06,
      "loss": 1.8132,
      "step": 37
    },
    {
      "epoch": 0.49498327759197325,
      "eval_loss": 0.667718768119812,
      "eval_runtime": 13.4399,
      "eval_samples_per_second": 90.626,
      "eval_steps_per_second": 5.729,
      "step": 37
    },
    {
      "epoch": 0.5083612040133779,
      "grad_norm": 0.5181726217269897,
      "learning_rate": 7.574530561293649e-06,
      "loss": 1.882,
      "step": 38
    },
    {
      "epoch": 0.5217391304347826,
      "grad_norm": 0.5037977695465088,
      "learning_rate": 7.548043332712886e-06,
      "loss": 1.8253,
      "step": 39
    },
    {
      "epoch": 0.5351170568561873,
      "grad_norm": 0.4691613018512726,
      "learning_rate": 7.520805564720443e-06,
      "loss": 1.7016,
      "step": 40
    },
    {
      "epoch": 0.5484949832775919,
      "grad_norm": 0.4761461019515991,
      "learning_rate": 7.492823019088783e-06,
      "loss": 1.8041,
      "step": 41
    },
    {
      "epoch": 0.5618729096989966,
      "grad_norm": 0.4626379907131195,
      "learning_rate": 7.4641016151377545e-06,
      "loss": 1.7852,
      "step": 42
    },
    {
      "epoch": 0.5752508361204013,
      "grad_norm": 0.49921584129333496,
      "learning_rate": 7.434647428482453e-06,
      "loss": 1.7104,
      "step": 43
    },
    {
      "epoch": 0.5886287625418061,
      "grad_norm": 0.49447470903396606,
      "learning_rate": 7.4044666897479985e-06,
      "loss": 1.7973,
      "step": 44
    },
    {
      "epoch": 0.6020066889632107,
      "grad_norm": 0.4844650328159332,
      "learning_rate": 7.373565783251543e-06,
      "loss": 1.7678,
      "step": 45
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 0.49107274413108826,
      "learning_rate": 7.3419512456517455e-06,
      "loss": 1.718,
      "step": 46
    },
    {
      "epoch": 0.6287625418060201,
      "grad_norm": 0.49630844593048096,
      "learning_rate": 7.309629764566041e-06,
      "loss": 1.802,
      "step": 47
    },
    {
      "epoch": 0.6421404682274248,
      "grad_norm": 0.47247716784477234,
      "learning_rate": 7.276608177155967e-06,
      "loss": 1.7803,
      "step": 48
    },
    {
      "epoch": 0.6555183946488294,
      "grad_norm": 1.2681446075439453,
      "learning_rate": 7.242893468680849e-06,
      "loss": 1.768,
      "step": 49
    },
    {
      "epoch": 0.6688963210702341,
      "grad_norm": 0.5203042030334473,
      "learning_rate": 7.208492771020175e-06,
      "loss": 1.8885,
      "step": 50
    },
    {
      "epoch": 0.6822742474916388,
      "grad_norm": 0.5055291056632996,
      "learning_rate": 7.1734133611649405e-06,
      "loss": 1.812,
      "step": 51
    },
    {
      "epoch": 0.6956521739130435,
      "grad_norm": 0.5043210387229919,
      "learning_rate": 7.137662659678303e-06,
      "loss": 1.8291,
      "step": 52
    },
    {
      "epoch": 0.7090301003344481,
      "grad_norm": 0.5000115633010864,
      "learning_rate": 7.1012482291258626e-06,
      "loss": 1.7115,
      "step": 53
    },
    {
      "epoch": 0.7224080267558528,
      "grad_norm": 0.5015853643417358,
      "learning_rate": 7.064177772475912e-06,
      "loss": 1.8441,
      "step": 54
    },
    {
      "epoch": 0.7357859531772575,
      "grad_norm": 0.5204277038574219,
      "learning_rate": 7.026459131469972e-06,
      "loss": 1.8268,
      "step": 55
    },
    {
      "epoch": 0.7491638795986622,
      "grad_norm": 0.5002334117889404,
      "learning_rate": 6.9881002849639835e-06,
      "loss": 1.7633,
      "step": 56
    },
    {
      "epoch": 0.7625418060200669,
      "grad_norm": 0.47437921166419983,
      "learning_rate": 6.949109347240496e-06,
      "loss": 1.7573,
      "step": 57
    },
    {
      "epoch": 0.7759197324414716,
      "grad_norm": 0.46609270572662354,
      "learning_rate": 6.909494566292195e-06,
      "loss": 1.7671,
      "step": 58
    },
    {
      "epoch": 0.7892976588628763,
      "grad_norm": 0.464740514755249,
      "learning_rate": 6.869264322077157e-06,
      "loss": 1.735,
      "step": 59
    },
    {
      "epoch": 0.802675585284281,
      "grad_norm": 0.44125059247016907,
      "learning_rate": 6.82842712474619e-06,
      "loss": 1.6895,
      "step": 60
    },
    {
      "epoch": 0.8160535117056856,
      "grad_norm": 0.4398334324359894,
      "learning_rate": 6.786991612842619e-06,
      "loss": 1.6622,
      "step": 61
    },
    {
      "epoch": 0.8294314381270903,
      "grad_norm": 0.5073325037956238,
      "learning_rate": 6.744966551474935e-06,
      "loss": 1.7893,
      "step": 62
    },
    {
      "epoch": 0.842809364548495,
      "grad_norm": 0.4685400128364563,
      "learning_rate": 6.702360830462641e-06,
      "loss": 1.7377,
      "step": 63
    },
    {
      "epoch": 0.8561872909698997,
      "grad_norm": 0.5076963305473328,
      "learning_rate": 6.65918346245575e-06,
      "loss": 1.8125,
      "step": 64
    },
    {
      "epoch": 0.8695652173913043,
      "grad_norm": 0.510735273361206,
      "learning_rate": 6.615443581028279e-06,
      "loss": 1.8576,
      "step": 65
    },
    {
      "epoch": 0.882943143812709,
      "grad_norm": 0.489524781703949,
      "learning_rate": 6.571150438746157e-06,
      "loss": 1.6699,
      "step": 66
    },
    {
      "epoch": 0.8963210702341137,
      "grad_norm": 0.4763016402721405,
      "learning_rate": 6.5263134052099895e-06,
      "loss": 1.7561,
      "step": 67
    },
    {
      "epoch": 0.9096989966555183,
      "grad_norm": 0.4900319278240204,
      "learning_rate": 6.480941965073039e-06,
      "loss": 1.7364,
      "step": 68
    },
    {
      "epoch": 0.9230769230769231,
      "grad_norm": 0.4876985549926758,
      "learning_rate": 6.435045716034882e-06,
      "loss": 1.7202,
      "step": 69
    },
    {
      "epoch": 0.9364548494983278,
      "grad_norm": 0.5038361549377441,
      "learning_rate": 6.388634366811145e-06,
      "loss": 1.7732,
      "step": 70
    },
    {
      "epoch": 0.9498327759197325,
      "grad_norm": 0.5064495801925659,
      "learning_rate": 6.341717735079762e-06,
      "loss": 1.7221,
      "step": 71
    },
    {
      "epoch": 0.9632107023411371,
      "grad_norm": 0.48319771885871887,
      "learning_rate": 6.294305745404184e-06,
      "loss": 1.7169,
      "step": 72
    },
    {
      "epoch": 0.9765886287625418,
      "grad_norm": 0.4659038782119751,
      "learning_rate": 6.246408427133971e-06,
      "loss": 1.7655,
      "step": 73
    },
    {
      "epoch": 0.9899665551839465,
      "grad_norm": 0.5067590475082397,
      "learning_rate": 6.198035912283224e-06,
      "loss": 1.7013,
      "step": 74
    },
    {
      "epoch": 0.9899665551839465,
      "eval_loss": 0.6622208952903748,
      "eval_runtime": 13.4118,
      "eval_samples_per_second": 90.815,
      "eval_steps_per_second": 5.741,
      "step": 74
    },
    {
      "epoch": 1.0100334448160535,
      "grad_norm": 1.075007438659668,
      "learning_rate": 6.149198433387296e-06,
      "loss": 2.8527,
      "step": 75
    },
    {
      "epoch": 1.0234113712374582,
      "grad_norm": 0.5882135629653931,
      "learning_rate": 6.09990632133824e-06,
      "loss": 1.2937,
      "step": 76
    },
    {
      "epoch": 1.0367892976588629,
      "grad_norm": 0.5747379064559937,
      "learning_rate": 6.050170003199461e-06,
      "loss": 1.3659,
      "step": 77
    },
    {
      "epoch": 1.0501672240802675,
      "grad_norm": 0.5323824286460876,
      "learning_rate": 6e-06,
      "loss": 1.4013,
      "step": 78
    },
    {
      "epoch": 1.0635451505016722,
      "grad_norm": 0.5771108865737915,
      "learning_rate": 5.94940692450897e-06,
      "loss": 1.4312,
      "step": 79
    },
    {
      "epoch": 1.0769230769230769,
      "grad_norm": 0.6475838422775269,
      "learning_rate": 5.898401478990562e-06,
      "loss": 1.5826,
      "step": 80
    },
    {
      "epoch": 1.0903010033444815,
      "grad_norm": 0.592047929763794,
      "learning_rate": 5.846994452940136e-06,
      "loss": 1.4368,
      "step": 81
    },
    {
      "epoch": 1.1036789297658862,
      "grad_norm": 0.7535114884376526,
      "learning_rate": 5.795196720801849e-06,
      "loss": 1.5116,
      "step": 82
    },
    {
      "epoch": 1.117056856187291,
      "grad_norm": 0.586352527141571,
      "learning_rate": 5.743019239668317e-06,
      "loss": 1.4519,
      "step": 83
    },
    {
      "epoch": 1.1304347826086956,
      "grad_norm": 0.617953360080719,
      "learning_rate": 5.690473046962798e-06,
      "loss": 1.516,
      "step": 84
    },
    {
      "epoch": 1.1438127090301002,
      "grad_norm": 0.5882145166397095,
      "learning_rate": 5.63756925810437e-06,
      "loss": 1.3272,
      "step": 85
    },
    {
      "epoch": 1.1571906354515051,
      "grad_norm": 0.5454166531562805,
      "learning_rate": 5.584319064156627e-06,
      "loss": 1.4443,
      "step": 86
    },
    {
      "epoch": 1.1705685618729098,
      "grad_norm": 0.5162122249603271,
      "learning_rate": 5.530733729460359e-06,
      "loss": 1.409,
      "step": 87
    },
    {
      "epoch": 1.1839464882943145,
      "grad_norm": 0.5802710056304932,
      "learning_rate": 5.476824589250738e-06,
      "loss": 1.4925,
      "step": 88
    },
    {
      "epoch": 1.1973244147157192,
      "grad_norm": 0.5536505579948425,
      "learning_rate": 5.4226030472595064e-06,
      "loss": 1.3556,
      "step": 89
    },
    {
      "epoch": 1.2107023411371238,
      "grad_norm": 0.5804270505905151,
      "learning_rate": 5.368080573302675e-06,
      "loss": 1.4867,
      "step": 90
    },
    {
      "epoch": 1.2240802675585285,
      "grad_norm": 0.5988459587097168,
      "learning_rate": 5.3132687008542446e-06,
      "loss": 1.5466,
      "step": 91
    },
    {
      "epoch": 1.2374581939799332,
      "grad_norm": 0.5100234150886536,
      "learning_rate": 5.2581790246064545e-06,
      "loss": 1.3042,
      "step": 92
    },
    {
      "epoch": 1.2508361204013378,
      "grad_norm": 0.547820508480072,
      "learning_rate": 5.2028231980170915e-06,
      "loss": 1.4344,
      "step": 93
    },
    {
      "epoch": 1.2642140468227425,
      "grad_norm": 0.5059947967529297,
      "learning_rate": 5.147212930844361e-06,
      "loss": 1.3313,
      "step": 94
    },
    {
      "epoch": 1.2775919732441472,
      "grad_norm": 0.5130128860473633,
      "learning_rate": 5.091359986669844e-06,
      "loss": 1.3825,
      "step": 95
    },
    {
      "epoch": 1.2909698996655519,
      "grad_norm": 0.5295710563659668,
      "learning_rate": 5.035276180410083e-06,
      "loss": 1.3594,
      "step": 96
    },
    {
      "epoch": 1.3043478260869565,
      "grad_norm": 0.5060495734214783,
      "learning_rate": 4.978973375817295e-06,
      "loss": 1.3036,
      "step": 97
    },
    {
      "epoch": 1.3177257525083612,
      "grad_norm": 0.5988255143165588,
      "learning_rate": 4.922463482969761e-06,
      "loss": 1.5651,
      "step": 98
    },
    {
      "epoch": 1.3311036789297659,
      "grad_norm": 0.4946533739566803,
      "learning_rate": 4.8657584557524116e-06,
      "loss": 1.333,
      "step": 99
    },
    {
      "epoch": 1.3444816053511706,
      "grad_norm": 0.5674816966056824,
      "learning_rate": 4.808870289328152e-06,
      "loss": 1.4971,
      "step": 100
    },
    {
      "epoch": 1.3578595317725752,
      "grad_norm": 0.5148370265960693,
      "learning_rate": 4.751811017600447e-06,
      "loss": 1.4789,
      "step": 101
    },
    {
      "epoch": 1.37123745819398,
      "grad_norm": 0.5132340788841248,
      "learning_rate": 4.694592710667722e-06,
      "loss": 1.3163,
      "step": 102
    },
    {
      "epoch": 1.3846153846153846,
      "grad_norm": 0.6035396456718445,
      "learning_rate": 4.637227472270091e-06,
      "loss": 1.3789,
      "step": 103
    },
    {
      "epoch": 1.3979933110367893,
      "grad_norm": 0.550737738609314,
      "learning_rate": 4.579727437228986e-06,
      "loss": 1.4218,
      "step": 104
    },
    {
      "epoch": 1.411371237458194,
      "grad_norm": 0.4987228810787201,
      "learning_rate": 4.522104768880207e-06,
      "loss": 1.264,
      "step": 105
    },
    {
      "epoch": 1.4247491638795986,
      "grad_norm": 0.5271551609039307,
      "learning_rate": 4.4643716565009205e-06,
      "loss": 1.4445,
      "step": 106
    },
    {
      "epoch": 1.4381270903010033,
      "grad_norm": 0.5551120638847351,
      "learning_rate": 4.406540312731208e-06,
      "loss": 1.5199,
      "step": 107
    },
    {
      "epoch": 1.451505016722408,
      "grad_norm": 0.5053355097770691,
      "learning_rate": 4.348622970990633e-06,
      "loss": 1.3389,
      "step": 108
    },
    {
      "epoch": 1.4648829431438126,
      "grad_norm": 0.5177690386772156,
      "learning_rate": 4.290631882890443e-06,
      "loss": 1.4396,
      "step": 109
    },
    {
      "epoch": 1.4782608695652173,
      "grad_norm": 0.522657573223114,
      "learning_rate": 4.232579315641903e-06,
      "loss": 1.5001,
      "step": 110
    },
    {
      "epoch": 1.491638795986622,
      "grad_norm": 0.5218788981437683,
      "learning_rate": 4.174477549461344e-06,
      "loss": 1.3964,
      "step": 111
    },
    {
      "epoch": 1.491638795986622,
      "eval_loss": 0.6926424503326416,
      "eval_runtime": 13.4107,
      "eval_samples_per_second": 90.823,
      "eval_steps_per_second": 5.742,
      "step": 111
    },
    {
      "epoch": 1.5050167224080266,
      "grad_norm": 0.513609766960144,
      "learning_rate": 4.1163388749724456e-06,
      "loss": 1.3159,
      "step": 112
    },
    {
      "epoch": 1.5183946488294313,
      "grad_norm": 0.5356954336166382,
      "learning_rate": 4.058175590606331e-06,
      "loss": 1.4034,
      "step": 113
    },
    {
      "epoch": 1.531772575250836,
      "grad_norm": 0.5953348278999329,
      "learning_rate": 4e-06,
      "loss": 1.4772,
      "step": 114
    },
    {
      "epoch": 1.5451505016722407,
      "grad_norm": 0.5957881808280945,
      "learning_rate": 3.941824409393669e-06,
      "loss": 1.5237,
      "step": 115
    },
    {
      "epoch": 1.5585284280936453,
      "grad_norm": 0.4759249687194824,
      "learning_rate": 3.883661125027554e-06,
      "loss": 1.27,
      "step": 116
    },
    {
      "epoch": 1.57190635451505,
      "grad_norm": 0.5109943747520447,
      "learning_rate": 3.825522450538656e-06,
      "loss": 1.4649,
      "step": 117
    },
    {
      "epoch": 1.585284280936455,
      "grad_norm": 0.477067768573761,
      "learning_rate": 3.7674206843580965e-06,
      "loss": 1.3081,
      "step": 118
    },
    {
      "epoch": 1.5986622073578596,
      "grad_norm": 0.505376398563385,
      "learning_rate": 3.7093681171095572e-06,
      "loss": 1.3395,
      "step": 119
    },
    {
      "epoch": 1.6120401337792643,
      "grad_norm": 0.5487738251686096,
      "learning_rate": 3.651377029009367e-06,
      "loss": 1.529,
      "step": 120
    },
    {
      "epoch": 1.625418060200669,
      "grad_norm": 0.5177662968635559,
      "learning_rate": 3.5934596872687923e-06,
      "loss": 1.291,
      "step": 121
    },
    {
      "epoch": 1.6387959866220736,
      "grad_norm": 0.5494332909584045,
      "learning_rate": 3.5356283434990783e-06,
      "loss": 1.4541,
      "step": 122
    },
    {
      "epoch": 1.6521739130434783,
      "grad_norm": 0.526679277420044,
      "learning_rate": 3.4778952311197945e-06,
      "loss": 1.3409,
      "step": 123
    },
    {
      "epoch": 1.665551839464883,
      "grad_norm": 0.5405285954475403,
      "learning_rate": 3.4202725627710133e-06,
      "loss": 1.4129,
      "step": 124
    },
    {
      "epoch": 1.6789297658862876,
      "grad_norm": 0.5305699110031128,
      "learning_rate": 3.36277252772991e-06,
      "loss": 1.458,
      "step": 125
    },
    {
      "epoch": 1.6923076923076923,
      "grad_norm": 0.5097222924232483,
      "learning_rate": 3.3054072893322785e-06,
      "loss": 1.3555,
      "step": 126
    },
    {
      "epoch": 1.705685618729097,
      "grad_norm": 0.5460465550422668,
      "learning_rate": 3.2481889823995524e-06,
      "loss": 1.3272,
      "step": 127
    },
    {
      "epoch": 1.7190635451505016,
      "grad_norm": 0.5369409918785095,
      "learning_rate": 3.191129710671849e-06,
      "loss": 1.2991,
      "step": 128
    },
    {
      "epoch": 1.7324414715719063,
      "grad_norm": 0.4934872090816498,
      "learning_rate": 3.1342415442475885e-06,
      "loss": 1.319,
      "step": 129
    },
    {
      "epoch": 1.745819397993311,
      "grad_norm": 0.5528122186660767,
      "learning_rate": 3.077536517030239e-06,
      "loss": 1.4731,
      "step": 130
    },
    {
      "epoch": 1.7591973244147159,
      "grad_norm": 0.5233715176582336,
      "learning_rate": 3.0210266241827046e-06,
      "loss": 1.4089,
      "step": 131
    },
    {
      "epoch": 1.7725752508361206,
      "grad_norm": 0.5456512570381165,
      "learning_rate": 2.9647238195899164e-06,
      "loss": 1.4056,
      "step": 132
    },
    {
      "epoch": 1.7859531772575252,
      "grad_norm": 0.5461183190345764,
      "learning_rate": 2.908640013330157e-06,
      "loss": 1.4384,
      "step": 133
    },
    {
      "epoch": 1.79933110367893,
      "grad_norm": 0.5198376178741455,
      "learning_rate": 2.85278706915564e-06,
      "loss": 1.48,
      "step": 134
    },
    {
      "epoch": 1.8127090301003346,
      "grad_norm": 0.5073018670082092,
      "learning_rate": 2.7971768019829077e-06,
      "loss": 1.3335,
      "step": 135
    },
    {
      "epoch": 1.8260869565217392,
      "grad_norm": 0.5290614366531372,
      "learning_rate": 2.741820975393546e-06,
      "loss": 1.4239,
      "step": 136
    },
    {
      "epoch": 1.839464882943144,
      "grad_norm": 0.5525597333908081,
      "learning_rate": 2.686731299145756e-06,
      "loss": 1.4017,
      "step": 137
    },
    {
      "epoch": 1.8528428093645486,
      "grad_norm": 0.5554612874984741,
      "learning_rate": 2.631919426697325e-06,
      "loss": 1.5295,
      "step": 138
    },
    {
      "epoch": 1.8662207357859533,
      "grad_norm": 0.5276882648468018,
      "learning_rate": 2.5773969527404945e-06,
      "loss": 1.3571,
      "step": 139
    },
    {
      "epoch": 1.879598662207358,
      "grad_norm": 0.506417453289032,
      "learning_rate": 2.5231754107492627e-06,
      "loss": 1.3666,
      "step": 140
    },
    {
      "epoch": 1.8929765886287626,
      "grad_norm": 0.535830557346344,
      "learning_rate": 2.469266270539641e-06,
      "loss": 1.5119,
      "step": 141
    },
    {
      "epoch": 1.9063545150501673,
      "grad_norm": 0.5409619808197021,
      "learning_rate": 2.4156809358433725e-06,
      "loss": 1.4349,
      "step": 142
    },
    {
      "epoch": 1.919732441471572,
      "grad_norm": 0.5141175985336304,
      "learning_rate": 2.3624307418956294e-06,
      "loss": 1.3672,
      "step": 143
    },
    {
      "epoch": 1.9331103678929766,
      "grad_norm": 0.5471431612968445,
      "learning_rate": 2.309526953037203e-06,
      "loss": 1.3575,
      "step": 144
    },
    {
      "epoch": 1.9464882943143813,
      "grad_norm": 0.5435221195220947,
      "learning_rate": 2.256980760331683e-06,
      "loss": 1.4398,
      "step": 145
    },
    {
      "epoch": 1.959866220735786,
      "grad_norm": 0.5480269193649292,
      "learning_rate": 2.2048032791981513e-06,
      "loss": 1.279,
      "step": 146
    },
    {
      "epoch": 1.9732441471571907,
      "grad_norm": 0.5423163175582886,
      "learning_rate": 2.153005547059865e-06,
      "loss": 1.4763,
      "step": 147
    },
    {
      "epoch": 1.9866220735785953,
      "grad_norm": 0.5127543807029724,
      "learning_rate": 2.1015985210094384e-06,
      "loss": 1.3808,
      "step": 148
    },
    {
      "epoch": 1.9866220735785953,
      "eval_loss": 0.7046768069267273,
      "eval_runtime": 13.4302,
      "eval_samples_per_second": 90.691,
      "eval_steps_per_second": 5.733,
      "step": 148
    },
    {
      "epoch": 2.0066889632107023,
      "grad_norm": 1.2160006761550903,
      "learning_rate": 2.050593075491031e-06,
      "loss": 2.3417,
      "step": 149
    },
    {
      "epoch": 2.020066889632107,
      "grad_norm": 0.7894352674484253,
      "learning_rate": 2.0000000000000008e-06,
      "loss": 1.1381,
      "step": 150
    },
    {
      "epoch": 2.0334448160535117,
      "grad_norm": 0.790090799331665,
      "learning_rate": 1.9498299968005392e-06,
      "loss": 1.2723,
      "step": 151
    },
    {
      "epoch": 2.0468227424749164,
      "grad_norm": 0.6597563624382019,
      "learning_rate": 1.9000936786617597e-06,
      "loss": 1.1166,
      "step": 152
    },
    {
      "epoch": 2.060200668896321,
      "grad_norm": 0.6646838784217834,
      "learning_rate": 1.850801566612704e-06,
      "loss": 1.2064,
      "step": 153
    },
    {
      "epoch": 2.0735785953177257,
      "grad_norm": 0.616726815700531,
      "learning_rate": 1.801964087716776e-06,
      "loss": 1.1856,
      "step": 154
    },
    {
      "epoch": 2.0869565217391304,
      "grad_norm": 0.6225292682647705,
      "learning_rate": 1.7535915728660289e-06,
      "loss": 1.0631,
      "step": 155
    },
    {
      "epoch": 2.100334448160535,
      "grad_norm": 0.9003037214279175,
      "learning_rate": 1.7056942545958167e-06,
      "loss": 1.1332,
      "step": 156
    },
    {
      "epoch": 2.1137123745819397,
      "grad_norm": 1.1400443315505981,
      "learning_rate": 1.6582822649202379e-06,
      "loss": 1.0256,
      "step": 157
    },
    {
      "epoch": 2.1270903010033444,
      "grad_norm": 1.0252878665924072,
      "learning_rate": 1.611365633188856e-06,
      "loss": 1.0825,
      "step": 158
    },
    {
      "epoch": 2.140468227424749,
      "grad_norm": 0.785536527633667,
      "learning_rate": 1.5649542839651173e-06,
      "loss": 1.1539,
      "step": 159
    },
    {
      "epoch": 2.1538461538461537,
      "grad_norm": 0.6228716373443604,
      "learning_rate": 1.5190580349269603e-06,
      "loss": 1.1693,
      "step": 160
    },
    {
      "epoch": 2.1672240802675584,
      "grad_norm": 0.605522096157074,
      "learning_rate": 1.4736865947900103e-06,
      "loss": 1.0761,
      "step": 161
    },
    {
      "epoch": 2.180602006688963,
      "grad_norm": 0.6409484148025513,
      "learning_rate": 1.4288495612538425e-06,
      "loss": 1.1637,
      "step": 162
    },
    {
      "epoch": 2.1939799331103678,
      "grad_norm": 0.6087141036987305,
      "learning_rate": 1.3845564189717216e-06,
      "loss": 1.0937,
      "step": 163
    },
    {
      "epoch": 2.2073578595317724,
      "grad_norm": 0.7026439309120178,
      "learning_rate": 1.3408165375442484e-06,
      "loss": 1.2132,
      "step": 164
    },
    {
      "epoch": 2.220735785953177,
      "grad_norm": 0.6594187617301941,
      "learning_rate": 1.297639169537359e-06,
      "loss": 1.07,
      "step": 165
    },
    {
      "epoch": 2.234113712374582,
      "grad_norm": 0.6606442928314209,
      "learning_rate": 1.255033448525066e-06,
      "loss": 1.1694,
      "step": 166
    },
    {
      "epoch": 2.2474916387959865,
      "grad_norm": 0.6164308786392212,
      "learning_rate": 1.2130083871573812e-06,
      "loss": 1.0824,
      "step": 167
    },
    {
      "epoch": 2.260869565217391,
      "grad_norm": 0.5823544263839722,
      "learning_rate": 1.1715728752538101e-06,
      "loss": 1.1106,
      "step": 168
    },
    {
      "epoch": 2.274247491638796,
      "grad_norm": 0.5872677564620972,
      "learning_rate": 1.130735677922842e-06,
      "loss": 1.1056,
      "step": 169
    },
    {
      "epoch": 2.2876254180602005,
      "grad_norm": 0.6060868501663208,
      "learning_rate": 1.090505433707805e-06,
      "loss": 1.0993,
      "step": 170
    },
    {
      "epoch": 2.3010033444816056,
      "grad_norm": 0.7028762698173523,
      "learning_rate": 1.050890652759504e-06,
      "loss": 1.3655,
      "step": 171
    },
    {
      "epoch": 2.3143812709030103,
      "grad_norm": 0.6295290589332581,
      "learning_rate": 1.0118997150360166e-06,
      "loss": 1.0931,
      "step": 172
    },
    {
      "epoch": 2.327759197324415,
      "grad_norm": 0.6583987474441528,
      "learning_rate": 9.735408685300286e-07,
      "loss": 1.1103,
      "step": 173
    },
    {
      "epoch": 2.3411371237458196,
      "grad_norm": 0.7007333040237427,
      "learning_rate": 9.358222275240884e-07,
      "loss": 1.1794,
      "step": 174
    },
    {
      "epoch": 2.3545150501672243,
      "grad_norm": 0.6823887825012207,
      "learning_rate": 8.987517708741363e-07,
      "loss": 1.1575,
      "step": 175
    },
    {
      "epoch": 2.367892976588629,
      "grad_norm": 0.680305540561676,
      "learning_rate": 8.623373403216971e-07,
      "loss": 1.096,
      "step": 176
    },
    {
      "epoch": 2.3812709030100336,
      "grad_norm": 0.6234930157661438,
      "learning_rate": 8.265866388350598e-07,
      "loss": 1.0486,
      "step": 177
    },
    {
      "epoch": 2.3946488294314383,
      "grad_norm": 0.6473740339279175,
      "learning_rate": 7.915072289798246e-07,
      "loss": 1.1637,
      "step": 178
    },
    {
      "epoch": 2.408026755852843,
      "grad_norm": 0.6634021997451782,
      "learning_rate": 7.571065313191511e-07,
      "loss": 1.1053,
      "step": 179
    },
    {
      "epoch": 2.4214046822742477,
      "grad_norm": 0.6495158672332764,
      "learning_rate": 7.233918228440323e-07,
      "loss": 1.0907,
      "step": 180
    },
    {
      "epoch": 2.4347826086956523,
      "grad_norm": 0.6720609664916992,
      "learning_rate": 6.903702354339578e-07,
      "loss": 1.1751,
      "step": 181
    },
    {
      "epoch": 2.448160535117057,
      "grad_norm": 0.6688068509101868,
      "learning_rate": 6.580487543482549e-07,
      "loss": 1.1408,
      "step": 182
    },
    {
      "epoch": 2.4615384615384617,
      "grad_norm": 0.6397896409034729,
      "learning_rate": 6.26434216748458e-07,
      "loss": 1.2012,
      "step": 183
    },
    {
      "epoch": 2.4749163879598663,
      "grad_norm": 0.6841992735862732,
      "learning_rate": 5.955333102520011e-07,
      "loss": 1.2623,
      "step": 184
    },
    {
      "epoch": 2.488294314381271,
      "grad_norm": 0.6013959050178528,
      "learning_rate": 5.653525715175483e-07,
      "loss": 1.0792,
      "step": 185
    },
    {
      "epoch": 2.488294314381271,
      "eval_loss": 0.8052845597267151,
      "eval_runtime": 13.448,
      "eval_samples_per_second": 90.571,
      "eval_steps_per_second": 5.726,
      "step": 185
    },
    {
      "epoch": 2.5016722408026757,
      "grad_norm": 0.595483660697937,
      "learning_rate": 5.358983848622451e-07,
      "loss": 1.1536,
      "step": 186
    },
    {
      "epoch": 2.5150501672240804,
      "grad_norm": 0.6301653981208801,
      "learning_rate": 5.07176980911217e-07,
      "loss": 1.1543,
      "step": 187
    },
    {
      "epoch": 2.528428093645485,
      "grad_norm": 0.6083581447601318,
      "learning_rate": 4.791944352795561e-07,
      "loss": 1.131,
      "step": 188
    },
    {
      "epoch": 2.5418060200668897,
      "grad_norm": 0.6019948720932007,
      "learning_rate": 4.519566672871131e-07,
      "loss": 1.1022,
      "step": 189
    },
    {
      "epoch": 2.5551839464882944,
      "grad_norm": 0.5989395976066589,
      "learning_rate": 4.2546943870635135e-07,
      "loss": 1.1402,
      "step": 190
    },
    {
      "epoch": 2.568561872909699,
      "grad_norm": 0.575457751750946,
      "learning_rate": 3.997383525435154e-07,
      "loss": 1.0687,
      "step": 191
    },
    {
      "epoch": 2.5819397993311037,
      "grad_norm": 0.6226676106452942,
      "learning_rate": 3.7476885185340023e-07,
      "loss": 1.158,
      "step": 192
    },
    {
      "epoch": 2.5953177257525084,
      "grad_norm": 0.6265813112258911,
      "learning_rate": 3.5056621858794387e-07,
      "loss": 1.1689,
      "step": 193
    },
    {
      "epoch": 2.608695652173913,
      "grad_norm": 0.6149746775627136,
      "learning_rate": 3.2713557247890447e-07,
      "loss": 1.1482,
      "step": 194
    },
    {
      "epoch": 2.6220735785953178,
      "grad_norm": 0.555928647518158,
      "learning_rate": 3.0448186995485303e-07,
      "loss": 0.9814,
      "step": 195
    },
    {
      "epoch": 2.6354515050167224,
      "grad_norm": 0.6666916608810425,
      "learning_rate": 2.826099030927098e-07,
      "loss": 1.2773,
      "step": 196
    },
    {
      "epoch": 2.648829431438127,
      "grad_norm": 0.6038864850997925,
      "learning_rate": 2.6152429860404646e-07,
      "loss": 1.0263,
      "step": 197
    },
    {
      "epoch": 2.6622073578595318,
      "grad_norm": 0.6544002890586853,
      "learning_rate": 2.412295168563667e-07,
      "loss": 1.2501,
      "step": 198
    },
    {
      "epoch": 2.6755852842809364,
      "grad_norm": 0.5613058805465698,
      "learning_rate": 2.2172985092958128e-07,
      "loss": 1.0164,
      "step": 199
    },
    {
      "epoch": 2.688963210702341,
      "grad_norm": 0.6110493540763855,
      "learning_rate": 2.0302942570786442e-07,
      "loss": 1.142,
      "step": 200
    },
    {
      "epoch": 2.702341137123746,
      "grad_norm": 0.6497470140457153,
      "learning_rate": 1.851321970070927e-07,
      "loss": 1.1498,
      "step": 201
    },
    {
      "epoch": 2.7157190635451505,
      "grad_norm": 0.6384419202804565,
      "learning_rate": 1.680419507380444e-07,
      "loss": 1.1044,
      "step": 202
    },
    {
      "epoch": 2.729096989966555,
      "grad_norm": 0.6009129285812378,
      "learning_rate": 1.5176230210554742e-07,
      "loss": 1.13,
      "step": 203
    },
    {
      "epoch": 2.74247491638796,
      "grad_norm": 0.5934394001960754,
      "learning_rate": 1.3629669484372718e-07,
      "loss": 1.0401,
      "step": 204
    },
    {
      "epoch": 2.7558528428093645,
      "grad_norm": 0.669292151927948,
      "learning_rate": 1.21648400487536e-07,
      "loss": 1.2259,
      "step": 205
    },
    {
      "epoch": 2.769230769230769,
      "grad_norm": 0.6402983665466309,
      "learning_rate": 1.0782051768070477e-07,
      "loss": 1.1421,
      "step": 206
    },
    {
      "epoch": 2.782608695652174,
      "grad_norm": 0.6059122085571289,
      "learning_rate": 9.481597152026654e-08,
      "loss": 1.1015,
      "step": 207
    },
    {
      "epoch": 2.7959866220735785,
      "grad_norm": 0.6018065214157104,
      "learning_rate": 8.263751293779408e-08,
      "loss": 1.1427,
      "step": 208
    },
    {
      "epoch": 2.809364548494983,
      "grad_norm": 0.6087521910667419,
      "learning_rate": 7.128771811747736e-08,
      "loss": 1.1633,
      "step": 209
    },
    {
      "epoch": 2.822742474916388,
      "grad_norm": 0.6287218928337097,
      "learning_rate": 6.076898795116792e-08,
      "loss": 1.1612,
      "step": 210
    },
    {
      "epoch": 2.8361204013377925,
      "grad_norm": 0.6059502363204956,
      "learning_rate": 5.108354753050381e-08,
      "loss": 1.0879,
      "step": 211
    },
    {
      "epoch": 2.849498327759197,
      "grad_norm": 0.5889873504638672,
      "learning_rate": 4.2233445676222114e-08,
      "loss": 1.1121,
      "step": 212
    },
    {
      "epoch": 2.862876254180602,
      "grad_norm": 0.6066433787345886,
      "learning_rate": 3.422055450475847e-08,
      "loss": 1.102,
      "step": 213
    },
    {
      "epoch": 2.8762541806020065,
      "grad_norm": 0.6160590648651123,
      "learning_rate": 2.7046569032227905e-08,
      "loss": 1.1017,
      "step": 214
    },
    {
      "epoch": 2.8896321070234112,
      "grad_norm": 0.6136374473571777,
      "learning_rate": 2.0713006815868074e-08,
      "loss": 1.1346,
      "step": 215
    },
    {
      "epoch": 2.903010033444816,
      "grad_norm": 0.6150422692298889,
      "learning_rate": 1.522120763301782e-08,
      "loss": 1.1271,
      "step": 216
    },
    {
      "epoch": 2.9163879598662206,
      "grad_norm": 0.6284250617027283,
      "learning_rate": 1.0572333197711003e-08,
      "loss": 1.1855,
      "step": 217
    },
    {
      "epoch": 2.9297658862876252,
      "grad_norm": 0.5995332598686218,
      "learning_rate": 6.767366914927297e-09,
      "loss": 1.1039,
      "step": 218
    },
    {
      "epoch": 2.94314381270903,
      "grad_norm": 0.5566285252571106,
      "learning_rate": 3.807113672568807e-09,
      "loss": 1.0683,
      "step": 219
    },
    {
      "epoch": 2.9565217391304346,
      "grad_norm": 0.5810141563415527,
      "learning_rate": 1.6921996711976028e-09,
      "loss": 1.1098,
      "step": 220
    },
    {
      "epoch": 2.9698996655518393,
      "grad_norm": 0.6116142868995667,
      "learning_rate": 4.230722915701257e-10,
      "loss": 1.0441,
      "step": 221
    },
    {
      "epoch": 2.983277591973244,
      "grad_norm": 0.6009790301322937,
      "learning_rate": 0.0,
      "loss": 1.1404,
      "step": 222
    },
    {
      "epoch": 2.983277591973244,
      "eval_loss": 0.8089934587478638,
      "eval_runtime": 13.4287,
      "eval_samples_per_second": 90.701,
      "eval_steps_per_second": 5.734,
      "step": 222
    }
  ],
  "logging_steps": 1,
  "max_steps": 222,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 37,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.24107627264947e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}