{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9990884229717412,
  "eval_steps": 137,
  "global_step": 548,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "grad_norm": 0.07610916346311569,
      "learning_rate": 2e-05,
      "loss": 1.795,
      "step": 1
    },
    {
      "epoch": 0.0,
      "eval_loss": 1.8087825775146484,
      "eval_runtime": 75.9539,
      "eval_samples_per_second": 65.829,
      "eval_steps_per_second": 16.457,
      "step": 1
    },
    {
      "epoch": 0.0,
      "grad_norm": 0.0771929994225502,
      "learning_rate": 4e-05,
      "loss": 1.7825,
      "step": 2
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.08941341191530228,
      "learning_rate": 6e-05,
      "loss": 1.7737,
      "step": 3
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.08335491269826889,
      "learning_rate": 8e-05,
      "loss": 1.8004,
      "step": 4
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.08835520595312119,
      "learning_rate": 0.0001,
      "loss": 1.8495,
      "step": 5
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.08816578984260559,
      "learning_rate": 0.00012,
      "loss": 1.7758,
      "step": 6
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.09536299854516983,
      "learning_rate": 0.00014,
      "loss": 1.8001,
      "step": 7
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.07634323835372925,
      "learning_rate": 0.00016,
      "loss": 1.7022,
      "step": 8
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.06886536628007889,
      "learning_rate": 0.00018,
      "loss": 1.8428,
      "step": 9
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.07389801740646362,
      "learning_rate": 0.0002,
      "loss": 1.7598,
      "step": 10
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.06829163432121277,
      "learning_rate": 0.00019999981517295864,
      "loss": 1.7479,
      "step": 11
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.060045819729566574,
      "learning_rate": 0.0001999992606925178,
      "loss": 1.7454,
      "step": 12
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.08187604695558548,
      "learning_rate": 0.0001999983365607271,
      "loss": 1.7679,
      "step": 13
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.05995490401983261,
      "learning_rate": 0.00019999704278100263,
      "loss": 1.7599,
      "step": 14
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.055336710065603256,
      "learning_rate": 0.00019999537935812698,
      "loss": 1.8244,
      "step": 15
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.0541992112994194,
      "learning_rate": 0.00019999334629824895,
      "loss": 1.7756,
      "step": 16
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.05088195204734802,
      "learning_rate": 0.00019999094360888392,
      "loss": 1.7352,
      "step": 17
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.05157861113548279,
      "learning_rate": 0.00019998817129891346,
      "loss": 1.7634,
      "step": 18
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.055710840970277786,
      "learning_rate": 0.00019998502937858557,
      "loss": 1.7802,
      "step": 19
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.055150121450424194,
      "learning_rate": 0.00019998151785951448,
      "loss": 1.7445,
      "step": 20
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.0526655912399292,
      "learning_rate": 0.0001999776367546806,
      "loss": 1.6634,
      "step": 21
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.04809674620628357,
      "learning_rate": 0.00019997338607843075,
      "loss": 1.7277,
      "step": 22
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.049412671476602554,
      "learning_rate": 0.00019996876584647754,
      "loss": 1.7357,
      "step": 23
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.04948608949780464,
      "learning_rate": 0.00019996377607589997,
      "loss": 1.7323,
      "step": 24
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.050225820392370224,
      "learning_rate": 0.00019995841678514294,
      "loss": 1.7273,
      "step": 25
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.05085042864084244,
      "learning_rate": 0.00019995268799401718,
      "loss": 1.7564,
      "step": 26
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.04916631057858467,
      "learning_rate": 0.00019994658972369948,
      "loss": 1.7439,
      "step": 27
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.04791415110230446,
      "learning_rate": 0.00019994012199673234,
      "loss": 1.6813,
      "step": 28
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.04975065216422081,
      "learning_rate": 0.00019993328483702393,
      "loss": 1.691,
      "step": 29
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.055913638323545456,
      "learning_rate": 0.00019992607826984816,
      "loss": 1.7242,
      "step": 30
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.045829374343156815,
      "learning_rate": 0.00019991850232184435,
      "loss": 1.7334,
      "step": 31
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.053105831146240234,
      "learning_rate": 0.00019991055702101734,
      "loss": 1.7214,
      "step": 32
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.04539350047707558,
      "learning_rate": 0.00019990224239673722,
      "loss": 1.7698,
      "step": 33
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.046983517706394196,
      "learning_rate": 0.00019989355847973932,
      "loss": 1.6887,
      "step": 34
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.0471692830324173,
      "learning_rate": 0.00019988450530212414,
      "loss": 1.7571,
      "step": 35
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.046874694526195526,
      "learning_rate": 0.00019987508289735716,
      "loss": 1.7558,
      "step": 36
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.04474163055419922,
      "learning_rate": 0.00019986529130026857,
      "loss": 1.7465,
      "step": 37
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.044651810079813004,
      "learning_rate": 0.00019985513054705348,
      "loss": 1.6983,
      "step": 38
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.04951983690261841,
      "learning_rate": 0.00019984460067527153,
      "loss": 1.761,
      "step": 39
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.04424133151769638,
      "learning_rate": 0.00019983370172384682,
      "loss": 1.6383,
      "step": 40
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.052418872714042664,
      "learning_rate": 0.00019982243373306772,
      "loss": 1.779,
      "step": 41
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.04530750587582588,
      "learning_rate": 0.0001998107967445869,
      "loss": 1.6942,
      "step": 42
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.04790988191962242,
      "learning_rate": 0.0001997987908014209,
      "loss": 1.7053,
      "step": 43
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.04889607056975365,
      "learning_rate": 0.0001997864159479502,
      "loss": 1.7275,
      "step": 44
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.04314807429909706,
      "learning_rate": 0.00019977367222991893,
      "loss": 1.7393,
      "step": 45
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.04405505582690239,
      "learning_rate": 0.00019976055969443479,
      "loss": 1.7306,
      "step": 46
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.04656574875116348,
      "learning_rate": 0.00019974707838996882,
      "loss": 1.7686,
      "step": 47
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.04246290400624275,
      "learning_rate": 0.00019973322836635518,
      "loss": 1.7209,
      "step": 48
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.05493748560547829,
      "learning_rate": 0.00019971900967479106,
      "loss": 1.7155,
      "step": 49
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.0450466088950634,
      "learning_rate": 0.0001997044223678364,
      "loss": 1.6604,
      "step": 50
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.08634985238313675,
      "learning_rate": 0.00019968946649941382,
      "loss": 1.7321,
      "step": 51
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.04310084879398346,
      "learning_rate": 0.00019967414212480831,
      "loss": 1.7281,
      "step": 52
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.04666193947196007,
      "learning_rate": 0.000199658449300667,
      "loss": 1.6787,
      "step": 53
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.04957772046327591,
      "learning_rate": 0.00019964238808499907,
      "loss": 1.6919,
      "step": 54
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.0421697273850441,
      "learning_rate": 0.00019962595853717548,
      "loss": 1.7245,
      "step": 55
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.04654068127274513,
      "learning_rate": 0.0001996091607179287,
      "loss": 1.7123,
      "step": 56
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.04076274484395981,
      "learning_rate": 0.00019959199468935258,
      "loss": 1.7066,
      "step": 57
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.04215634986758232,
      "learning_rate": 0.00019957446051490198,
      "loss": 1.7748,
      "step": 58
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.04252045601606369,
      "learning_rate": 0.0001995565582593928,
      "loss": 1.7396,
      "step": 59
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.04455842077732086,
      "learning_rate": 0.00019953828798900135,
      "loss": 1.7236,
      "step": 60
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.044083647429943085,
      "learning_rate": 0.0001995196497712645,
      "loss": 1.7416,
      "step": 61
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.04511955380439758,
      "learning_rate": 0.00019950064367507916,
      "loss": 1.7481,
      "step": 62
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.0424315445125103,
      "learning_rate": 0.00019948126977070217,
      "loss": 1.7712,
      "step": 63
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.04309271275997162,
      "learning_rate": 0.00019946152812974993,
      "loss": 1.6927,
      "step": 64
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.042915165424346924,
      "learning_rate": 0.00019944141882519817,
      "loss": 1.7465,
      "step": 65
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.05950941890478134,
      "learning_rate": 0.00019942094193138186,
      "loss": 1.7035,
      "step": 66
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.042048510164022446,
      "learning_rate": 0.0001994000975239946,
      "loss": 1.7521,
      "step": 67
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.041577938944101334,
      "learning_rate": 0.00019937888568008862,
      "loss": 1.7439,
      "step": 68
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.04538682475686073,
      "learning_rate": 0.00019935730647807436,
      "loss": 1.7528,
      "step": 69
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.04102981090545654,
      "learning_rate": 0.00019933535999772025,
      "loss": 1.6828,
      "step": 70
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.04318905994296074,
      "learning_rate": 0.00019931304632015228,
      "loss": 1.7532,
      "step": 71
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.043007493019104004,
      "learning_rate": 0.00019929036552785397,
      "loss": 1.7353,
      "step": 72
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.04308176040649414,
      "learning_rate": 0.00019926731770466568,
      "loss": 1.6882,
      "step": 73
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.04227353632450104,
      "learning_rate": 0.00019924390293578472,
      "loss": 1.7302,
      "step": 74
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.0429629310965538,
      "learning_rate": 0.0001992201213077647,
      "loss": 1.6822,
      "step": 75
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.042203355580568314,
      "learning_rate": 0.00019919597290851538,
      "loss": 1.7601,
      "step": 76
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.04265713319182396,
      "learning_rate": 0.00019917145782730232,
      "loss": 1.7725,
      "step": 77
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.04848012328147888,
      "learning_rate": 0.00019914657615474653,
      "loss": 1.7587,
      "step": 78
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.042650256305933,
      "learning_rate": 0.00019912132798282408,
      "loss": 1.7422,
      "step": 79
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.04107372462749481,
      "learning_rate": 0.00019909571340486593,
      "loss": 1.7059,
      "step": 80
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.04788720980286598,
      "learning_rate": 0.00019906973251555734,
      "loss": 1.7205,
      "step": 81
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.041231803596019745,
      "learning_rate": 0.0001990433854109378,
      "loss": 1.7277,
      "step": 82
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.04246293380856514,
      "learning_rate": 0.0001990166721884004,
      "loss": 1.7739,
      "step": 83
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.04331424832344055,
      "learning_rate": 0.00019898959294669167,
      "loss": 1.6913,
      "step": 84
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.04720227047801018,
      "learning_rate": 0.00019896214778591115,
      "loss": 1.7079,
      "step": 85
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.05255519971251488,
      "learning_rate": 0.00019893433680751103,
      "loss": 1.7182,
      "step": 86
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.042392294853925705,
      "learning_rate": 0.00019890616011429568,
      "loss": 1.778,
      "step": 87
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.043008286505937576,
      "learning_rate": 0.0001988776178104214,
      "loss": 1.7518,
      "step": 88
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.044135116040706635,
      "learning_rate": 0.00019884871000139595,
      "loss": 1.7534,
      "step": 89
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.041827455163002014,
      "learning_rate": 0.00019881943679407832,
      "loss": 1.7291,
      "step": 90
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.05515114963054657,
      "learning_rate": 0.00019878979829667803,
      "loss": 1.7471,
      "step": 91
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.040826503187417984,
      "learning_rate": 0.00019875979461875503,
      "loss": 1.6408,
      "step": 92
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.04585504159331322,
      "learning_rate": 0.00019872942587121915,
      "loss": 1.6874,
      "step": 93
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.04665527120232582,
      "learning_rate": 0.00019869869216632968,
      "loss": 1.6968,
      "step": 94
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.046703219413757324,
      "learning_rate": 0.000198667593617695,
      "loss": 1.7401,
      "step": 95
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.04115475341677666,
      "learning_rate": 0.00019863613034027224,
      "loss": 1.7227,
      "step": 96
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.04217168688774109,
      "learning_rate": 0.00019860430245036663,
      "loss": 1.7268,
      "step": 97
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.044889383018016815,
      "learning_rate": 0.00019857211006563125,
      "loss": 1.7006,
      "step": 98
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.04161443933844566,
      "learning_rate": 0.00019853955330506663,
      "loss": 1.7266,
      "step": 99
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.042708829045295715,
      "learning_rate": 0.00019850663228902012,
      "loss": 1.7314,
      "step": 100
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.046648308634757996,
      "learning_rate": 0.00019847334713918557,
      "loss": 1.7362,
      "step": 101
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.04414999857544899,
      "learning_rate": 0.00019843969797860294,
      "loss": 1.7065,
      "step": 102
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.04574083164334297,
      "learning_rate": 0.00019840568493165772,
      "loss": 1.7333,
      "step": 103
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.041924796998500824,
      "learning_rate": 0.0001983713081240805,
      "loss": 1.6517,
      "step": 104
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.04238827899098396,
      "learning_rate": 0.00019833656768294662,
      "loss": 1.776,
      "step": 105
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.04292167350649834,
      "learning_rate": 0.00019830146373667548,
      "loss": 1.6601,
      "step": 106
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.0433412566781044,
      "learning_rate": 0.00019826599641503025,
      "loss": 1.6841,
      "step": 107
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.04201202839612961,
      "learning_rate": 0.00019823016584911735,
      "loss": 1.764,
      "step": 108
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.04234587028622627,
      "learning_rate": 0.00019819397217138595,
      "loss": 1.7243,
      "step": 109
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.04268571734428406,
      "learning_rate": 0.0001981574155156274,
      "loss": 1.7656,
      "step": 110
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.041506245732307434,
      "learning_rate": 0.00019812049601697492,
      "loss": 1.6636,
      "step": 111
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.04152766987681389,
      "learning_rate": 0.00019808321381190294,
      "loss": 1.7478,
      "step": 112
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.041750356554985046,
      "learning_rate": 0.00019804556903822663,
      "loss": 1.7518,
      "step": 113
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.04935223609209061,
      "learning_rate": 0.00019800756183510144,
      "loss": 1.7673,
      "step": 114
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.042300984263420105,
      "learning_rate": 0.00019796919234302255,
      "loss": 1.7753,
      "step": 115
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.04224342852830887,
      "learning_rate": 0.00019793046070382437,
      "loss": 1.7226,
      "step": 116
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.044274065643548965,
      "learning_rate": 0.00019789136706067998,
      "loss": 1.7065,
      "step": 117
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.04910755529999733,
      "learning_rate": 0.00019785191155810062,
      "loss": 1.6387,
      "step": 118
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.04774147644639015,
      "learning_rate": 0.00019781209434193515,
      "loss": 1.7297,
      "step": 119
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.04416586086153984,
      "learning_rate": 0.00019777191555936957,
      "loss": 1.8096,
      "step": 120
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.04406105354428291,
      "learning_rate": 0.00019773137535892635,
      "loss": 1.7629,
      "step": 121
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.043473679572343826,
      "learning_rate": 0.00019769047389046402,
      "loss": 1.6979,
      "step": 122
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.04570621997117996,
      "learning_rate": 0.00019764921130517653,
      "loss": 1.7123,
      "step": 123
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.04326749965548515,
      "learning_rate": 0.00019760758775559274,
      "loss": 1.716,
      "step": 124
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.04397182539105415,
      "learning_rate": 0.00019756560339557572,
      "loss": 1.73,
      "step": 125
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.04468885809183121,
      "learning_rate": 0.00019752325838032244,
      "loss": 1.7136,
      "step": 126
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.04554520919919014,
      "learning_rate": 0.00019748055286636295,
      "loss": 1.7448,
      "step": 127
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.04646708443760872,
      "learning_rate": 0.00019743748701155995,
      "loss": 1.6956,
      "step": 128
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.042717937380075455,
      "learning_rate": 0.00019739406097510812,
      "loss": 1.7245,
      "step": 129
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.04367038235068321,
      "learning_rate": 0.00019735027491753353,
      "loss": 1.7102,
      "step": 130
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.04296841099858284,
      "learning_rate": 0.0001973061290006932,
      "loss": 1.7163,
      "step": 131
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.043665811419487,
      "learning_rate": 0.00019726162338777424,
      "loss": 1.7172,
      "step": 132
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.046134624630212784,
      "learning_rate": 0.00019721675824329354,
      "loss": 1.7327,
      "step": 133
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.04857848584651947,
      "learning_rate": 0.00019717153373309692,
      "loss": 1.6647,
      "step": 134
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.047723885625600815,
      "learning_rate": 0.00019712595002435861,
      "loss": 1.7422,
      "step": 135
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.04413154348731041,
      "learning_rate": 0.00019708000728558064,
      "loss": 1.6943,
      "step": 136
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.043105412274599075,
      "learning_rate": 0.00019703370568659225,
      "loss": 1.7519,
      "step": 137
    },
    {
      "epoch": 0.25,
      "eval_loss": 1.7284438610076904,
      "eval_runtime": 76.3963,
      "eval_samples_per_second": 65.448,
      "eval_steps_per_second": 16.362,
      "step": 137
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.04300757125020027,
      "learning_rate": 0.00019698704539854918,
      "loss": 1.7341,
      "step": 138
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.043961744755506516,
      "learning_rate": 0.00019694002659393305,
      "loss": 1.777,
      "step": 139
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.04376057907938957,
      "learning_rate": 0.00019689264944655084,
      "loss": 1.7403,
      "step": 140
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.04482461139559746,
      "learning_rate": 0.00019684491413153411,
      "loss": 1.6852,
      "step": 141
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.045192863792181015,
      "learning_rate": 0.0001967968208253384,
      "loss": 1.7494,
      "step": 142
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.04361759498715401,
      "learning_rate": 0.00019674836970574254,
      "loss": 1.7331,
      "step": 143
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.04294734448194504,
      "learning_rate": 0.0001966995609518481,
      "loss": 1.6375,
      "step": 144
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.04528161138296127,
      "learning_rate": 0.00019665039474407863,
      "loss": 1.746,
      "step": 145
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.04510699212551117,
      "learning_rate": 0.00019660087126417906,
      "loss": 1.7053,
      "step": 146
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.042807720601558685,
      "learning_rate": 0.00019655099069521486,
      "loss": 1.6748,
      "step": 147
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.04657953232526779,
      "learning_rate": 0.00019650075322157168,
      "loss": 1.684,
      "step": 148
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.04593012481927872,
      "learning_rate": 0.00019645015902895437,
      "loss": 1.7076,
      "step": 149
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.04362139105796814,
      "learning_rate": 0.0001963992083043864,
      "loss": 1.6773,
      "step": 150
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.04773354157805443,
      "learning_rate": 0.00019634790123620926,
      "loss": 1.7107,
      "step": 151
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.05423569679260254,
      "learning_rate": 0.00019629623801408155,
      "loss": 1.7052,
      "step": 152
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.043550509959459305,
      "learning_rate": 0.00019624421882897855,
      "loss": 1.7151,
      "step": 153
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.04896851256489754,
      "learning_rate": 0.00019619184387319123,
      "loss": 1.6611,
      "step": 154
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.04392845928668976,
      "learning_rate": 0.00019613911334032583,
      "loss": 1.738,
      "step": 155
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.04582325741648674,
      "learning_rate": 0.00019608602742530283,
      "loss": 1.6885,
      "step": 156
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.045696284621953964,
      "learning_rate": 0.00019603258632435656,
      "loss": 1.7365,
      "step": 157
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.043873440474271774,
      "learning_rate": 0.00019597879023503417,
      "loss": 1.8094,
      "step": 158
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.05078018456697464,
      "learning_rate": 0.00019592463935619517,
      "loss": 1.7341,
      "step": 159
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.042483873665332794,
      "learning_rate": 0.00019587013388801047,
      "loss": 1.7351,
      "step": 160
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.045154914259910583,
      "learning_rate": 0.00019581527403196168,
      "loss": 1.6645,
      "step": 161
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.04563280567526817,
      "learning_rate": 0.0001957600599908406,
      "loss": 1.7069,
      "step": 162
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.0451313816010952,
      "learning_rate": 0.00019570449196874815,
      "loss": 1.7392,
      "step": 163
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.04682654142379761,
      "learning_rate": 0.0001956485701710938,
      "loss": 1.6987,
      "step": 164
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.04211273416876793,
      "learning_rate": 0.00019559229480459474,
      "loss": 1.6973,
      "step": 165
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.04460490494966507,
      "learning_rate": 0.00019553566607727517,
      "loss": 1.7233,
      "step": 166
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.044608812779188156,
      "learning_rate": 0.00019547868419846548,
      "loss": 1.7371,
      "step": 167
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.04518236592411995,
      "learning_rate": 0.00019542134937880154,
      "loss": 1.7257,
      "step": 168
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.04374237731099129,
      "learning_rate": 0.00019536366183022384,
      "loss": 1.7136,
      "step": 169
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.04429790750145912,
      "learning_rate": 0.00019530562176597673,
      "loss": 1.7216,
      "step": 170
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.04807354509830475,
      "learning_rate": 0.0001952472294006077,
      "loss": 1.6568,
      "step": 171
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.04785493016242981,
      "learning_rate": 0.00019518848494996655,
      "loss": 1.7272,
      "step": 172
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.04472104460000992,
      "learning_rate": 0.0001951293886312045,
      "loss": 1.7283,
      "step": 173
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.04852326214313507,
      "learning_rate": 0.00019506994066277348,
      "loss": 1.6968,
      "step": 174
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.04624422639608383,
      "learning_rate": 0.0001950101412644254,
      "loss": 1.758,
      "step": 175
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.044666189700365067,
      "learning_rate": 0.00019494999065721108,
      "loss": 1.6933,
      "step": 176
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.05367857217788696,
      "learning_rate": 0.0001948894890634798,
      "loss": 1.7328,
      "step": 177
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.046923939138650894,
      "learning_rate": 0.0001948286367068781,
      "loss": 1.7367,
      "step": 178
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.04480034112930298,
      "learning_rate": 0.00019476743381234926,
      "loss": 1.7677,
      "step": 179
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.045380428433418274,
      "learning_rate": 0.00019470588060613222,
      "loss": 1.7439,
      "step": 180
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.04550057277083397,
      "learning_rate": 0.00019464397731576094,
      "loss": 1.6895,
      "step": 181
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.049537234008312225,
      "learning_rate": 0.00019458172417006347,
      "loss": 1.7274,
      "step": 182
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.04696514084935188,
      "learning_rate": 0.0001945191213991611,
      "loss": 1.7121,
      "step": 183
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.04783783480525017,
      "learning_rate": 0.00019445616923446755,
      "loss": 1.6942,
      "step": 184
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.04514686018228531,
      "learning_rate": 0.00019439286790868802,
      "loss": 1.7219,
      "step": 185
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.045743513852357864,
      "learning_rate": 0.00019432921765581847,
      "loss": 1.76,
      "step": 186
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.04406295716762543,
      "learning_rate": 0.00019426521871114468,
      "loss": 1.7531,
      "step": 187
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.04445353150367737,
      "learning_rate": 0.00019420087131124131,
      "loss": 1.7742,
      "step": 188
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.04396241530776024,
      "learning_rate": 0.0001941361756939712,
      "loss": 1.7701,
      "step": 189
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.04415050894021988,
      "learning_rate": 0.0001940711320984843,
      "loss": 1.7062,
      "step": 190
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.04672138765454292,
      "learning_rate": 0.00019400574076521693,
      "loss": 1.754,
      "step": 191
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.04417939484119415,
      "learning_rate": 0.00019394000193589088,
      "loss": 1.7357,
      "step": 192
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.04567494988441467,
      "learning_rate": 0.00019387391585351234,
      "loss": 1.752,
      "step": 193
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.045080311596393585,
      "learning_rate": 0.00019380748276237123,
      "loss": 1.736,
      "step": 194
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.04506627842783928,
      "learning_rate": 0.0001937407029080402,
      "loss": 1.6726,
      "step": 195
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.04523961618542671,
      "learning_rate": 0.0001936735765373737,
      "loss": 1.7621,
      "step": 196
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.04326867312192917,
      "learning_rate": 0.00019360610389850712,
      "loss": 1.7341,
      "step": 197
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.05188523977994919,
      "learning_rate": 0.00019353828524085577,
      "loss": 1.7277,
      "step": 198
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.04654062166810036,
      "learning_rate": 0.00019347012081511415,
      "loss": 1.6845,
      "step": 199
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.044841405004262924,
      "learning_rate": 0.0001934016108732548,
      "loss": 1.6611,
      "step": 200
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.0941338911652565,
      "learning_rate": 0.00019333275566852756,
      "loss": 1.6978,
      "step": 201
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.05048836022615433,
      "learning_rate": 0.00019326355545545845,
      "loss": 1.7056,
      "step": 202
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.046358656138181686,
      "learning_rate": 0.00019319401048984892,
      "loss": 1.649,
      "step": 203
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.04557095095515251,
      "learning_rate": 0.00019312412102877473,
      "loss": 1.6793,
      "step": 204
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.04551040008664131,
      "learning_rate": 0.0001930538873305852,
      "loss": 1.7339,
      "step": 205
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.044258005917072296,
      "learning_rate": 0.000192983309654902,
      "loss": 1.6627,
      "step": 206
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.0485963337123394,
      "learning_rate": 0.00019291238826261843,
      "loss": 1.715,
      "step": 207
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.047103844583034515,
      "learning_rate": 0.00019284112341589832,
      "loss": 1.6855,
      "step": 208
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.045252177864313126,
      "learning_rate": 0.000192769515378175,
      "loss": 1.7557,
      "step": 209
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.049794841557741165,
      "learning_rate": 0.00019269756441415062,
      "loss": 1.7116,
      "step": 210
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.04380947723984718,
      "learning_rate": 0.00019262527078979478,
      "loss": 1.7663,
      "step": 211
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.046488065272569656,
      "learning_rate": 0.00019255263477234381,
      "loss": 1.6724,
      "step": 212
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.0422043539583683,
      "learning_rate": 0.00019247965663029976,
      "loss": 1.7345,
      "step": 213
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.05002991482615471,
      "learning_rate": 0.0001924063366334293,
      "loss": 1.7468,
      "step": 214
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.04376322776079178,
      "learning_rate": 0.0001923326750527628,
      "loss": 1.7748,
      "step": 215
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.04664807394146919,
      "learning_rate": 0.00019225867216059325,
      "loss": 1.7156,
      "step": 216
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.047952812165021896,
      "learning_rate": 0.0001921843282304754,
      "loss": 1.7247,
      "step": 217
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.045118216425180435,
      "learning_rate": 0.00019210964353722464,
      "loss": 1.7354,
      "step": 218
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.054903436452150345,
      "learning_rate": 0.00019203461835691594,
      "loss": 1.7241,
      "step": 219
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.04747498407959938,
      "learning_rate": 0.000191959252966883,
      "loss": 1.7498,
      "step": 220
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.04605628177523613,
      "learning_rate": 0.000191883547645717,
      "loss": 1.6889,
      "step": 221
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.04835960268974304,
      "learning_rate": 0.00019180750267326578,
      "loss": 1.715,
      "step": 222
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.04828386381268501,
      "learning_rate": 0.00019173111833063273,
      "loss": 1.6931,
      "step": 223
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.04604095220565796,
      "learning_rate": 0.0001916543949001756,
      "loss": 1.6717,
      "step": 224
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.049674633890390396,
      "learning_rate": 0.00019157733266550575,
      "loss": 1.7746,
      "step": 225
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.04439341649413109,
      "learning_rate": 0.00019149993191148687,
      "loss": 1.6925,
      "step": 226
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.04741811007261276,
      "learning_rate": 0.00019142219292423395,
      "loss": 1.7219,
      "step": 227
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.049409981817007065,
      "learning_rate": 0.00019134411599111242,
      "loss": 1.7306,
      "step": 228
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.04618163779377937,
      "learning_rate": 0.00019126570140073676,
      "loss": 1.7271,
      "step": 229
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.04557076469063759,
      "learning_rate": 0.0001911869494429698,
      "loss": 1.7188,
      "step": 230
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.04645569249987602,
      "learning_rate": 0.0001911078604089213,
      "loss": 1.7191,
      "step": 231
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.04584849998354912,
      "learning_rate": 0.0001910284345909471,
      "loss": 1.7592,
      "step": 232
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.045582644641399384,
      "learning_rate": 0.000190948672282648,
      "loss": 1.6902,
      "step": 233
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.04627401754260063,
      "learning_rate": 0.00019086857377886865,
      "loss": 1.6937,
      "step": 234
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.04470285400748253,
      "learning_rate": 0.00019078813937569643,
      "loss": 1.6977,
      "step": 235
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.05287547782063484,
      "learning_rate": 0.00019070736937046035,
      "loss": 1.7539,
      "step": 236
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.04990493878722191,
      "learning_rate": 0.00019062626406173006,
      "loss": 1.7469,
      "step": 237
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.048645589500665665,
      "learning_rate": 0.00019054482374931467,
      "loss": 1.7037,
      "step": 238
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.04730357602238655,
      "learning_rate": 0.0001904630487342616,
      "loss": 1.7388,
      "step": 239
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.04754168912768364,
      "learning_rate": 0.00019038093931885553,
      "loss": 1.7805,
      "step": 240
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.04760801047086716,
      "learning_rate": 0.00019029849580661727,
      "loss": 1.7383,
      "step": 241
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.048467203974723816,
      "learning_rate": 0.0001902157185023026,
      "loss": 1.7078,
      "step": 242
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.0522041916847229,
      "learning_rate": 0.00019013260771190126,
      "loss": 1.7052,
      "step": 243
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.0501788929104805,
      "learning_rate": 0.00019004916374263563,
      "loss": 1.7818,
      "step": 244
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.04538620635867119,
      "learning_rate": 0.00018996538690295979,
      "loss": 1.6589,
      "step": 245
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.04511679336428642,
      "learning_rate": 0.00018988127750255824,
      "loss": 1.7179,
      "step": 246
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.04756203666329384,
      "learning_rate": 0.0001897968358523448,
      "loss": 1.7333,
      "step": 247
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.05278336629271507,
      "learning_rate": 0.00018971206226446147,
      "loss": 1.7431,
      "step": 248
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.05926801264286041,
      "learning_rate": 0.00018962695705227728,
      "loss": 1.7768,
      "step": 249
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.049290940165519714,
      "learning_rate": 0.00018954152053038712,
      "loss": 1.7119,
      "step": 250
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.04777907952666283,
      "learning_rate": 0.0001894557530146106,
      "loss": 1.7559,
      "step": 251
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.04726920276880264,
      "learning_rate": 0.00018936965482199084,
      "loss": 1.7861,
      "step": 252
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.04677857458591461,
      "learning_rate": 0.0001892832262707933,
      "loss": 1.7039,
      "step": 253
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.04724700003862381,
      "learning_rate": 0.00018919646768050468,
      "loss": 1.6704,
      "step": 254
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.04969072341918945,
      "learning_rate": 0.00018910937937183166,
      "loss": 1.7168,
      "step": 255
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.04533353075385094,
      "learning_rate": 0.0001890219616666997,
      "loss": 1.6751,
      "step": 256
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.04647386819124222,
      "learning_rate": 0.0001889342148882519,
      "loss": 1.7146,
      "step": 257
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.047208696603775024,
      "learning_rate": 0.00018884613936084784,
      "loss": 1.7378,
      "step": 258
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.04841624200344086,
      "learning_rate": 0.0001887577354100623,
      "loss": 1.7128,
      "step": 259
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.05073019117116928,
      "learning_rate": 0.00018866900336268408,
      "loss": 1.7206,
      "step": 260
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.051456011831760406,
      "learning_rate": 0.00018857994354671482,
      "loss": 1.755,
      "step": 261
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.04637736827135086,
      "learning_rate": 0.0001884905562913678,
      "loss": 1.7395,
      "step": 262
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.061346374452114105,
      "learning_rate": 0.00018840084192706658,
      "loss": 1.674,
      "step": 263
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.04413258284330368,
      "learning_rate": 0.00018831080078544402,
      "loss": 1.7288,
      "step": 264
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.0531301349401474,
      "learning_rate": 0.0001882204331993409,
      "loss": 1.7625,
      "step": 265
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.05146196484565735,
      "learning_rate": 0.00018812973950280468,
      "loss": 1.6815,
      "step": 266
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.047678787261247635,
      "learning_rate": 0.0001880387200310883,
      "loss": 1.7278,
      "step": 267
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.0556582510471344,
      "learning_rate": 0.0001879473751206489,
      "loss": 1.74,
      "step": 268
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.047515787184238434,
      "learning_rate": 0.00018785570510914678,
      "loss": 1.7207,
      "step": 269
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.04592055827379227,
      "learning_rate": 0.0001877637103354438,
      "loss": 1.6589,
      "step": 270
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.04531411454081535,
      "learning_rate": 0.0001876713911396024,
      "loss": 1.706,
      "step": 271
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.04682420939207077,
      "learning_rate": 0.0001875787478628843,
      "loss": 1.7297,
      "step": 272
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.04545978829264641,
      "learning_rate": 0.00018748578084774913,
      "loss": 1.6572,
      "step": 273
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.04849430173635483,
      "learning_rate": 0.00018739249043785324,
      "loss": 1.7442,
      "step": 274
    },
    {
      "epoch": 0.5,
      "eval_loss": 1.726025938987732,
      "eval_runtime": 76.0967,
      "eval_samples_per_second": 65.706,
      "eval_steps_per_second": 16.426,
      "step": 274
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.04745488613843918,
      "learning_rate": 0.00018729887697804847,
      "loss": 1.7398,
      "step": 275
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.05489857494831085,
      "learning_rate": 0.00018720494081438078,
      "loss": 1.701,
      "step": 276
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.04818108305335045,
      "learning_rate": 0.00018711068229408903,
      "loss": 1.7068,
      "step": 277
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.04530555009841919,
      "learning_rate": 0.0001870161017656037,
      "loss": 1.6966,
      "step": 278
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.045606572180986404,
      "learning_rate": 0.00018692119957854558,
      "loss": 1.7086,
      "step": 279
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.04626869410276413,
      "learning_rate": 0.00018682597608372445,
      "loss": 1.6981,
      "step": 280
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.04752146080136299,
      "learning_rate": 0.0001867304316331379,
      "loss": 1.692,
      "step": 281
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.046230729669332504,
      "learning_rate": 0.0001866345665799698,
      "loss": 1.7338,
      "step": 282
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.04928119108080864,
      "learning_rate": 0.00018653838127858933,
      "loss": 1.738,
      "step": 283
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.04641352593898773,
      "learning_rate": 0.00018644187608454936,
      "loss": 1.6792,
      "step": 284
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.04860611632466316,
      "learning_rate": 0.00018634505135458525,
      "loss": 1.663,
      "step": 285
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.046515002846717834,
      "learning_rate": 0.00018624790744661355,
      "loss": 1.7327,
      "step": 286
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.04668186604976654,
      "learning_rate": 0.00018615044471973074,
      "loss": 1.6987,
      "step": 287
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.047913163900375366,
      "learning_rate": 0.00018605266353421176,
      "loss": 1.7953,
      "step": 288
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.04924839362502098,
      "learning_rate": 0.00018595456425150872,
      "loss": 1.7891,
      "step": 289
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.049241986125707626,
      "learning_rate": 0.00018585614723424962,
      "loss": 1.7451,
      "step": 290
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.05132036283612251,
      "learning_rate": 0.00018575741284623703,
      "loss": 1.7598,
      "step": 291
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.04659922048449516,
      "learning_rate": 0.00018565836145244662,
      "loss": 1.7331,
      "step": 292
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.0466977022588253,
      "learning_rate": 0.0001855589934190259,
      "loss": 1.7171,
      "step": 293
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.049368374049663544,
      "learning_rate": 0.00018545930911329287,
      "loss": 1.6929,
      "step": 294
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.04552480950951576,
      "learning_rate": 0.00018535930890373466,
      "loss": 1.753,
      "step": 295
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.04755065590143204,
      "learning_rate": 0.00018525899316000608,
      "loss": 1.7472,
      "step": 296
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.050540413707494736,
      "learning_rate": 0.0001851583622529284,
      "loss": 1.7585,
      "step": 297
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.04644971713423729,
      "learning_rate": 0.00018505741655448792,
      "loss": 1.7531,
      "step": 298
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.05085503309965134,
      "learning_rate": 0.00018495615643783446,
      "loss": 1.6954,
      "step": 299
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.0480993427336216,
      "learning_rate": 0.0001848545822772802,
      "loss": 1.6976,
      "step": 300
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.0487300269305706,
      "learning_rate": 0.00018475269444829818,
      "loss": 1.7642,
      "step": 301
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.04805615171790123,
      "learning_rate": 0.0001846504933275209,
      "loss": 1.6666,
      "step": 302
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.045554857701063156,
      "learning_rate": 0.00018454797929273902,
      "loss": 1.7259,
      "step": 303
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.04570743814110756,
      "learning_rate": 0.00018444515272289982,
      "loss": 1.7067,
      "step": 304
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.047652073204517365,
      "learning_rate": 0.00018434201399810594,
      "loss": 1.8147,
      "step": 305
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.046781569719314575,
      "learning_rate": 0.00018423856349961384,
      "loss": 1.7509,
      "step": 306
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.04698612168431282,
      "learning_rate": 0.00018413480160983254,
      "loss": 1.7074,
      "step": 307
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.04796341061592102,
      "learning_rate": 0.0001840307287123221,
      "loss": 1.7444,
      "step": 308
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.047553375363349915,
      "learning_rate": 0.00018392634519179225,
      "loss": 1.7103,
      "step": 309
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.046323925256729126,
      "learning_rate": 0.00018382165143410092,
      "loss": 1.716,
      "step": 310
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.04571986570954323,
      "learning_rate": 0.00018371664782625287,
      "loss": 1.7035,
      "step": 311
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.05170504003763199,
      "learning_rate": 0.0001836113347563982,
      "loss": 1.7151,
      "step": 312
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.047869808971881866,
      "learning_rate": 0.000183505712613831,
      "loss": 1.7223,
      "step": 313
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.0482964813709259,
      "learning_rate": 0.0001833997817889878,
      "loss": 1.6805,
      "step": 314
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.0486602708697319,
      "learning_rate": 0.00018329354267344625,
      "loss": 1.7303,
      "step": 315
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.046554964035749435,
      "learning_rate": 0.00018318699565992357,
      "loss": 1.7745,
      "step": 316
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.047917045652866364,
      "learning_rate": 0.00018308014114227513,
      "loss": 1.718,
      "step": 317
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.0479004867374897,
      "learning_rate": 0.00018297297951549304,
      "loss": 1.7707,
      "step": 318
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.04681101068854332,
      "learning_rate": 0.0001828655111757046,
      "loss": 1.7646,
      "step": 319
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.05201521888375282,
      "learning_rate": 0.00018275773652017097,
      "loss": 1.7479,
      "step": 320
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.04852493852376938,
      "learning_rate": 0.00018264965594728548,
      "loss": 1.7463,
      "step": 321
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.046121757477521896,
      "learning_rate": 0.00018254126985657246,
      "loss": 1.7444,
      "step": 322
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.05163992941379547,
      "learning_rate": 0.00018243257864868548,
      "loss": 1.7134,
      "step": 323
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.06267976760864258,
      "learning_rate": 0.00018232358272540604,
      "loss": 1.6712,
      "step": 324
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.04854287579655647,
      "learning_rate": 0.00018221428248964202,
      "loss": 1.6932,
      "step": 325
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.046650100499391556,
      "learning_rate": 0.00018210467834542615,
      "loss": 1.768,
      "step": 326
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.04779491573572159,
      "learning_rate": 0.00018199477069791474,
      "loss": 1.7109,
      "step": 327
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.05170130729675293,
      "learning_rate": 0.0001818845599533858,
      "loss": 1.6926,
      "step": 328
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.04867775738239288,
      "learning_rate": 0.00018177404651923787,
      "loss": 1.6908,
      "step": 329
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.04707460105419159,
      "learning_rate": 0.00018166323080398835,
      "loss": 1.7461,
      "step": 330
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.048908475786447525,
      "learning_rate": 0.00018155211321727212,
      "loss": 1.7214,
      "step": 331
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.04802173003554344,
      "learning_rate": 0.00018144069416983985,
      "loss": 1.7528,
      "step": 332
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.04747573658823967,
      "learning_rate": 0.00018132897407355657,
      "loss": 1.6726,
      "step": 333
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.049620069563388824,
      "learning_rate": 0.00018121695334140017,
      "loss": 1.7215,
      "step": 334
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.047733817249536514,
      "learning_rate": 0.00018110463238745988,
      "loss": 1.7538,
      "step": 335
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.04856455698609352,
      "learning_rate": 0.00018099201162693476,
      "loss": 1.6833,
      "step": 336
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.04885758087038994,
      "learning_rate": 0.00018087909147613193,
      "loss": 1.7141,
      "step": 337
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.047947369515895844,
      "learning_rate": 0.0001807658723524654,
      "loss": 1.733,
      "step": 338
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.0499010868370533,
      "learning_rate": 0.0001806523546744543,
      "loss": 1.6825,
      "step": 339
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.048193834722042084,
      "learning_rate": 0.0001805385388617213,
      "loss": 1.7282,
      "step": 340
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.05272866412997246,
      "learning_rate": 0.00018042442533499123,
      "loss": 1.7599,
      "step": 341
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.047657158225774765,
      "learning_rate": 0.00018031001451608943,
      "loss": 1.7292,
      "step": 342
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.0498197004199028,
      "learning_rate": 0.00018019530682794014,
      "loss": 1.7417,
      "step": 343
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.04958554729819298,
      "learning_rate": 0.00018008030269456505,
      "loss": 1.7274,
      "step": 344
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.04730832576751709,
      "learning_rate": 0.00017996500254108152,
      "loss": 1.778,
      "step": 345
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.050828639417886734,
      "learning_rate": 0.0001798494067937014,
      "loss": 1.7285,
      "step": 346
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.046292368322610855,
      "learning_rate": 0.00017973351587972905,
      "loss": 1.7334,
      "step": 347
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.04758565500378609,
      "learning_rate": 0.00017961733022755992,
      "loss": 1.6814,
      "step": 348
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.050507742911577225,
      "learning_rate": 0.00017950085026667903,
      "loss": 1.6949,
      "step": 349
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.04801836982369423,
      "learning_rate": 0.00017938407642765938,
      "loss": 1.6594,
      "step": 350
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.04616666957736015,
      "learning_rate": 0.00017926700914216016,
      "loss": 1.6969,
      "step": 351
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.048213839530944824,
      "learning_rate": 0.00017914964884292544,
      "loss": 1.6908,
      "step": 352
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.04909725859761238,
      "learning_rate": 0.00017903199596378227,
      "loss": 1.7213,
      "step": 353
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.050252340734004974,
      "learning_rate": 0.00017891405093963938,
      "loss": 1.7094,
      "step": 354
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.05401075631380081,
      "learning_rate": 0.00017879581420648534,
      "loss": 1.7163,
      "step": 355
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.05027545616030693,
      "learning_rate": 0.00017867728620138708,
      "loss": 1.7362,
      "step": 356
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.047479428350925446,
      "learning_rate": 0.00017855846736248822,
      "loss": 1.6785,
      "step": 357
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.05026884377002716,
      "learning_rate": 0.0001784393581290074,
      "loss": 1.7221,
      "step": 358
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.04901432618498802,
      "learning_rate": 0.00017831995894123683,
      "loss": 1.6401,
      "step": 359
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.04764765873551369,
      "learning_rate": 0.00017820027024054044,
      "loss": 1.7361,
      "step": 360
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.046871528029441833,
      "learning_rate": 0.0001780802924693524,
      "loss": 1.7986,
      "step": 361
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.05453401803970337,
      "learning_rate": 0.00017796002607117545,
      "loss": 1.7447,
      "step": 362
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.04958674684166908,
      "learning_rate": 0.00017783947149057925,
      "loss": 1.7091,
      "step": 363
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.053141675889492035,
      "learning_rate": 0.0001777186291731987,
      "loss": 1.6866,
      "step": 364
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.047340743243694305,
      "learning_rate": 0.00017759749956573238,
      "loss": 1.7191,
      "step": 365
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.051203418523073196,
      "learning_rate": 0.00017747608311594087,
      "loss": 1.7238,
      "step": 366
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.047188933938741684,
      "learning_rate": 0.00017735438027264495,
      "loss": 1.762,
      "step": 367
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.056479763239622116,
      "learning_rate": 0.00017723239148572422,
      "loss": 1.6587,
      "step": 368
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.04922572523355484,
      "learning_rate": 0.00017711011720611514,
      "loss": 1.6988,
      "step": 369
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.046839334070682526,
      "learning_rate": 0.00017698755788580963,
      "loss": 1.7092,
      "step": 370
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.0491393506526947,
      "learning_rate": 0.0001768647139778532,
      "loss": 1.7313,
      "step": 371
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.04811710864305496,
      "learning_rate": 0.0001767415859363434,
      "loss": 1.8071,
      "step": 372
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.04601633548736572,
      "learning_rate": 0.00017661817421642804,
      "loss": 1.7594,
      "step": 373
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.05098440870642662,
      "learning_rate": 0.00017649447927430362,
      "loss": 1.6524,
      "step": 374
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.04978582262992859,
      "learning_rate": 0.00017637050156721346,
      "loss": 1.7448,
      "step": 375
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.05097389221191406,
      "learning_rate": 0.00017624624155344626,
      "loss": 1.7362,
      "step": 376
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.05258944630622864,
      "learning_rate": 0.00017612169969233424,
      "loss": 1.7033,
      "step": 377
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.05384654179215431,
      "learning_rate": 0.0001759968764442515,
      "loss": 1.6349,
      "step": 378
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.047803860157728195,
      "learning_rate": 0.00017587177227061226,
      "loss": 1.6655,
      "step": 379
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.04812454432249069,
      "learning_rate": 0.00017574638763386916,
      "loss": 1.7064,
      "step": 380
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.04860275238752365,
      "learning_rate": 0.00017562072299751163,
      "loss": 1.6648,
      "step": 381
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.049836620688438416,
      "learning_rate": 0.00017549477882606418,
      "loss": 1.6957,
      "step": 382
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.05114325135946274,
      "learning_rate": 0.00017536855558508458,
      "loss": 1.6257,
      "step": 383
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.054609425365924835,
      "learning_rate": 0.00017524205374116214,
      "loss": 1.6854,
      "step": 384
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.04757620766758919,
      "learning_rate": 0.00017511527376191618,
      "loss": 1.7425,
      "step": 385
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.05384545028209686,
      "learning_rate": 0.00017498821611599397,
      "loss": 1.712,
      "step": 386
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.04726232588291168,
      "learning_rate": 0.00017486088127306932,
      "loss": 1.701,
      "step": 387
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.04885297268629074,
      "learning_rate": 0.0001747332697038407,
      "loss": 1.7227,
      "step": 388
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.04793693870306015,
      "learning_rate": 0.00017460538188002946,
      "loss": 1.7058,
      "step": 389
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.04942973330616951,
      "learning_rate": 0.0001744772182743782,
      "loss": 1.7443,
      "step": 390
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.05246872082352638,
      "learning_rate": 0.00017434877936064886,
      "loss": 1.6807,
      "step": 391
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.04894121363759041,
      "learning_rate": 0.0001742200656136212,
      "loss": 1.7963,
      "step": 392
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.05082324892282486,
      "learning_rate": 0.00017409107750909078,
      "loss": 1.7024,
      "step": 393
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.04718152433633804,
      "learning_rate": 0.00017396181552386741,
      "loss": 1.711,
      "step": 394
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.05174902826547623,
      "learning_rate": 0.00017383228013577331,
      "loss": 1.7362,
      "step": 395
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.048003047704696655,
      "learning_rate": 0.0001737024718236413,
      "loss": 1.6944,
      "step": 396
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.0462164506316185,
      "learning_rate": 0.00017357239106731317,
      "loss": 1.7297,
      "step": 397
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.04808316007256508,
|
"learning_rate": 0.0001734420383476377, |
|
"loss": 1.6971, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.05553476884961128, |
|
"learning_rate": 0.00017331141414646904, |
|
"loss": 1.7262, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.046341411769390106, |
|
"learning_rate": 0.00017318051894666487, |
|
"loss": 1.7135, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.048155754804611206, |
|
"learning_rate": 0.00017304935323208466, |
|
"loss": 1.7377, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.05066389963030815, |
|
"learning_rate": 0.00017291791748758785, |
|
"loss": 1.6516, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.05046610161662102, |
|
"learning_rate": 0.000172786212199032, |
|
"loss": 1.7536, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.0542440302670002, |
|
"learning_rate": 0.00017265423785327107, |
|
"loss": 1.7857, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.04833053797483444, |
|
"learning_rate": 0.0001725219949381537, |
|
"loss": 1.7594, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.047335654497146606, |
|
"learning_rate": 0.00017238948394252115, |
|
"loss": 1.7495, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.04961543157696724, |
|
"learning_rate": 0.00017225670535620576, |
|
"loss": 1.7201, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.04761854186654091, |
|
"learning_rate": 0.00017212365967002893, |
|
"loss": 1.7522, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.05010442063212395, |
|
"learning_rate": 0.0001719903473757996, |
|
"loss": 1.7535, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.049323149025440216, |
|
"learning_rate": 0.000171856768966312, |
|
"loss": 1.6984, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.08661342412233353, |
|
"learning_rate": 0.0001717229249353442, |
|
"loss": 1.7182, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 1.724851131439209, |
|
"eval_runtime": 76.3068, |
|
"eval_samples_per_second": 65.525, |
|
"eval_steps_per_second": 16.381, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.05118868127465248, |
|
"learning_rate": 0.00017158881577765612, |
|
"loss": 1.683, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.053089968860149384, |
|
"learning_rate": 0.00017145444198898776, |
|
"loss": 1.7162, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.05191902816295624, |
|
"learning_rate": 0.0001713198040660573, |
|
"loss": 1.7223, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.05995416268706322, |
|
"learning_rate": 0.00017118490250655932, |
|
"loss": 1.7148, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.04749016463756561, |
|
"learning_rate": 0.00017104973780916294, |
|
"loss": 1.7364, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.047870930284261703, |
|
"learning_rate": 0.00017091431047351, |
|
"loss": 1.7607, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.04802364483475685, |
|
"learning_rate": 0.00017077862100021318, |
|
"loss": 1.6957, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.04796374961733818, |
|
"learning_rate": 0.00017064266989085412, |
|
"loss": 1.6972, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.048874564468860626, |
|
"learning_rate": 0.00017050645764798164, |
|
"loss": 1.736, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.052477337419986725, |
|
"learning_rate": 0.00017036998477510992, |
|
"loss": 1.7447, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.049993280321359634, |
|
"learning_rate": 0.00017023325177671647, |
|
"loss": 1.7635, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.09700744599103928, |
|
"learning_rate": 0.00017009625915824037, |
|
"loss": 1.7402, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.048865802586078644, |
|
"learning_rate": 0.0001699590074260805, |
|
"loss": 1.7229, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.04994821920990944, |
|
"learning_rate": 0.00016982149708759343, |
|
"loss": 1.672, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.05008814111351967, |
|
"learning_rate": 0.00016968372865109176, |
|
"loss": 1.7338, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.04830687865614891, |
|
"learning_rate": 0.00016954570262584214, |
|
"loss": 1.7177, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.04781452193856239, |
|
"learning_rate": 0.0001694074195220634, |
|
"loss": 1.7628, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.04739667847752571, |
|
"learning_rate": 0.00016926887985092468, |
|
"loss": 1.7107, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.0481286458671093, |
|
"learning_rate": 0.00016913008412454357, |
|
"loss": 1.7646, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.06283537298440933, |
|
"learning_rate": 0.0001689910328559841, |
|
"loss": 1.6896, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.04944480583071709, |
|
"learning_rate": 0.00016885172655925495, |
|
"loss": 1.6931, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.05051645264029503, |
|
"learning_rate": 0.00016871216574930754, |
|
"loss": 1.7752, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.05406402051448822, |
|
"learning_rate": 0.0001685723509420341, |
|
"loss": 1.7203, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.0995137020945549, |
|
"learning_rate": 0.00016843228265426584, |
|
"loss": 1.6454, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.05356389284133911, |
|
"learning_rate": 0.00016829196140377085, |
|
"loss": 1.7327, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.04902141913771629, |
|
"learning_rate": 0.0001681513877092523, |
|
"loss": 1.7262, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.047820378094911575, |
|
"learning_rate": 0.00016801056209034672, |
|
"loss": 1.7294, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.048359643667936325, |
|
"learning_rate": 0.00016786948506762164, |
|
"loss": 1.6959, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.04830753803253174, |
|
"learning_rate": 0.00016772815716257412, |
|
"loss": 1.7714, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.05318046733736992, |
|
"learning_rate": 0.0001675865788976285, |
|
"loss": 1.7325, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.04992082715034485, |
|
"learning_rate": 0.0001674447507961346, |
|
"loss": 1.7866, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.05253741890192032, |
|
"learning_rate": 0.0001673026733823658, |
|
"loss": 1.7273, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.05121272802352905, |
|
"learning_rate": 0.00016716034718151706, |
|
"loss": 1.7063, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.04715156927704811, |
|
"learning_rate": 0.000167017772719703, |
|
"loss": 1.7575, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.05717930197715759, |
|
"learning_rate": 0.00016687495052395595, |
|
"loss": 1.7835, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.04992460459470749, |
|
"learning_rate": 0.00016673188112222394, |
|
"loss": 1.7218, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.0481155663728714, |
|
"learning_rate": 0.0001665885650433689, |
|
"loss": 1.7269, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.0485762394964695, |
|
"learning_rate": 0.00016644500281716456, |
|
"loss": 1.6857, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.04729575663805008, |
|
"learning_rate": 0.00016630119497429457, |
|
"loss": 1.7208, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.051819782704114914, |
|
"learning_rate": 0.00016615714204635043, |
|
"loss": 1.7117, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.052782051265239716, |
|
"learning_rate": 0.0001660128445658297, |
|
"loss": 1.7811, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.05251288414001465, |
|
"learning_rate": 0.00016586830306613393, |
|
"loss": 1.7517, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.047806352376937866, |
|
"learning_rate": 0.00016572351808156666, |
|
"loss": 1.7132, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.05114049091935158, |
|
"learning_rate": 0.0001655784901473315, |
|
"loss": 1.7729, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.04811178147792816, |
|
"learning_rate": 0.00016543321979953007, |
|
"loss": 1.7855, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.05107167363166809, |
|
"learning_rate": 0.00016528770757516027, |
|
"loss": 1.7331, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.04712466895580292, |
|
"learning_rate": 0.00016514195401211388, |
|
"loss": 1.7048, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.05438878387212753, |
|
"learning_rate": 0.0001649959596491749, |
|
"loss": 1.753, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.04884348064661026, |
|
"learning_rate": 0.00016484972502601753, |
|
"loss": 1.6734, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.0536276139318943, |
|
"learning_rate": 0.00016470325068320392, |
|
"loss": 1.711, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.05346493422985077, |
|
"learning_rate": 0.00016455653716218252, |
|
"loss": 1.7366, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.05044522508978844, |
|
"learning_rate": 0.0001644095850052858, |
|
"loss": 1.7269, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.05273488536477089, |
|
"learning_rate": 0.00016426239475572852, |
|
"loss": 1.7586, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.053452517837285995, |
|
"learning_rate": 0.0001641149669576053, |
|
"loss": 1.7379, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.047611016780138016, |
|
"learning_rate": 0.00016396730215588915, |
|
"loss": 1.7471, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.05317235738039017, |
|
"learning_rate": 0.00016381940089642893, |
|
"loss": 1.6925, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.049223560839891434, |
|
"learning_rate": 0.00016367126372594774, |
|
"loss": 1.7229, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.047821756452322006, |
|
"learning_rate": 0.0001635228911920407, |
|
"loss": 1.7484, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.05013042315840721, |
|
"learning_rate": 0.00016337428384317288, |
|
"loss": 1.7435, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.04820725694298744, |
|
"learning_rate": 0.00016322544222867742, |
|
"loss": 1.7594, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.04791193827986717, |
|
"learning_rate": 0.00016307636689875347, |
|
"loss": 1.644, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.04905365779995918, |
|
"learning_rate": 0.00016292705840446404, |
|
"loss": 1.7144, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.04875028133392334, |
|
"learning_rate": 0.00016277751729773407, |
|
"loss": 1.712, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.05170164629817009, |
|
"learning_rate": 0.0001626277441313484, |
|
"loss": 1.7367, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.05205371975898743, |
|
"learning_rate": 0.00016247773945894962, |
|
"loss": 1.689, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.0485403798520565, |
|
"learning_rate": 0.00016232750383503617, |
|
"loss": 1.706, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.0538201630115509, |
|
"learning_rate": 0.0001621770378149601, |
|
"loss": 1.7284, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.04828377440571785, |
|
"learning_rate": 0.00016202634195492524, |
|
"loss": 1.661, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.050310611724853516, |
|
"learning_rate": 0.000161875416811985, |
|
"loss": 1.6852, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.050804853439331055, |
|
"learning_rate": 0.00016172426294404032, |
|
"loss": 1.7358, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.051962971687316895, |
|
"learning_rate": 0.00016157288090983763, |
|
"loss": 1.6692, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.05179814621806145, |
|
"learning_rate": 0.0001614212712689668, |
|
"loss": 1.6983, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.05398216098546982, |
|
"learning_rate": 0.00016126943458185907, |
|
"loss": 1.7261, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.049869704991579056, |
|
"learning_rate": 0.00016111737140978494, |
|
"loss": 1.6951, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.048107776790857315, |
|
"learning_rate": 0.00016096508231485217, |
|
"loss": 1.6941, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.05527656897902489, |
|
"learning_rate": 0.00016081256786000357, |
|
"loss": 1.7054, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.05169270187616348, |
|
"learning_rate": 0.00016065982860901504, |
|
"loss": 1.7307, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.04972197115421295, |
|
"learning_rate": 0.00016050686512649354, |
|
"loss": 1.6955, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.05033208429813385, |
|
"learning_rate": 0.00016035367797787476, |
|
"loss": 1.7013, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.05073223263025284, |
|
"learning_rate": 0.00016020026772942125, |
|
"loss": 1.6831, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.056367356330156326, |
|
"learning_rate": 0.00016004663494822028, |
|
"loss": 1.6654, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.049483008682727814, |
|
"learning_rate": 0.0001598927802021817, |
|
"loss": 1.7285, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.052070703357458115, |
|
"learning_rate": 0.00015973870406003578, |
|
"loss": 1.7948, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.05687413364648819, |
|
"learning_rate": 0.0001595844070913314, |
|
"loss": 1.7336, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.048987727612257004, |
|
"learning_rate": 0.00015942988986643352, |
|
"loss": 1.6661, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.05027730017900467, |
|
"learning_rate": 0.00015927515295652143, |
|
"loss": 1.7364, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.048406291753053665, |
|
"learning_rate": 0.00015912019693358636, |
|
"loss": 1.6419, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.05071192979812622, |
|
"learning_rate": 0.00015896502237042963, |
|
"loss": 1.6301, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.05111885070800781, |
|
"learning_rate": 0.00015880962984066036, |
|
"loss": 1.7112, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.06297910958528519, |
|
"learning_rate": 0.0001586540199186933, |
|
"loss": 1.7438, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.04950469359755516, |
|
"learning_rate": 0.00015849819317974694, |
|
"loss": 1.6837, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.04900701716542244, |
|
"learning_rate": 0.0001583421501998412, |
|
"loss": 1.7432, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.04949019104242325, |
|
"learning_rate": 0.0001581858915557953, |
|
"loss": 1.688, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.05047097057104111, |
|
"learning_rate": 0.00015802941782522569, |
|
"loss": 1.7256, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.04921870306134224, |
|
"learning_rate": 0.0001578727295865439, |
|
"loss": 1.7723, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.04841122031211853, |
|
"learning_rate": 0.0001577158274189544, |
|
"loss": 1.71, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.04886234924197197, |
|
"learning_rate": 0.00015755871190245251, |
|
"loss": 1.6622, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.04966573417186737, |
|
"learning_rate": 0.00015740138361782207, |
|
"loss": 1.7357, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.050070296972990036, |
|
"learning_rate": 0.0001572438431466336, |
|
"loss": 1.6803, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.054121073335409164, |
|
"learning_rate": 0.00015708609107124177, |
|
"loss": 1.7659, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.05084529519081116, |
|
"learning_rate": 0.00015692812797478368, |
|
"loss": 1.6943, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.056926507502794266, |
|
"learning_rate": 0.0001567699544411763, |
|
"loss": 1.6562, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.05053721368312836, |
|
"learning_rate": 0.00015661157105511457, |
|
"loss": 1.7624, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.048727016896009445, |
|
"learning_rate": 0.00015645297840206915, |
|
"loss": 1.7364, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.051376283168792725, |
|
"learning_rate": 0.00015629417706828423, |
|
"loss": 1.699, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.05029591917991638, |
|
"learning_rate": 0.00015613516764077548, |
|
"loss": 1.6972, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.053968969732522964, |
|
"learning_rate": 0.00015597595070732765, |
|
"loss": 1.7128, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.050694871693849564, |
|
"learning_rate": 0.00015581652685649276, |
|
"loss": 1.7681, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.052369993180036545, |
|
"learning_rate": 0.00015565689667758746, |
|
"loss": 1.7321, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.04850650206208229, |
|
"learning_rate": 0.00015549706076069128, |
|
"loss": 1.7162, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.04979635775089264, |
|
"learning_rate": 0.00015533701969664424, |
|
"loss": 1.7429, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.04920853301882744, |
|
"learning_rate": 0.0001551767740770446, |
|
"loss": 1.7103, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.05081456899642944, |
|
"learning_rate": 0.0001550163244942469, |
|
"loss": 1.7781, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.050754062831401825, |
|
"learning_rate": 0.00015485567154135952, |
|
"loss": 1.7496, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.050315603613853455, |
|
"learning_rate": 0.00015469481581224272, |
|
"loss": 1.7303, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.05050061643123627, |
|
"learning_rate": 0.00015453375790150617, |
|
"loss": 1.679, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.06212810054421425, |
|
"learning_rate": 0.00015437249840450715, |
|
"loss": 1.713, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.050966355949640274, |
|
"learning_rate": 0.00015421103791734786, |
|
"loss": 1.7551, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.04892159253358841, |
|
"learning_rate": 0.00015404937703687363, |
|
"loss": 1.6758, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.05551762133836746, |
|
"learning_rate": 0.00015388751636067052, |
|
"loss": 1.703, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.0516047477722168, |
|
"learning_rate": 0.00015372545648706306, |
|
"loss": 1.7407, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.05094458907842636, |
|
"learning_rate": 0.0001535631980151123, |
|
"loss": 1.6534, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.05045678839087486, |
|
"learning_rate": 0.00015340074154461316, |
|
"loss": 1.7335, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.05067756026983261, |
|
"learning_rate": 0.00015323808767609277, |
|
"loss": 1.7169, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.05005278438329697, |
|
"learning_rate": 0.00015307523701080768, |
|
"loss": 1.7778, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.04952746629714966, |
|
"learning_rate": 0.0001529121901507421, |
|
"loss": 1.7199, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.04711218178272247, |
|
"learning_rate": 0.00015274894769860538, |
|
"loss": 1.734, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.05313078686594963, |
|
"learning_rate": 0.0001525855102578299, |
|
"loss": 1.7733, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.04977120831608772, |
|
"learning_rate": 0.0001524218784325688, |
|
"loss": 1.731, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.05076899752020836, |
|
"learning_rate": 0.00015225805282769383, |
|
"loss": 1.7277, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.049164701253175735, |
|
"learning_rate": 0.00015209403404879303, |
|
"loss": 1.7032, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.0488349013030529, |
|
"learning_rate": 0.00015192982270216854, |
|
"loss": 1.765, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.04831582307815552, |
|
"learning_rate": 0.0001517654193948343, |
|
"loss": 1.7548, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.052940741181373596, |
|
"learning_rate": 0.00015160082473451378, |
|
"loss": 1.7209, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.056908875703811646, |
|
"learning_rate": 0.00015143603932963795, |
|
"loss": 1.6537, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.0509711354970932, |
|
"learning_rate": 0.00015127106378934273, |
|
"loss": 1.7151, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.04795239865779877, |
|
"learning_rate": 0.000151105898723467, |
|
"loss": 1.743, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.7236659526824951, |
|
"eval_runtime": 76.6784, |
|
"eval_samples_per_second": 65.207, |
|
"eval_steps_per_second": 16.302, |
|
"step": 548 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1644, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 548, |
|
"total_flos": 1.6352549111448207e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |