{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.6199095022624435,
  "eval_steps": 1000,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01809954751131222,
      "grad_norm": 4.442032814025879,
      "learning_rate": 0.0,
      "loss": 1.497,
      "step": 1
    },
    {
      "epoch": 0.03619909502262444,
      "grad_norm": 4.569097995758057,
      "learning_rate": 1.5051499783199055e-07,
      "loss": 1.4506,
      "step": 2
    },
    {
      "epoch": 0.05429864253393665,
      "grad_norm": 3.9136922359466553,
      "learning_rate": 2.385606273598312e-07,
      "loss": 1.2728,
      "step": 3
    },
    {
      "epoch": 0.07239819004524888,
      "grad_norm": 3.9136922359466553,
      "learning_rate": 2.385606273598312e-07,
      "loss": 1.587,
      "step": 4
    },
    {
      "epoch": 0.09049773755656108,
      "grad_norm": 3.9136922359466553,
      "learning_rate": 2.385606273598312e-07,
      "loss": 1.408,
      "step": 5
    },
    {
      "epoch": 0.1085972850678733,
      "grad_norm": 4.174969673156738,
      "learning_rate": 3.010299956639811e-07,
      "loss": 1.3948,
      "step": 6
    },
    {
      "epoch": 0.12669683257918551,
      "grad_norm": 4.174969673156738,
      "learning_rate": 3.010299956639811e-07,
      "loss": 1.515,
      "step": 7
    },
    {
      "epoch": 0.14479638009049775,
      "grad_norm": 4.5925774574279785,
      "learning_rate": 3.494850021680093e-07,
      "loss": 1.5149,
      "step": 8
    },
    {
      "epoch": 0.16289592760180996,
      "grad_norm": 4.349692344665527,
      "learning_rate": 3.8907562519182173e-07,
      "loss": 1.5055,
      "step": 9
    },
    {
      "epoch": 0.18099547511312217,
      "grad_norm": 5.4546027183532715,
      "learning_rate": 4.2254902000712834e-07,
      "loss": 1.4242,
      "step": 10
    },
    {
      "epoch": 0.19909502262443438,
      "grad_norm": 4.096962928771973,
      "learning_rate": 4.5154499349597166e-07,
      "loss": 1.4094,
      "step": 11
    },
    {
      "epoch": 0.2171945701357466,
      "grad_norm": 5.804754734039307,
      "learning_rate": 4.771212547196623e-07,
      "loss": 1.6699,
      "step": 12
    },
    {
      "epoch": 0.23529411764705882,
      "grad_norm": 4.950932502746582,
      "learning_rate": 4.999999999999999e-07,
      "loss": 1.4682,
      "step": 13
    },
    {
      "epoch": 0.25339366515837103,
      "grad_norm": 4.728433609008789,
      "learning_rate": 5.206963425791124e-07,
      "loss": 1.3831,
      "step": 14
    },
    {
      "epoch": 0.27149321266968324,
      "grad_norm": 4.042943000793457,
      "learning_rate": 5.395906230238123e-07,
      "loss": 1.4438,
      "step": 15
    },
    {
      "epoch": 0.2895927601809955,
      "grad_norm": 3.768977642059326,
      "learning_rate": 5.569716761534182e-07,
      "loss": 1.3996,
      "step": 16
    },
    {
      "epoch": 0.3076923076923077,
      "grad_norm": 4.375296115875244,
      "learning_rate": 5.730640178391189e-07,
      "loss": 1.3065,
      "step": 17
    },
    {
      "epoch": 0.3257918552036199,
      "grad_norm": 3.959031105041504,
      "learning_rate": 5.880456295278405e-07,
      "loss": 1.4533,
      "step": 18
    },
    {
      "epoch": 0.3438914027149321,
      "grad_norm": 4.150556564331055,
      "learning_rate": 6.020599913279622e-07,
      "loss": 1.4256,
      "step": 19
    },
    {
      "epoch": 0.36199095022624433,
      "grad_norm": 5.503087520599365,
      "learning_rate": 6.15224460689137e-07,
      "loss": 1.3212,
      "step": 20
    },
    {
      "epoch": 0.38009049773755654,
      "grad_norm": 5.915998935699463,
      "learning_rate": 6.276362525516529e-07,
      "loss": 1.4347,
      "step": 21
    },
    {
      "epoch": 0.39819004524886875,
      "grad_norm": 3.7581424713134766,
      "learning_rate": 6.393768004764143e-07,
      "loss": 1.3012,
      "step": 22
    },
    {
      "epoch": 0.416289592760181,
      "grad_norm": 3.728571653366089,
      "learning_rate": 6.505149978319905e-07,
      "loss": 1.3426,
      "step": 23
    },
    {
      "epoch": 0.4343891402714932,
      "grad_norm": 4.000448226928711,
      "learning_rate": 6.611096473669595e-07,
      "loss": 1.3064,
      "step": 24
    },
    {
      "epoch": 0.45248868778280543,
      "grad_norm": 3.8997511863708496,
      "learning_rate": 6.712113404111031e-07,
      "loss": 1.4217,
      "step": 25
    },
    {
      "epoch": 0.47058823529411764,
      "grad_norm": 3.4601259231567383,
      "learning_rate": 6.808639180087963e-07,
      "loss": 1.278,
      "step": 26
    },
    {
      "epoch": 0.48868778280542985,
      "grad_norm": 3.3370354175567627,
      "learning_rate": 6.901056208558029e-07,
      "loss": 1.402,
      "step": 27
    },
    {
      "epoch": 0.5067873303167421,
      "grad_norm": 4.348806381225586,
      "learning_rate": 6.989700043360186e-07,
      "loss": 1.3881,
      "step": 28
    },
    {
      "epoch": 0.5248868778280543,
      "grad_norm": 3.3785560131073,
      "learning_rate": 7.074866739854088e-07,
      "loss": 1.2949,
      "step": 29
    },
    {
      "epoch": 0.5429864253393665,
      "grad_norm": 3.269310474395752,
      "learning_rate": 7.156818820794935e-07,
      "loss": 1.3877,
      "step": 30
    },
    {
      "epoch": 0.5610859728506787,
      "grad_norm": 3.2553048133850098,
      "learning_rate": 7.235790156711094e-07,
      "loss": 1.3625,
      "step": 31
    },
    {
      "epoch": 0.579185520361991,
      "grad_norm": 4.357567310333252,
      "learning_rate": 7.311989989494779e-07,
      "loss": 1.2856,
      "step": 32
    },
    {
      "epoch": 0.5972850678733032,
      "grad_norm": 3.816030263900757,
      "learning_rate": 7.38560627359831e-07,
      "loss": 1.2666,
      "step": 33
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 3.816030263900757,
      "learning_rate": 7.38560627359831e-07,
      "loss": 1.288,
      "step": 34
    },
    {
      "epoch": 0.6334841628959276,
      "grad_norm": 3.095264434814453,
      "learning_rate": 7.456808469171361e-07,
      "loss": 1.3014,
      "step": 35
    },
    {
      "epoch": 0.6515837104072398,
      "grad_norm": 5.308295726776123,
      "learning_rate": 7.525749891599529e-07,
      "loss": 1.3896,
      "step": 36
    },
    {
      "epoch": 0.669683257918552,
      "grad_norm": 3.2766010761260986,
      "learning_rate": 7.592569699389436e-07,
      "loss": 1.3191,
      "step": 37
    },
    {
      "epoch": 0.6877828054298643,
      "grad_norm": 3.6091532707214355,
      "learning_rate": 7.657394585211274e-07,
      "loss": 1.3227,
      "step": 38
    },
    {
      "epoch": 0.7058823529411765,
      "grad_norm": 4.882140159606934,
      "learning_rate": 7.720340221751376e-07,
      "loss": 1.2763,
      "step": 39
    },
    {
      "epoch": 0.7239819004524887,
      "grad_norm": 3.5777502059936523,
      "learning_rate": 7.781512503836435e-07,
      "loss": 1.2811,
      "step": 40
    },
    {
      "epoch": 0.7420814479638009,
      "grad_norm": 3.520531415939331,
      "learning_rate": 7.841008620334974e-07,
      "loss": 1.3677,
      "step": 41
    },
    {
      "epoch": 0.7601809954751131,
      "grad_norm": 4.111598014831543,
      "learning_rate": 7.89891798308405e-07,
      "loss": 1.3009,
      "step": 42
    },
    {
      "epoch": 0.7782805429864253,
      "grad_norm": 3.3306169509887695,
      "learning_rate": 7.955323035132494e-07,
      "loss": 1.1605,
      "step": 43
    },
    {
      "epoch": 0.7963800904977375,
      "grad_norm": 3.112687349319458,
      "learning_rate": 8.01029995663981e-07,
      "loss": 1.2589,
      "step": 44
    },
    {
      "epoch": 0.8144796380090498,
      "grad_norm": 3.7117624282836914,
      "learning_rate": 8.063919283598676e-07,
      "loss": 1.2737,
      "step": 45
    },
    {
      "epoch": 0.832579185520362,
      "grad_norm": 4.166692733764648,
      "learning_rate": 8.116246451989502e-07,
      "loss": 1.2353,
      "step": 46
    },
    {
      "epoch": 0.8506787330316742,
      "grad_norm": 6.097745895385742,
      "learning_rate": 8.16734227789793e-07,
      "loss": 1.221,
      "step": 47
    },
    {
      "epoch": 0.8687782805429864,
      "grad_norm": 3.3959226608276367,
      "learning_rate": 8.217263382430935e-07,
      "loss": 1.2574,
      "step": 48
    },
    {
      "epoch": 0.8868778280542986,
      "grad_norm": 4.395689964294434,
      "learning_rate": 8.266062568876716e-07,
      "loss": 1.1438,
      "step": 49
    },
    {
      "epoch": 0.9049773755656109,
      "grad_norm": 3.479315757751465,
      "learning_rate": 8.313789158407869e-07,
      "loss": 1.2905,
      "step": 50
    },
    {
      "epoch": 0.9230769230769231,
      "grad_norm": 3.4640543460845947,
      "learning_rate": 8.360489289678585e-07,
      "loss": 1.2397,
      "step": 51
    },
    {
      "epoch": 0.9411764705882353,
      "grad_norm": 5.382558822631836,
      "learning_rate": 8.406206186877934e-07,
      "loss": 1.2131,
      "step": 52
    },
    {
      "epoch": 0.9592760180995475,
      "grad_norm": 5.364609718322754,
      "learning_rate": 8.450980400142567e-07,
      "loss": 1.2098,
      "step": 53
    },
    {
      "epoch": 0.9773755656108597,
      "grad_norm": 3.814605951309204,
      "learning_rate": 8.494850021680092e-07,
      "loss": 1.2415,
      "step": 54
    },
    {
      "epoch": 0.995475113122172,
      "grad_norm": 2.9692606925964355,
      "learning_rate": 8.53785088048968e-07,
      "loss": 1.1955,
      "step": 55
    },
    {
      "epoch": 1.0135746606334841,
      "grad_norm": 3.130876302719116,
      "learning_rate": 8.580016718173995e-07,
      "loss": 1.158,
      "step": 56
    },
    {
      "epoch": 1.0316742081447963,
      "grad_norm": 3.4549965858459473,
      "learning_rate": 8.621379348003944e-07,
      "loss": 1.2838,
      "step": 57
    },
    {
      "epoch": 1.0497737556561086,
      "grad_norm": 3.4735677242279053,
      "learning_rate": 8.661968799114842e-07,
      "loss": 1.0405,
      "step": 58
    },
    {
      "epoch": 1.0678733031674208,
      "grad_norm": 4.529227256774902,
      "learning_rate": 8.701813447471218e-07,
      "loss": 1.1188,
      "step": 59
    },
    {
      "epoch": 1.085972850678733,
      "grad_norm": 3.53712797164917,
      "learning_rate": 8.740940135031001e-07,
      "loss": 1.2113,
      "step": 60
    },
    {
      "epoch": 1.1040723981900453,
      "grad_norm": 3.5053398609161377,
      "learning_rate": 8.779374278362456e-07,
      "loss": 1.197,
      "step": 61
    },
    {
      "epoch": 1.1221719457013575,
      "grad_norm": 2.57523512840271,
      "learning_rate": 8.817139967814684e-07,
      "loss": 1.0894,
      "step": 62
    },
    {
      "epoch": 1.1402714932126696,
      "grad_norm": 3.3335084915161133,
      "learning_rate": 8.854260058210719e-07,
      "loss": 1.026,
      "step": 63
    },
    {
      "epoch": 1.1583710407239818,
      "grad_norm": 3.1159112453460693,
      "learning_rate": 8.890756251918216e-07,
      "loss": 1.1336,
      "step": 64
    },
    {
      "epoch": 1.1764705882352942,
      "grad_norm": 4.9009294509887695,
      "learning_rate": 8.926649175053833e-07,
      "loss": 1.1167,
      "step": 65
    },
    {
      "epoch": 1.1945701357466063,
      "grad_norm": 3.9954421520233154,
      "learning_rate": 8.961958447491268e-07,
      "loss": 1.0955,
      "step": 66
    },
    {
      "epoch": 1.2126696832579185,
      "grad_norm": 4.976899147033691,
      "learning_rate": 8.996702747267907e-07,
      "loss": 1.1722,
      "step": 67
    },
    {
      "epoch": 1.2307692307692308,
      "grad_norm": 4.318509101867676,
      "learning_rate": 9.030899869919433e-07,
      "loss": 1.0919,
      "step": 68
    },
    {
      "epoch": 1.248868778280543,
      "grad_norm": 3.509249210357666,
      "learning_rate": 9.064566783214276e-07,
      "loss": 1.1013,
      "step": 69
    },
    {
      "epoch": 1.2669683257918551,
      "grad_norm": 2.86033034324646,
      "learning_rate": 9.097719677709341e-07,
      "loss": 1.0721,
      "step": 70
    },
    {
      "epoch": 1.2850678733031673,
      "grad_norm": 2.95745587348938,
      "learning_rate": 9.13037401350413e-07,
      "loss": 1.1235,
      "step": 71
    },
    {
      "epoch": 1.3031674208144797,
      "grad_norm": 3.4295952320098877,
      "learning_rate": 9.162544563531181e-07,
      "loss": 1.1075,
      "step": 72
    },
    {
      "epoch": 1.3212669683257918,
      "grad_norm": 4.3247575759887695,
      "learning_rate": 9.194245453686276e-07,
      "loss": 1.0976,
      "step": 73
    },
    {
      "epoch": 1.3393665158371042,
      "grad_norm": 3.853940010070801,
      "learning_rate": 9.225490200071283e-07,
      "loss": 0.997,
      "step": 74
    },
    {
      "epoch": 1.3574660633484164,
      "grad_norm": 3.1195785999298096,
      "learning_rate": 9.256291743595375e-07,
      "loss": 1.1499,
      "step": 75
    },
    {
      "epoch": 1.3755656108597285,
      "grad_norm": 3.3039655685424805,
      "learning_rate": 9.28666248215634e-07,
      "loss": 1.0822,
      "step": 76
    },
    {
      "epoch": 1.3936651583710407,
      "grad_norm": 2.9753613471984863,
      "learning_rate": 9.316614300602277e-07,
      "loss": 1.0524,
      "step": 77
    },
    {
      "epoch": 1.4117647058823528,
      "grad_norm": 2.7271316051483154,
      "learning_rate": 9.346158598654879e-07,
      "loss": 1.0233,
      "step": 78
    },
    {
      "epoch": 1.4298642533936652,
      "grad_norm": 5.75250244140625,
      "learning_rate": 9.375306316958498e-07,
      "loss": 1.0652,
      "step": 79
    },
    {
      "epoch": 1.4479638009049773,
      "grad_norm": 3.271881341934204,
      "learning_rate": 9.404067961403955e-07,
      "loss": 0.9659,
      "step": 80
    },
    {
      "epoch": 1.4660633484162897,
      "grad_norm": 3.8478872776031494,
      "learning_rate": 9.432453625862408e-07,
      "loss": 1.1446,
      "step": 81
    },
    {
      "epoch": 1.4841628959276019,
      "grad_norm": 2.925736904144287,
      "learning_rate": 9.4604730134524e-07,
      "loss": 0.9846,
      "step": 82
    },
    {
      "epoch": 1.502262443438914,
      "grad_norm": 2.8262815475463867,
      "learning_rate": 9.488135456452205e-07,
      "loss": 0.9395,
      "step": 83
    },
    {
      "epoch": 1.5203619909502262,
      "grad_norm": 4.063136100769043,
      "learning_rate": 9.515449934959715e-07,
      "loss": 1.0066,
      "step": 84
    },
    {
      "epoch": 1.5384615384615383,
      "grad_norm": 2.5695972442626953,
      "learning_rate": 9.542425094393247e-07,
      "loss": 0.9592,
      "step": 85
    },
    {
      "epoch": 1.5565610859728507,
      "grad_norm": 3.1798267364501953,
      "learning_rate": 9.569069261918583e-07,
      "loss": 0.9213,
      "step": 86
    },
    {
      "epoch": 1.5746606334841629,
      "grad_norm": 6.351828098297119,
      "learning_rate": 9.59539046188037e-07,
      "loss": 0.9416,
      "step": 87
    },
    {
      "epoch": 1.5927601809954752,
      "grad_norm": 2.9044878482818604,
      "learning_rate": 9.621396430309406e-07,
      "loss": 0.9626,
      "step": 88
    },
    {
      "epoch": 1.6108597285067874,
      "grad_norm": 3.130195140838623,
      "learning_rate": 9.647094628571462e-07,
      "loss": 0.9767,
      "step": 89
    },
    {
      "epoch": 1.6289592760180995,
      "grad_norm": 6.14766263961792,
      "learning_rate": 9.672492256217836e-07,
      "loss": 1.029,
      "step": 90
    },
    {
      "epoch": 1.6470588235294117,
      "grad_norm": 3.2492759227752686,
      "learning_rate": 9.69759626309309e-07,
      "loss": 0.9621,
      "step": 91
    },
    {
      "epoch": 1.6651583710407238,
      "grad_norm": 2.8760759830474854,
      "learning_rate": 9.722413360750842e-07,
      "loss": 1.0525,
      "step": 92
    },
    {
      "epoch": 1.6832579185520362,
      "grad_norm": 2.910680055618286,
      "learning_rate": 9.74695003322456e-07,
      "loss": 0.8777,
      "step": 93
    },
    {
      "epoch": 1.7013574660633484,
      "grad_norm": 3.2539334297180176,
      "learning_rate": 9.771212547196622e-07,
      "loss": 0.9407,
      "step": 94
    },
    {
      "epoch": 1.7194570135746607,
      "grad_norm": 2.7406673431396484,
      "learning_rate": 9.795206961605466e-07,
      "loss": 0.9077,
      "step": 95
    },
    {
      "epoch": 1.737556561085973,
      "grad_norm": 3.0724148750305176,
      "learning_rate": 9.818939136727774e-07,
      "loss": 0.8289,
      "step": 96
    },
    {
      "epoch": 1.755656108597285,
      "grad_norm": 3.2282042503356934,
      "learning_rate": 9.842414742769674e-07,
      "loss": 0.9638,
      "step": 97
    },
    {
      "epoch": 1.7737556561085972,
      "grad_norm": 6.185736656188965,
      "learning_rate": 9.865639267998492e-07,
      "loss": 0.9356,
      "step": 98
    },
    {
      "epoch": 1.7918552036199094,
      "grad_norm": 3.5250043869018555,
      "learning_rate": 9.888618026444236e-07,
      "loss": 0.8963,
      "step": 99
    },
    {
      "epoch": 1.8099547511312217,
      "grad_norm": 3.418933629989624,
      "learning_rate": 9.91135616519784e-07,
      "loss": 0.9555,
      "step": 100
    },
    {
      "epoch": 1.8280542986425339,
      "grad_norm": 2.943357467651367,
      "learning_rate": 9.933858671331222e-07,
      "loss": 0.8866,
      "step": 101
    },
    {
      "epoch": 1.8461538461538463,
      "grad_norm": 3.657294511795044,
      "learning_rate": 9.956130378462473e-07,
      "loss": 0.9626,
      "step": 102
    },
    {
      "epoch": 1.8642533936651584,
      "grad_norm": 3.3619449138641357,
      "learning_rate": 9.978175972987748e-07,
      "loss": 0.8601,
      "step": 103
    },
    {
      "epoch": 1.8823529411764706,
      "grad_norm": 3.6125662326812744,
      "learning_rate": 9.999999999999997e-07,
      "loss": 0.8527,
      "step": 104
    },
    {
      "epoch": 1.9004524886877827,
      "grad_norm": 3.5924038887023926,
      "learning_rate": 1e-06,
      "loss": 0.879,
      "step": 105
    },
    {
      "epoch": 1.9185520361990949,
      "grad_norm": 2.770998001098633,
      "learning_rate": 1e-06,
      "loss": 0.8364,
      "step": 106
    },
    {
      "epoch": 1.9366515837104072,
      "grad_norm": 3.2162160873413086,
      "learning_rate": 1e-06,
      "loss": 0.8591,
      "step": 107
    },
    {
      "epoch": 1.9547511312217196,
      "grad_norm": 5.3574910163879395,
      "learning_rate": 1e-06,
      "loss": 0.9122,
      "step": 108
    },
    {
      "epoch": 1.9728506787330318,
      "grad_norm": 3.564077138900757,
      "learning_rate": 1e-06,
      "loss": 0.9706,
      "step": 109
    },
    {
      "epoch": 1.990950226244344,
      "grad_norm": 3.347325086593628,
      "learning_rate": 1e-06,
      "loss": 0.7614,
      "step": 110
    },
    {
      "epoch": 2.009049773755656,
      "grad_norm": 2.967604637145996,
      "learning_rate": 1e-06,
      "loss": 0.8534,
      "step": 111
    },
    {
      "epoch": 2.0271493212669682,
      "grad_norm": 2.7884786128997803,
      "learning_rate": 1e-06,
      "loss": 0.8786,
      "step": 112
    },
    {
      "epoch": 2.0452488687782804,
      "grad_norm": 3.3380022048950195,
      "learning_rate": 1e-06,
      "loss": 0.7821,
      "step": 113
    },
    {
      "epoch": 2.0633484162895925,
      "grad_norm": 2.7749805450439453,
      "learning_rate": 1e-06,
      "loss": 0.749,
      "step": 114
    },
    {
      "epoch": 2.081447963800905,
      "grad_norm": 2.6747641563415527,
      "learning_rate": 1e-06,
      "loss": 0.7501,
      "step": 115
    },
    {
      "epoch": 2.0995475113122173,
      "grad_norm": 3.078357458114624,
      "learning_rate": 1e-06,
      "loss": 0.7979,
      "step": 116
    },
    {
      "epoch": 2.1176470588235294,
      "grad_norm": 5.836832523345947,
      "learning_rate": 1e-06,
      "loss": 0.7917,
      "step": 117
    },
    {
      "epoch": 2.1357466063348416,
      "grad_norm": 6.190191745758057,
      "learning_rate": 1e-06,
      "loss": 0.8743,
      "step": 118
    },
    {
      "epoch": 2.1538461538461537,
      "grad_norm": 3.3209612369537354,
      "learning_rate": 1e-06,
      "loss": 0.7989,
      "step": 119
    },
    {
      "epoch": 2.171945701357466,
      "grad_norm": 3.5375821590423584,
      "learning_rate": 1e-06,
      "loss": 0.7356,
      "step": 120
    },
    {
      "epoch": 2.1900452488687785,
      "grad_norm": 2.635437488555908,
      "learning_rate": 1e-06,
      "loss": 0.7602,
      "step": 121
    },
    {
      "epoch": 2.2081447963800906,
      "grad_norm": 4.8566460609436035,
      "learning_rate": 1e-06,
      "loss": 0.7611,
      "step": 122
    },
    {
      "epoch": 2.226244343891403,
      "grad_norm": 3.1864778995513916,
      "learning_rate": 1e-06,
      "loss": 0.767,
      "step": 123
    },
    {
      "epoch": 2.244343891402715,
      "grad_norm": 4.00085973739624,
      "learning_rate": 1e-06,
      "loss": 0.7164,
      "step": 124
    },
    {
      "epoch": 2.262443438914027,
      "grad_norm": 3.608243465423584,
      "learning_rate": 1e-06,
      "loss": 0.8259,
      "step": 125
    },
    {
      "epoch": 2.2805429864253393,
      "grad_norm": 2.6522486209869385,
      "learning_rate": 1e-06,
      "loss": 0.6852,
      "step": 126
    },
    {
      "epoch": 2.2986425339366514,
      "grad_norm": 3.137711524963379,
      "learning_rate": 1e-06,
      "loss": 0.7703,
      "step": 127
    },
    {
      "epoch": 2.3167420814479636,
      "grad_norm": 3.867400884628296,
      "learning_rate": 1e-06,
      "loss": 0.7472,
      "step": 128
    },
    {
      "epoch": 2.334841628959276,
      "grad_norm": 3.147169351577759,
      "learning_rate": 1e-06,
      "loss": 0.6673,
      "step": 129
    },
    {
      "epoch": 2.3529411764705883,
      "grad_norm": 5.177737236022949,
      "learning_rate": 1e-06,
      "loss": 0.7478,
      "step": 130
    },
    {
      "epoch": 2.3710407239819005,
      "grad_norm": 4.302467346191406,
      "learning_rate": 1e-06,
      "loss": 0.7025,
      "step": 131
    },
    {
      "epoch": 2.3891402714932126,
      "grad_norm": 3.5397591590881348,
      "learning_rate": 1e-06,
      "loss": 0.6615,
      "step": 132
    },
    {
      "epoch": 2.4072398190045248,
      "grad_norm": 5.074939250946045,
      "learning_rate": 1e-06,
      "loss": 0.7043,
      "step": 133
    },
    {
      "epoch": 2.425339366515837,
      "grad_norm": 7.0850090980529785,
      "learning_rate": 1e-06,
      "loss": 0.6587,
      "step": 134
    },
    {
      "epoch": 2.4434389140271495,
      "grad_norm": 3.6032073497772217,
      "learning_rate": 1e-06,
      "loss": 0.6726,
      "step": 135
    },
    {
      "epoch": 2.4615384615384617,
      "grad_norm": 2.707559108734131,
      "learning_rate": 1e-06,
      "loss": 0.6529,
      "step": 136
    },
    {
      "epoch": 2.479638009049774,
      "grad_norm": 3.5912561416625977,
      "learning_rate": 1e-06,
      "loss": 0.6655,
      "step": 137
    },
    {
      "epoch": 2.497737556561086,
      "grad_norm": 3.537712335586548,
      "learning_rate": 1e-06,
      "loss": 0.6617,
      "step": 138
    },
    {
      "epoch": 2.515837104072398,
      "grad_norm": 8.936192512512207,
      "learning_rate": 1e-06,
      "loss": 0.6757,
      "step": 139
    },
    {
      "epoch": 2.5339366515837103,
      "grad_norm": 2.4215872287750244,
      "learning_rate": 1e-06,
      "loss": 0.6143,
      "step": 140
    },
    {
      "epoch": 2.5520361990950224,
      "grad_norm": 3.1526424884796143,
      "learning_rate": 1e-06,
      "loss": 0.6308,
      "step": 141
    },
    {
      "epoch": 2.5701357466063346,
      "grad_norm": 4.358147144317627,
      "learning_rate": 1e-06,
      "loss": 0.7556,
      "step": 142
    },
    {
      "epoch": 2.588235294117647,
      "grad_norm": 3.423694610595703,
      "learning_rate": 1e-06,
      "loss": 0.5853,
      "step": 143
    },
    {
      "epoch": 2.6063348416289593,
      "grad_norm": 3.792518138885498,
      "learning_rate": 1e-06,
      "loss": 0.6154,
      "step": 144
    },
    {
      "epoch": 2.6244343891402715,
      "grad_norm": 7.347434997558594,
      "learning_rate": 1e-06,
      "loss": 0.5983,
      "step": 145
    },
    {
      "epoch": 2.6425339366515836,
      "grad_norm": 3.5371973514556885,
      "learning_rate": 1e-06,
      "loss": 0.6173,
      "step": 146
    },
    {
      "epoch": 2.660633484162896,
      "grad_norm": 3.084972620010376,
      "learning_rate": 1e-06,
      "loss": 0.5556,
      "step": 147
    },
    {
      "epoch": 2.6787330316742084,
      "grad_norm": 5.569381237030029,
      "learning_rate": 1e-06,
      "loss": 0.6146,
      "step": 148
    },
    {
      "epoch": 2.6968325791855206,
      "grad_norm": 3.270057201385498,
      "learning_rate": 1e-06,
      "loss": 0.5938,
      "step": 149
    },
    {
      "epoch": 2.7149321266968327,
      "grad_norm": 3.9021122455596924,
      "learning_rate": 1e-06,
      "loss": 0.532,
      "step": 150
    },
    {
      "epoch": 2.733031674208145,
      "grad_norm": 3.702974319458008,
      "learning_rate": 1e-06,
      "loss": 0.5419,
      "step": 151
    },
    {
      "epoch": 2.751131221719457,
      "grad_norm": 3.640477418899536,
      "learning_rate": 1e-06,
      "loss": 0.5927,
      "step": 152
    },
    {
      "epoch": 2.769230769230769,
      "grad_norm": 4.119628429412842,
      "learning_rate": 1e-06,
      "loss": 0.6556,
      "step": 153
    },
    {
      "epoch": 2.7873303167420813,
      "grad_norm": 4.196484565734863,
      "learning_rate": 1e-06,
      "loss": 0.5473,
      "step": 154
    },
    {
      "epoch": 2.8054298642533935,
      "grad_norm": 3.049004316329956,
      "learning_rate": 1e-06,
      "loss": 0.5382,
      "step": 155
    },
    {
      "epoch": 2.8235294117647056,
      "grad_norm": 3.8105475902557373,
      "learning_rate": 1e-06,
      "loss": 0.5667,
      "step": 156
    },
    {
      "epoch": 2.841628959276018,
      "grad_norm": 7.120466232299805,
      "learning_rate": 1e-06,
      "loss": 0.5815,
      "step": 157
    },
    {
      "epoch": 2.8597285067873304,
      "grad_norm": 2.96946120262146,
      "learning_rate": 1e-06,
      "loss": 0.5488,
      "step": 158
    },
    {
      "epoch": 2.8778280542986425,
      "grad_norm": 3.927828073501587,
      "learning_rate": 1e-06,
      "loss": 0.5051,
      "step": 159
    },
    {
      "epoch": 2.8959276018099547,
      "grad_norm": 3.6861846446990967,
      "learning_rate": 1e-06,
      "loss": 0.5683,
      "step": 160
    },
    {
      "epoch": 2.914027149321267,
      "grad_norm": 3.383025646209717,
      "learning_rate": 1e-06,
      "loss": 0.4956,
      "step": 161
    },
    {
      "epoch": 2.9321266968325794,
      "grad_norm": 3.9769487380981445,
      "learning_rate": 1e-06,
      "loss": 0.5101,
      "step": 162
    },
    {
      "epoch": 2.9502262443438916,
      "grad_norm": 3.262488842010498,
      "learning_rate": 1e-06,
      "loss": 0.5589,
      "step": 163
    },
    {
      "epoch": 2.9683257918552037,
      "grad_norm": 3.582789182662964,
      "learning_rate": 1e-06,
      "loss": 0.5064,
      "step": 164
    },
    {
      "epoch": 2.986425339366516,
      "grad_norm": 3.441208839416504,
      "learning_rate": 1e-06,
      "loss": 0.4696,
      "step": 165
    },
    {
      "epoch": 3.004524886877828,
      "grad_norm": 4.338072299957275,
      "learning_rate": 1e-06,
      "loss": 0.5083,
      "step": 166
    },
    {
      "epoch": 3.02262443438914,
      "grad_norm": 3.537062644958496,
      "learning_rate": 1e-06,
      "loss": 0.5328,
      "step": 167
    },
    {
      "epoch": 3.0407239819004523,
      "grad_norm": 3.4206771850585938,
      "learning_rate": 1e-06,
      "loss": 0.4615,
      "step": 168
    },
    {
      "epoch": 3.0588235294117645,
      "grad_norm": 3.4206771850585938,
      "learning_rate": 1e-06,
      "loss": 0.4958,
      "step": 169
    },
    {
      "epoch": 3.076923076923077,
      "grad_norm": 3.015188694000244,
      "learning_rate": 1e-06,
      "loss": 0.5143,
      "step": 170
    },
    {
      "epoch": 3.0950226244343892,
      "grad_norm": 3.314436435699463,
      "learning_rate": 1e-06,
      "loss": 0.4972,
      "step": 171
    },
    {
      "epoch": 3.1131221719457014,
      "grad_norm": 4.298553466796875,
      "learning_rate": 1e-06,
      "loss": 0.4794,
      "step": 172
    },
    {
      "epoch": 3.1312217194570136,
      "grad_norm": 4.354773044586182,
      "learning_rate": 1e-06,
      "loss": 0.4287,
      "step": 173
    },
    {
      "epoch": 3.1493212669683257,
      "grad_norm": 3.558988332748413,
      "learning_rate": 1e-06,
      "loss": 0.4092,
      "step": 174
    },
    {
      "epoch": 3.167420814479638,
      "grad_norm": 5.127811431884766,
      "learning_rate": 1e-06,
      "loss": 0.4465,
      "step": 175
    },
    {
      "epoch": 3.1855203619909505,
      "grad_norm": 2.5903167724609375,
      "learning_rate": 1e-06,
      "loss": 0.4119,
      "step": 176
    },
    {
      "epoch": 3.2036199095022626,
      "grad_norm": 3.8117196559906006,
      "learning_rate": 1e-06,
      "loss": 0.4529,
      "step": 177
    },
    {
      "epoch": 3.2217194570135748,
      "grad_norm": 3.164416551589966,
      "learning_rate": 1e-06,
      "loss": 0.4856,
      "step": 178
    },
    {
      "epoch": 3.239819004524887,
      "grad_norm": 4.106442451477051,
      "learning_rate": 1e-06,
      "loss": 0.4312,
      "step": 179
    },
    {
      "epoch": 3.257918552036199,
      "grad_norm": 3.762195110321045,
      "learning_rate": 1e-06,
      "loss": 0.411,
      "step": 180
    },
    {
      "epoch": 3.276018099547511,
      "grad_norm": 3.0889034271240234,
      "learning_rate": 1e-06,
      "loss": 0.3977,
      "step": 181
    },
    {
      "epoch": 3.2941176470588234,
      "grad_norm": 3.6932199001312256,
      "learning_rate": 1e-06,
      "loss": 0.4528,
      "step": 182
    },
    {
      "epoch": 3.3122171945701355,
      "grad_norm": 3.405756711959839,
      "learning_rate": 1e-06,
      "loss": 0.4042,
      "step": 183
    },
    {
      "epoch": 3.330316742081448,
      "grad_norm": 5.229561805725098,
      "learning_rate": 1e-06,
      "loss": 0.4171,
      "step": 184
    },
    {
      "epoch": 3.3484162895927603,
      "grad_norm": 5.6756439208984375,
      "learning_rate": 1e-06,
      "loss": 0.3891,
      "step": 185
    },
    {
      "epoch": 3.3665158371040724,
      "grad_norm": 3.706697940826416,
      "learning_rate": 1e-06,
      "loss": 0.3673,
      "step": 186
    },
    {
      "epoch": 3.3846153846153846,
      "grad_norm": 2.9856503009796143,
      "learning_rate": 1e-06,
      "loss": 0.3989,
      "step": 187
    },
    {
      "epoch": 3.4027149321266967,
      "grad_norm": 3.5144922733306885,
      "learning_rate": 1e-06,
      "loss": 0.3885,
      "step": 188
    },
    {
      "epoch": 3.420814479638009,
      "grad_norm": 5.046453475952148,
      "learning_rate": 1e-06,
      "loss": 0.402,
      "step": 189
    },
    {
      "epoch": 3.4389140271493215,
      "grad_norm": 4.306224822998047,
      "learning_rate": 1e-06,
      "loss": 0.3568,
      "step": 190
    },
    {
      "epoch": 3.4570135746606336,
      "grad_norm": 2.7284157276153564,
      "learning_rate": 1e-06,
      "loss": 0.4109,
      "step": 191
    },
    {
      "epoch": 3.475113122171946,
      "grad_norm": 5.036966323852539,
      "learning_rate": 1e-06,
      "loss": 0.3958,
      "step": 192
    },
    {
      "epoch": 3.493212669683258,
      "grad_norm": 2.3863677978515625,
      "learning_rate": 1e-06,
      "loss": 0.3674,
      "step": 193
    },
    {
      "epoch": 3.51131221719457,
      "grad_norm": 3.3838043212890625,
      "learning_rate": 1e-06,
      "loss": 0.3627,
      "step": 194
    },
    {
      "epoch": 3.5294117647058822,
      "grad_norm": 6.964299201965332,
      "learning_rate": 1e-06,
      "loss": 0.405,
      "step": 195
    },
    {
      "epoch": 3.5475113122171944,
      "grad_norm": 2.7131292819976807,
      "learning_rate": 1e-06,
      "loss": 0.3599,
      "step": 196
    },
    {
      "epoch": 3.5656108597285066,
      "grad_norm": 2.6383941173553467,
      "learning_rate": 1e-06,
      "loss": 0.3335,
      "step": 197
    },
    {
      "epoch": 3.583710407239819,
      "grad_norm": 2.710670232772827,
      "learning_rate": 1e-06,
      "loss": 0.3979,
      "step": 198
    },
    {
      "epoch": 3.6018099547511313,
      "grad_norm": 2.6803207397460938,
      "learning_rate": 1e-06,
      "loss": 0.3695,
      "step": 199
    },
    {
      "epoch": 3.6199095022624435,
      "grad_norm": 3.22158145904541,
      "learning_rate": 1e-06,
      "loss": 0.3832,
      "step": 200
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 10000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 182,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.1675256582811156e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}