|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 29.868722148620982, |
|
"eval_steps": 500, |
|
"global_step": 124000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.24087679152113695, |
|
"grad_norm": 0.5082331299781799, |
|
"learning_rate": 0.000991969806472336, |
|
"loss": 1.4852, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.4817535830422739, |
|
"grad_norm": 0.4949457347393036, |
|
"learning_rate": 0.0009839396129446719, |
|
"loss": 1.369, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.7226303745634108, |
|
"grad_norm": 0.5630651116371155, |
|
"learning_rate": 0.0009759094194170079, |
|
"loss": 1.3189, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.9635071660845478, |
|
"grad_norm": 0.5108479261398315, |
|
"learning_rate": 0.000967879225889344, |
|
"loss": 1.2817, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.2043839576056847, |
|
"grad_norm": 0.47163671255111694, |
|
"learning_rate": 0.00095984903236168, |
|
"loss": 1.2511, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.4452607491268217, |
|
"grad_norm": 0.531821072101593, |
|
"learning_rate": 0.0009518188388340159, |
|
"loss": 1.2309, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.6861375406479586, |
|
"grad_norm": 0.4891831874847412, |
|
"learning_rate": 0.0009437886453063518, |
|
"loss": 1.2088, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.9270143321690956, |
|
"grad_norm": 0.4778994619846344, |
|
"learning_rate": 0.0009357584517786879, |
|
"loss": 1.1936, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.1678911236902323, |
|
"grad_norm": 0.5436965823173523, |
|
"learning_rate": 0.0009277282582510239, |
|
"loss": 1.1792, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 2.4087679152113695, |
|
"grad_norm": 0.45789214968681335, |
|
"learning_rate": 0.0009196980647233599, |
|
"loss": 1.1609, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 2.649644706732506, |
|
"grad_norm": 0.49292871356010437, |
|
"learning_rate": 0.0009116678711956958, |
|
"loss": 1.1465, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 2.8905214982536434, |
|
"grad_norm": 0.4877796769142151, |
|
"learning_rate": 0.0009036376776680318, |
|
"loss": 1.1339, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 3.13139828977478, |
|
"grad_norm": 0.45624956488609314, |
|
"learning_rate": 0.0008956074841403679, |
|
"loss": 1.1235, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 3.3722750812959172, |
|
"grad_norm": 0.45705732703208923, |
|
"learning_rate": 0.0008875772906127038, |
|
"loss": 1.114, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 3.613151872817054, |
|
"grad_norm": 0.4534723460674286, |
|
"learning_rate": 0.0008795470970850398, |
|
"loss": 1.1022, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 3.854028664338191, |
|
"grad_norm": 0.5117238163948059, |
|
"learning_rate": 0.0008715169035573758, |
|
"loss": 1.0936, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 4.094905455859328, |
|
"grad_norm": 0.4832773804664612, |
|
"learning_rate": 0.0008634867100297117, |
|
"loss": 1.0895, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 4.335782247380465, |
|
"grad_norm": 0.5012445449829102, |
|
"learning_rate": 0.0008554565165020477, |
|
"loss": 1.0793, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 4.576659038901602, |
|
"grad_norm": 0.5131984353065491, |
|
"learning_rate": 0.0008474263229743838, |
|
"loss": 1.0737, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 4.817535830422739, |
|
"grad_norm": 0.5085521340370178, |
|
"learning_rate": 0.0008393961294467197, |
|
"loss": 1.068, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 5.058412621943876, |
|
"grad_norm": 0.4861578345298767, |
|
"learning_rate": 0.0008313659359190556, |
|
"loss": 1.0596, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 5.299289413465012, |
|
"grad_norm": 0.4493337869644165, |
|
"learning_rate": 0.0008233357423913917, |
|
"loss": 1.0549, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 5.54016620498615, |
|
"grad_norm": 0.5158481001853943, |
|
"learning_rate": 0.0008153055488637277, |
|
"loss": 1.0501, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 5.781042996507287, |
|
"grad_norm": 0.5318390130996704, |
|
"learning_rate": 0.0008072753553360636, |
|
"loss": 1.0423, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 6.021919788028423, |
|
"grad_norm": 0.4411888122558594, |
|
"learning_rate": 0.0007992451618083995, |
|
"loss": 1.0329, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 6.26279657954956, |
|
"grad_norm": 0.5529501438140869, |
|
"learning_rate": 0.0007912149682807356, |
|
"loss": 1.0304, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 6.503673371070697, |
|
"grad_norm": 0.4464714229106903, |
|
"learning_rate": 0.0007831847747530716, |
|
"loss": 1.0246, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 6.7445501625918345, |
|
"grad_norm": 0.5307004451751709, |
|
"learning_rate": 0.0007751545812254074, |
|
"loss": 1.0229, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 6.985426954112971, |
|
"grad_norm": 0.5123757123947144, |
|
"learning_rate": 0.0007671243876977435, |
|
"loss": 1.0172, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 7.226303745634108, |
|
"grad_norm": 0.47260963916778564, |
|
"learning_rate": 0.0007590941941700795, |
|
"loss": 1.0122, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 7.4671805371552455, |
|
"grad_norm": 0.5172567963600159, |
|
"learning_rate": 0.0007510640006424156, |
|
"loss": 1.0041, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 7.708057328676382, |
|
"grad_norm": 0.5100764036178589, |
|
"learning_rate": 0.0007430338071147515, |
|
"loss": 0.9996, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 7.948934120197519, |
|
"grad_norm": 0.5113524198532104, |
|
"learning_rate": 0.0007350036135870874, |
|
"loss": 0.9962, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 8.189810911718656, |
|
"grad_norm": 0.5284003615379333, |
|
"learning_rate": 0.0007269734200594235, |
|
"loss": 0.9902, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 8.430687703239792, |
|
"grad_norm": 0.45138561725616455, |
|
"learning_rate": 0.0007189432265317595, |
|
"loss": 0.9853, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 8.67156449476093, |
|
"grad_norm": 0.5026872158050537, |
|
"learning_rate": 0.0007109130330040954, |
|
"loss": 0.9817, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 8.912441286282068, |
|
"grad_norm": 0.553321123123169, |
|
"learning_rate": 0.0007028828394764313, |
|
"loss": 0.9842, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 9.153318077803204, |
|
"grad_norm": 0.4765004515647888, |
|
"learning_rate": 0.0006948526459487674, |
|
"loss": 0.9748, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 9.394194869324341, |
|
"grad_norm": 0.4472289979457855, |
|
"learning_rate": 0.0006868224524211034, |
|
"loss": 0.9734, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 9.635071660845478, |
|
"grad_norm": 0.4811370372772217, |
|
"learning_rate": 0.0006787922588934393, |
|
"loss": 0.9671, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 9.875948452366615, |
|
"grad_norm": 0.5080583691596985, |
|
"learning_rate": 0.0006707620653657753, |
|
"loss": 0.9687, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 10.116825243887751, |
|
"grad_norm": 0.49223220348358154, |
|
"learning_rate": 0.0006627318718381113, |
|
"loss": 0.9592, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 10.357702035408888, |
|
"grad_norm": 0.5603600740432739, |
|
"learning_rate": 0.0006547016783104472, |
|
"loss": 0.9559, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 10.598578826930025, |
|
"grad_norm": 0.503847599029541, |
|
"learning_rate": 0.0006466714847827833, |
|
"loss": 0.9533, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 10.839455618451161, |
|
"grad_norm": 0.4978269934654236, |
|
"learning_rate": 0.0006386412912551193, |
|
"loss": 0.9486, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 11.0803324099723, |
|
"grad_norm": 0.5506151914596558, |
|
"learning_rate": 0.0006306110977274552, |
|
"loss": 0.951, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 11.321209201493437, |
|
"grad_norm": 0.5171232223510742, |
|
"learning_rate": 0.0006225809041997912, |
|
"loss": 0.9399, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 11.562085993014573, |
|
"grad_norm": 0.656745970249176, |
|
"learning_rate": 0.0006145507106721272, |
|
"loss": 0.937, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 11.80296278453571, |
|
"grad_norm": 0.5088077783584595, |
|
"learning_rate": 0.0006065205171444633, |
|
"loss": 0.9376, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 12.043839576056847, |
|
"grad_norm": 0.4850046932697296, |
|
"learning_rate": 0.0005984903236167992, |
|
"loss": 0.9302, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 12.284716367577984, |
|
"grad_norm": 0.5553488731384277, |
|
"learning_rate": 0.0005904601300891351, |
|
"loss": 0.9302, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 12.52559315909912, |
|
"grad_norm": 0.47318387031555176, |
|
"learning_rate": 0.0005824299365614711, |
|
"loss": 0.9298, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 12.766469950620257, |
|
"grad_norm": 0.6019132733345032, |
|
"learning_rate": 0.0005743997430338072, |
|
"loss": 0.9241, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 13.007346742141396, |
|
"grad_norm": 0.49550944566726685, |
|
"learning_rate": 0.000566369549506143, |
|
"loss": 0.9236, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 13.248223533662532, |
|
"grad_norm": 0.5007643103599548, |
|
"learning_rate": 0.000558339355978479, |
|
"loss": 0.9196, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 13.489100325183669, |
|
"grad_norm": 0.5525193810462952, |
|
"learning_rate": 0.0005503091624508151, |
|
"loss": 0.9113, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 13.729977116704806, |
|
"grad_norm": 0.5109050273895264, |
|
"learning_rate": 0.0005422789689231511, |
|
"loss": 0.9077, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 13.970853908225942, |
|
"grad_norm": 0.5197868347167969, |
|
"learning_rate": 0.000534248775395487, |
|
"loss": 0.9076, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 14.21173069974708, |
|
"grad_norm": 0.4690844416618347, |
|
"learning_rate": 0.000526218581867823, |
|
"loss": 0.9097, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 14.452607491268216, |
|
"grad_norm": 0.5010888576507568, |
|
"learning_rate": 0.000518188388340159, |
|
"loss": 0.9022, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 14.693484282789353, |
|
"grad_norm": 0.5394883751869202, |
|
"learning_rate": 0.000510158194812495, |
|
"loss": 0.8982, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 14.934361074310491, |
|
"grad_norm": 0.5398752689361572, |
|
"learning_rate": 0.000502128001284831, |
|
"loss": 0.8982, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 15.175237865831628, |
|
"grad_norm": 0.48452773690223694, |
|
"learning_rate": 0.0004940978077571669, |
|
"loss": 0.8937, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 15.416114657352765, |
|
"grad_norm": 0.5147862434387207, |
|
"learning_rate": 0.00048606761422950295, |
|
"loss": 0.894, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 15.656991448873901, |
|
"grad_norm": 0.5301661491394043, |
|
"learning_rate": 0.00047803742070183893, |
|
"loss": 0.8865, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 15.897868240395038, |
|
"grad_norm": 0.49967125058174133, |
|
"learning_rate": 0.0004700072271741749, |
|
"loss": 0.8825, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 16.138745031916176, |
|
"grad_norm": 0.4977249801158905, |
|
"learning_rate": 0.0004619770336465109, |
|
"loss": 0.884, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 16.37962182343731, |
|
"grad_norm": 0.5272189378738403, |
|
"learning_rate": 0.0004539468401188468, |
|
"loss": 0.8799, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 16.62049861495845, |
|
"grad_norm": 0.5125630497932434, |
|
"learning_rate": 0.00044591664659118286, |
|
"loss": 0.8809, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 16.861375406479585, |
|
"grad_norm": 0.4780360460281372, |
|
"learning_rate": 0.00043788645306351885, |
|
"loss": 0.8744, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 17.102252198000723, |
|
"grad_norm": 0.541357159614563, |
|
"learning_rate": 0.00042985625953585483, |
|
"loss": 0.8729, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 17.34312898952186, |
|
"grad_norm": 0.5186867713928223, |
|
"learning_rate": 0.0004218260660081908, |
|
"loss": 0.8673, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 17.584005781042997, |
|
"grad_norm": 0.5490289330482483, |
|
"learning_rate": 0.0004137958724805268, |
|
"loss": 0.8644, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 17.824882572564132, |
|
"grad_norm": 0.5589401125907898, |
|
"learning_rate": 0.0004057656789528628, |
|
"loss": 0.8676, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 18.06575936408527, |
|
"grad_norm": 0.5271314382553101, |
|
"learning_rate": 0.00039773548542519876, |
|
"loss": 0.861, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 18.30663615560641, |
|
"grad_norm": 0.586135983467102, |
|
"learning_rate": 0.00038970529189753474, |
|
"loss": 0.8556, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 18.547512947127544, |
|
"grad_norm": 0.5827994346618652, |
|
"learning_rate": 0.00038167509836987073, |
|
"loss": 0.8565, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 18.788389738648682, |
|
"grad_norm": 0.5451443195343018, |
|
"learning_rate": 0.0003736449048422067, |
|
"loss": 0.8554, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 19.029266530169817, |
|
"grad_norm": 0.6020991206169128, |
|
"learning_rate": 0.0003656147113145427, |
|
"loss": 0.8506, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 19.270143321690956, |
|
"grad_norm": 0.48624420166015625, |
|
"learning_rate": 0.0003575845177868787, |
|
"loss": 0.8449, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 19.51102011321209, |
|
"grad_norm": 0.6073954105377197, |
|
"learning_rate": 0.0003495543242592146, |
|
"loss": 0.8436, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 19.75189690473323, |
|
"grad_norm": 0.5890400409698486, |
|
"learning_rate": 0.00034152413073155064, |
|
"loss": 0.8445, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 19.992773696254368, |
|
"grad_norm": 0.5391818881034851, |
|
"learning_rate": 0.0003334939372038866, |
|
"loss": 0.8405, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 20.233650487775503, |
|
"grad_norm": 0.5576732158660889, |
|
"learning_rate": 0.0003254637436762226, |
|
"loss": 0.8315, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 20.47452727929664, |
|
"grad_norm": 0.5405558347702026, |
|
"learning_rate": 0.0003174335501485586, |
|
"loss": 0.8348, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 20.715404070817776, |
|
"grad_norm": 0.5441027879714966, |
|
"learning_rate": 0.0003094033566208946, |
|
"loss": 0.8326, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 20.956280862338915, |
|
"grad_norm": 0.5216940641403198, |
|
"learning_rate": 0.00030137316309323056, |
|
"loss": 0.8251, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 21.19715765386005, |
|
"grad_norm": 0.6281733512878418, |
|
"learning_rate": 0.00029334296956556654, |
|
"loss": 0.8219, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 21.438034445381188, |
|
"grad_norm": 0.6415626406669617, |
|
"learning_rate": 0.0002853127760379025, |
|
"loss": 0.8225, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 21.678911236902323, |
|
"grad_norm": 0.5944454073905945, |
|
"learning_rate": 0.0002772825825102385, |
|
"loss": 0.8223, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 21.91978802842346, |
|
"grad_norm": 0.5434209704399109, |
|
"learning_rate": 0.0002692523889825745, |
|
"loss": 0.8161, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 22.1606648199446, |
|
"grad_norm": 0.5393619537353516, |
|
"learning_rate": 0.0002612221954549105, |
|
"loss": 0.8126, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 22.401541611465735, |
|
"grad_norm": 0.6141464710235596, |
|
"learning_rate": 0.00025319200192724646, |
|
"loss": 0.8119, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 22.642418402986873, |
|
"grad_norm": 0.6085337400436401, |
|
"learning_rate": 0.00024516180839958244, |
|
"loss": 0.8082, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 22.88329519450801, |
|
"grad_norm": 0.6050975322723389, |
|
"learning_rate": 0.00023713161487191843, |
|
"loss": 0.8065, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 23.124171986029147, |
|
"grad_norm": 0.5710690021514893, |
|
"learning_rate": 0.0002291014213442544, |
|
"loss": 0.7971, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 23.36504877755028, |
|
"grad_norm": 0.5681021213531494, |
|
"learning_rate": 0.0002210712278165904, |
|
"loss": 0.7996, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 23.60592556907142, |
|
"grad_norm": 0.5795422792434692, |
|
"learning_rate": 0.00021304103428892638, |
|
"loss": 0.8006, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 23.846802360592555, |
|
"grad_norm": 0.6810296773910522, |
|
"learning_rate": 0.00020501084076126236, |
|
"loss": 0.7931, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 24.087679152113694, |
|
"grad_norm": 0.5023326277732849, |
|
"learning_rate": 0.00019698064723359831, |
|
"loss": 0.7895, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 24.328555943634832, |
|
"grad_norm": 0.6325027346611023, |
|
"learning_rate": 0.0001889504537059343, |
|
"loss": 0.7902, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 24.569432735155967, |
|
"grad_norm": 0.5804798007011414, |
|
"learning_rate": 0.00018092026017827028, |
|
"loss": 0.7912, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 24.810309526677106, |
|
"grad_norm": 0.5688096284866333, |
|
"learning_rate": 0.00017289006665060626, |
|
"loss": 0.7886, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 25.05118631819824, |
|
"grad_norm": 0.5234955549240112, |
|
"learning_rate": 0.00016485987312294225, |
|
"loss": 0.7828, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 25.29206310971938, |
|
"grad_norm": 0.5501936078071594, |
|
"learning_rate": 0.00015682967959527826, |
|
"loss": 0.7769, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 25.532939901240514, |
|
"grad_norm": 0.6095595359802246, |
|
"learning_rate": 0.00014879948606761424, |
|
"loss": 0.7788, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 25.773816692761653, |
|
"grad_norm": 0.6600815057754517, |
|
"learning_rate": 0.00014076929253995022, |
|
"loss": 0.778, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 26.01469348428279, |
|
"grad_norm": 0.6249046921730042, |
|
"learning_rate": 0.0001327390990122862, |
|
"loss": 0.7701, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 26.255570275803926, |
|
"grad_norm": 0.7678042054176331, |
|
"learning_rate": 0.0001247089054846222, |
|
"loss": 0.7661, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 26.496447067325064, |
|
"grad_norm": 0.5331607460975647, |
|
"learning_rate": 0.00011667871195695816, |
|
"loss": 0.7689, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 26.7373238588462, |
|
"grad_norm": 0.6330080628395081, |
|
"learning_rate": 0.00010864851842929414, |
|
"loss": 0.7655, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 26.978200650367338, |
|
"grad_norm": 0.6966120004653931, |
|
"learning_rate": 0.00010061832490163013, |
|
"loss": 0.7593, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 27.219077441888473, |
|
"grad_norm": 0.6162911653518677, |
|
"learning_rate": 9.258813137396611e-05, |
|
"loss": 0.7619, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 27.45995423340961, |
|
"grad_norm": 0.6430843472480774, |
|
"learning_rate": 8.45579378463021e-05, |
|
"loss": 0.7598, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 27.700831024930746, |
|
"grad_norm": 0.6248263120651245, |
|
"learning_rate": 7.652774431863809e-05, |
|
"loss": 0.7584, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 27.941707816451885, |
|
"grad_norm": 0.6525952219963074, |
|
"learning_rate": 6.849755079097407e-05, |
|
"loss": 0.7536, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 28.182584607973023, |
|
"grad_norm": 0.64836186170578, |
|
"learning_rate": 6.046735726331005e-05, |
|
"loss": 0.7535, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 28.42346139949416, |
|
"grad_norm": 0.5937727689743042, |
|
"learning_rate": 5.243716373564603e-05, |
|
"loss": 0.7453, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 28.664338191015297, |
|
"grad_norm": 0.5951708555221558, |
|
"learning_rate": 4.440697020798201e-05, |
|
"loss": 0.7462, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 28.90521498253643, |
|
"grad_norm": 0.5529988408088684, |
|
"learning_rate": 3.6376776680318e-05, |
|
"loss": 0.7478, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 29.14609177405757, |
|
"grad_norm": 0.5422804355621338, |
|
"learning_rate": 2.834658315265398e-05, |
|
"loss": 0.7454, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 29.386968565578705, |
|
"grad_norm": 0.5853854417800903, |
|
"learning_rate": 2.0316389624989963e-05, |
|
"loss": 0.7424, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 29.627845357099844, |
|
"grad_norm": 0.6154918074607849, |
|
"learning_rate": 1.2286196097325946e-05, |
|
"loss": 0.7431, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 29.868722148620982, |
|
"grad_norm": 0.596747875213623, |
|
"learning_rate": 4.256002569661929e-06, |
|
"loss": 0.7426, |
|
"step": 124000 |
|
} |
|
], |
|
"logging_steps": 1000, |
|
"max_steps": 124530, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 30, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.2404512265548595e+18, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|