{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 626,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01597444089456869,
      "grad_norm": 478.69457004238836,
      "learning_rate": 2e-05,
      "loss": 5.0794,
      "step": 5
    },
    {
      "epoch": 0.03194888178913738,
      "grad_norm": 215.5472393836919,
      "learning_rate": 2e-05,
      "loss": 1.7552,
      "step": 10
    },
    {
      "epoch": 0.04792332268370607,
      "grad_norm": 88.51180202926707,
      "learning_rate": 2e-05,
      "loss": 0.861,
      "step": 15
    },
    {
      "epoch": 0.06389776357827476,
      "grad_norm": 15.867724265639525,
      "learning_rate": 2e-05,
      "loss": 0.7908,
      "step": 20
    },
    {
      "epoch": 0.07987220447284345,
      "grad_norm": 9.962226402862825,
      "learning_rate": 2e-05,
      "loss": 0.5627,
      "step": 25
    },
    {
      "epoch": 0.09584664536741214,
      "grad_norm": 11.323650461972006,
      "learning_rate": 2e-05,
      "loss": 0.4492,
      "step": 30
    },
    {
      "epoch": 0.11182108626198083,
      "grad_norm": 5.618908250561753,
      "learning_rate": 2e-05,
      "loss": 0.3863,
      "step": 35
    },
    {
      "epoch": 0.12779552715654952,
      "grad_norm": 8.639980902230302,
      "learning_rate": 2e-05,
      "loss": 0.3724,
      "step": 40
    },
    {
      "epoch": 0.14376996805111822,
      "grad_norm": 6.065581794373812,
      "learning_rate": 2e-05,
      "loss": 0.3305,
      "step": 45
    },
    {
      "epoch": 0.1597444089456869,
      "grad_norm": 6.487222993623944,
      "learning_rate": 2e-05,
      "loss": 0.3454,
      "step": 50
    },
    {
      "epoch": 0.1757188498402556,
      "grad_norm": 8.157982493659246,
      "learning_rate": 2e-05,
      "loss": 0.31,
      "step": 55
    },
    {
      "epoch": 0.19169329073482427,
      "grad_norm": 4.433439880366275,
      "learning_rate": 2e-05,
      "loss": 0.3233,
      "step": 60
    },
    {
      "epoch": 0.20766773162939298,
      "grad_norm": 8.704032511156715,
      "learning_rate": 2e-05,
      "loss": 0.3254,
      "step": 65
    },
    {
      "epoch": 0.22364217252396165,
      "grad_norm": 3.2354358181768,
      "learning_rate": 2e-05,
      "loss": 0.3028,
      "step": 70
    },
    {
      "epoch": 0.23961661341853036,
      "grad_norm": 3.927058406370219,
      "learning_rate": 2e-05,
      "loss": 0.2545,
      "step": 75
    },
    {
      "epoch": 0.25559105431309903,
      "grad_norm": 4.383347359544785,
      "learning_rate": 2e-05,
      "loss": 0.2766,
      "step": 80
    },
    {
      "epoch": 0.2715654952076677,
      "grad_norm": 4.755401718885403,
      "learning_rate": 2e-05,
      "loss": 0.2756,
      "step": 85
    },
    {
      "epoch": 0.28753993610223644,
      "grad_norm": 7.018973526139115,
      "learning_rate": 2e-05,
      "loss": 0.2579,
      "step": 90
    },
    {
      "epoch": 0.3035143769968051,
      "grad_norm": 6.272026448721462,
      "learning_rate": 2e-05,
      "loss": 0.2971,
      "step": 95
    },
    {
      "epoch": 0.3194888178913738,
      "grad_norm": 4.8079684113378365,
      "learning_rate": 2e-05,
      "loss": 0.3307,
      "step": 100
    },
    {
      "epoch": 0.3354632587859425,
      "grad_norm": 4.028493080280556,
      "learning_rate": 2e-05,
      "loss": 0.2727,
      "step": 105
    },
    {
      "epoch": 0.3514376996805112,
      "grad_norm": 5.388707606364108,
      "learning_rate": 2e-05,
      "loss": 0.2822,
      "step": 110
    },
    {
      "epoch": 0.36741214057507987,
      "grad_norm": 3.730845411810028,
      "learning_rate": 2e-05,
      "loss": 0.2816,
      "step": 115
    },
    {
      "epoch": 0.38338658146964855,
      "grad_norm": 5.819780875953061,
      "learning_rate": 2e-05,
      "loss": 0.2438,
      "step": 120
    },
    {
      "epoch": 0.3993610223642173,
      "grad_norm": 5.818771077307558,
      "learning_rate": 2e-05,
      "loss": 0.2764,
      "step": 125
    },
    {
      "epoch": 0.41533546325878595,
      "grad_norm": 5.674449251632924,
      "learning_rate": 2e-05,
      "loss": 0.2679,
      "step": 130
    },
    {
      "epoch": 0.43130990415335463,
      "grad_norm": 3.5139138000890564,
      "learning_rate": 2e-05,
      "loss": 0.266,
      "step": 135
    },
    {
      "epoch": 0.4472843450479233,
      "grad_norm": 3.6050594093343644,
      "learning_rate": 2e-05,
      "loss": 0.2558,
      "step": 140
    },
    {
      "epoch": 0.46325878594249204,
      "grad_norm": 3.7736226262761248,
      "learning_rate": 2e-05,
      "loss": 0.2747,
      "step": 145
    },
    {
      "epoch": 0.4792332268370607,
      "grad_norm": 3.3294463018044382,
      "learning_rate": 2e-05,
      "loss": 0.2124,
      "step": 150
    },
    {
      "epoch": 0.4952076677316294,
      "grad_norm": 3.978340934287849,
      "learning_rate": 2e-05,
      "loss": 0.2626,
      "step": 155
    },
    {
      "epoch": 0.5111821086261981,
      "grad_norm": 3.7733916384693997,
      "learning_rate": 2e-05,
      "loss": 0.3012,
      "step": 160
    },
    {
      "epoch": 0.5271565495207667,
      "grad_norm": 2.475405136211538,
      "learning_rate": 2e-05,
      "loss": 0.2506,
      "step": 165
    },
    {
      "epoch": 0.5431309904153354,
      "grad_norm": 2.623200763225571,
      "learning_rate": 2e-05,
      "loss": 0.2127,
      "step": 170
    },
    {
      "epoch": 0.5591054313099042,
      "grad_norm": 3.1075207472955797,
      "learning_rate": 2e-05,
      "loss": 0.2441,
      "step": 175
    },
    {
      "epoch": 0.5750798722044729,
      "grad_norm": 2.446477613149001,
      "learning_rate": 2e-05,
      "loss": 0.2124,
      "step": 180
    },
    {
      "epoch": 0.5910543130990416,
      "grad_norm": 4.2022279283216495,
      "learning_rate": 2e-05,
      "loss": 0.24,
      "step": 185
    },
    {
      "epoch": 0.6070287539936102,
      "grad_norm": 3.527771879306774,
      "learning_rate": 2e-05,
      "loss": 0.2458,
      "step": 190
    },
    {
      "epoch": 0.6230031948881789,
      "grad_norm": 3.5313927317162133,
      "learning_rate": 2e-05,
      "loss": 0.2714,
      "step": 195
    },
    {
      "epoch": 0.6389776357827476,
      "grad_norm": 3.6235305866137546,
      "learning_rate": 2e-05,
      "loss": 0.2653,
      "step": 200
    },
    {
      "epoch": 0.6549520766773163,
      "grad_norm": 4.876371447504886,
      "learning_rate": 2e-05,
      "loss": 0.2373,
      "step": 205
    },
    {
      "epoch": 0.670926517571885,
      "grad_norm": 3.5358993905726868,
      "learning_rate": 2e-05,
      "loss": 0.2205,
      "step": 210
    },
    {
      "epoch": 0.6869009584664537,
      "grad_norm": 2.4600844043540127,
      "learning_rate": 2e-05,
      "loss": 0.205,
      "step": 215
    },
    {
      "epoch": 0.7028753993610224,
      "grad_norm": 4.689947740869789,
      "learning_rate": 2e-05,
      "loss": 0.2497,
      "step": 220
    },
    {
      "epoch": 0.7188498402555911,
      "grad_norm": 3.8186352734247073,
      "learning_rate": 2e-05,
      "loss": 0.2624,
      "step": 225
    },
    {
      "epoch": 0.7348242811501597,
      "grad_norm": 4.186654907595584,
      "learning_rate": 2e-05,
      "loss": 0.2046,
      "step": 230
    },
    {
      "epoch": 0.7507987220447284,
      "grad_norm": 4.618434453667313,
      "learning_rate": 2e-05,
      "loss": 0.2297,
      "step": 235
    },
    {
      "epoch": 0.7667731629392971,
      "grad_norm": 1.6540359321412514,
      "learning_rate": 2e-05,
      "loss": 0.1976,
      "step": 240
    },
    {
      "epoch": 0.7827476038338658,
      "grad_norm": 2.966359474906274,
      "learning_rate": 2e-05,
      "loss": 0.2267,
      "step": 245
    },
    {
      "epoch": 0.7987220447284346,
      "grad_norm": 3.178498309301471,
      "learning_rate": 2e-05,
      "loss": 0.2015,
      "step": 250
    },
    {
      "epoch": 0.8146964856230032,
      "grad_norm": 3.0943406181806066,
      "learning_rate": 2e-05,
      "loss": 0.2088,
      "step": 255
    },
    {
      "epoch": 0.8306709265175719,
      "grad_norm": 2.601647495877313,
      "learning_rate": 2e-05,
      "loss": 0.1997,
      "step": 260
    },
    {
      "epoch": 0.8466453674121406,
      "grad_norm": 2.74734218285866,
      "learning_rate": 2e-05,
      "loss": 0.2271,
      "step": 265
    },
    {
      "epoch": 0.8626198083067093,
      "grad_norm": 4.600055126522387,
      "learning_rate": 2e-05,
      "loss": 0.2188,
      "step": 270
    },
    {
      "epoch": 0.8785942492012779,
      "grad_norm": 2.854778230115055,
      "learning_rate": 2e-05,
      "loss": 0.2136,
      "step": 275
    },
    {
      "epoch": 0.8945686900958466,
      "grad_norm": 5.6767551180163185,
      "learning_rate": 2e-05,
      "loss": 0.2362,
      "step": 280
    },
    {
      "epoch": 0.9105431309904153,
      "grad_norm": 2.4685062213282705,
      "learning_rate": 2e-05,
      "loss": 0.2108,
      "step": 285
    },
    {
      "epoch": 0.9265175718849841,
      "grad_norm": 4.1197310782397,
      "learning_rate": 2e-05,
      "loss": 0.2084,
      "step": 290
    },
    {
      "epoch": 0.9424920127795527,
      "grad_norm": 3.4714190539955085,
      "learning_rate": 2e-05,
      "loss": 0.2327,
      "step": 295
    },
    {
      "epoch": 0.9584664536741214,
      "grad_norm": 2.7324693594411613,
      "learning_rate": 2e-05,
      "loss": 0.2264,
      "step": 300
    },
    {
      "epoch": 0.9744408945686901,
      "grad_norm": 3.421741611446172,
      "learning_rate": 2e-05,
      "loss": 0.1995,
      "step": 305
    },
    {
      "epoch": 0.9904153354632588,
      "grad_norm": 2.9392575520935753,
      "learning_rate": 2e-05,
      "loss": 0.2168,
      "step": 310
    },
    {
      "epoch": 1.0063897763578276,
      "grad_norm": 4.705821672718228,
      "learning_rate": 2e-05,
      "loss": 0.1819,
      "step": 315
    },
    {
      "epoch": 1.0223642172523961,
      "grad_norm": 2.802997957452102,
      "learning_rate": 2e-05,
      "loss": 0.1297,
      "step": 320
    },
    {
      "epoch": 1.038338658146965,
      "grad_norm": 2.6089187422398505,
      "learning_rate": 2e-05,
      "loss": 0.1352,
      "step": 325
    },
    {
      "epoch": 1.0543130990415335,
      "grad_norm": 2.594169569483709,
      "learning_rate": 2e-05,
      "loss": 0.1253,
      "step": 330
    },
    {
      "epoch": 1.0702875399361023,
      "grad_norm": 1.8261982922603672,
      "learning_rate": 2e-05,
      "loss": 0.1486,
      "step": 335
    },
    {
      "epoch": 1.0862619808306708,
      "grad_norm": 2.5000412312569096,
      "learning_rate": 2e-05,
      "loss": 0.1426,
      "step": 340
    },
    {
      "epoch": 1.1022364217252396,
      "grad_norm": 2.93075230906697,
      "learning_rate": 2e-05,
      "loss": 0.1643,
      "step": 345
    },
    {
      "epoch": 1.1182108626198084,
      "grad_norm": 2.7078413576335727,
      "learning_rate": 2e-05,
      "loss": 0.1447,
      "step": 350
    },
    {
      "epoch": 1.134185303514377,
      "grad_norm": 2.7697654970321186,
      "learning_rate": 2e-05,
      "loss": 0.1713,
      "step": 355
    },
    {
      "epoch": 1.1501597444089458,
      "grad_norm": 2.66165869087924,
      "learning_rate": 2e-05,
      "loss": 0.1468,
      "step": 360
    },
    {
      "epoch": 1.1661341853035143,
      "grad_norm": 2.1804729084978476,
      "learning_rate": 2e-05,
      "loss": 0.1438,
      "step": 365
    },
    {
      "epoch": 1.182108626198083,
      "grad_norm": 2.698750197905578,
      "learning_rate": 2e-05,
      "loss": 0.148,
      "step": 370
    },
    {
      "epoch": 1.1980830670926517,
      "grad_norm": 1.8617655098022767,
      "learning_rate": 2e-05,
      "loss": 0.148,
      "step": 375
    },
    {
      "epoch": 1.2140575079872205,
      "grad_norm": 3.111985859695433,
      "learning_rate": 2e-05,
      "loss": 0.1687,
      "step": 380
    },
    {
      "epoch": 1.230031948881789,
      "grad_norm": 2.1405551391392903,
      "learning_rate": 2e-05,
      "loss": 0.1509,
      "step": 385
    },
    {
      "epoch": 1.2460063897763578,
      "grad_norm": 1.7101559031228233,
      "learning_rate": 2e-05,
      "loss": 0.1372,
      "step": 390
    },
    {
      "epoch": 1.2619808306709266,
      "grad_norm": 2.508230095740884,
      "learning_rate": 2e-05,
      "loss": 0.1616,
      "step": 395
    },
    {
      "epoch": 1.2779552715654952,
      "grad_norm": 3.409471738890252,
      "learning_rate": 2e-05,
      "loss": 0.144,
      "step": 400
    },
    {
      "epoch": 1.293929712460064,
      "grad_norm": 1.9545498597656406,
      "learning_rate": 2e-05,
      "loss": 0.1617,
      "step": 405
    },
    {
      "epoch": 1.3099041533546325,
      "grad_norm": 2.5817643016324796,
      "learning_rate": 2e-05,
      "loss": 0.1499,
      "step": 410
    },
    {
      "epoch": 1.3258785942492013,
      "grad_norm": 3.8018602676712447,
      "learning_rate": 2e-05,
      "loss": 0.1728,
      "step": 415
    },
    {
      "epoch": 1.34185303514377,
      "grad_norm": 1.6400152311048282,
      "learning_rate": 2e-05,
      "loss": 0.132,
      "step": 420
    },
    {
      "epoch": 1.3578274760383386,
      "grad_norm": 2.692964692189351,
      "learning_rate": 2e-05,
      "loss": 0.1629,
      "step": 425
    },
    {
      "epoch": 1.3738019169329074,
      "grad_norm": 2.02381563418916,
      "learning_rate": 2e-05,
      "loss": 0.1424,
      "step": 430
    },
    {
      "epoch": 1.389776357827476,
      "grad_norm": 3.341545646553509,
      "learning_rate": 2e-05,
      "loss": 0.1458,
      "step": 435
    },
    {
      "epoch": 1.4057507987220448,
      "grad_norm": 2.0056870293845854,
      "learning_rate": 2e-05,
      "loss": 0.1631,
      "step": 440
    },
    {
      "epoch": 1.4217252396166133,
      "grad_norm": 1.8374219523236817,
      "learning_rate": 2e-05,
      "loss": 0.1302,
      "step": 445
    },
    {
      "epoch": 1.4376996805111821,
      "grad_norm": 2.3773282123090973,
      "learning_rate": 2e-05,
      "loss": 0.1558,
      "step": 450
    },
    {
      "epoch": 1.4536741214057507,
      "grad_norm": 8.600724194320007,
      "learning_rate": 2e-05,
      "loss": 0.159,
      "step": 455
    },
    {
      "epoch": 1.4696485623003195,
      "grad_norm": 2.3257620807885595,
      "learning_rate": 2e-05,
      "loss": 0.1478,
      "step": 460
    },
    {
      "epoch": 1.4856230031948883,
      "grad_norm": 5.440755721339502,
      "learning_rate": 2e-05,
      "loss": 0.1733,
      "step": 465
    },
    {
      "epoch": 1.5015974440894568,
      "grad_norm": 2.7907485951432056,
      "learning_rate": 2e-05,
      "loss": 0.1357,
      "step": 470
    },
    {
      "epoch": 1.5175718849840254,
      "grad_norm": 1.864754084019008,
      "learning_rate": 2e-05,
      "loss": 0.137,
      "step": 475
    },
    {
      "epoch": 1.5335463258785942,
      "grad_norm": 3.6446121307342327,
      "learning_rate": 2e-05,
      "loss": 0.1546,
      "step": 480
    },
    {
      "epoch": 1.549520766773163,
      "grad_norm": 2.796412839091711,
      "learning_rate": 2e-05,
      "loss": 0.1428,
      "step": 485
    },
    {
      "epoch": 1.5654952076677318,
      "grad_norm": 3.946744390862881,
      "learning_rate": 2e-05,
      "loss": 0.1594,
      "step": 490
    },
    {
      "epoch": 1.5814696485623003,
      "grad_norm": 1.7503778746807261,
      "learning_rate": 2e-05,
      "loss": 0.1282,
      "step": 495
    },
    {
      "epoch": 1.5974440894568689,
      "grad_norm": 2.5325664670929595,
      "learning_rate": 2e-05,
      "loss": 0.1469,
      "step": 500
    },
    {
      "epoch": 1.6134185303514377,
      "grad_norm": 3.8247416342616294,
      "learning_rate": 2e-05,
      "loss": 0.141,
      "step": 505
    },
    {
      "epoch": 1.6293929712460065,
      "grad_norm": 2.2067769830527206,
      "learning_rate": 2e-05,
      "loss": 0.143,
      "step": 510
    },
    {
      "epoch": 1.645367412140575,
      "grad_norm": 1.7741536011263175,
      "learning_rate": 2e-05,
      "loss": 0.1502,
      "step": 515
    },
    {
      "epoch": 1.6613418530351438,
      "grad_norm": 2.2305374723892037,
      "learning_rate": 2e-05,
      "loss": 0.1639,
      "step": 520
    },
    {
      "epoch": 1.6773162939297124,
      "grad_norm": 1.9024084336419627,
      "learning_rate": 2e-05,
      "loss": 0.1404,
      "step": 525
    },
    {
      "epoch": 1.6932907348242812,
      "grad_norm": 2.063654895177496,
      "learning_rate": 2e-05,
      "loss": 0.1658,
      "step": 530
    },
    {
      "epoch": 1.70926517571885,
      "grad_norm": 1.7772854814104246,
      "learning_rate": 2e-05,
      "loss": 0.1494,
      "step": 535
    },
    {
      "epoch": 1.7252396166134185,
      "grad_norm": 1.6946546609952169,
      "learning_rate": 2e-05,
      "loss": 0.1482,
      "step": 540
    },
    {
      "epoch": 1.741214057507987,
      "grad_norm": 2.3914840402840887,
      "learning_rate": 2e-05,
      "loss": 0.1365,
      "step": 545
    },
    {
      "epoch": 1.7571884984025559,
      "grad_norm": 2.606233528266447,
      "learning_rate": 2e-05,
      "loss": 0.1614,
      "step": 550
    },
    {
      "epoch": 1.7731629392971247,
      "grad_norm": 3.3769203161642944,
      "learning_rate": 2e-05,
      "loss": 0.1771,
      "step": 555
    },
    {
      "epoch": 1.7891373801916934,
      "grad_norm": 2.5560773230753715,
      "learning_rate": 2e-05,
      "loss": 0.1689,
      "step": 560
    },
    {
      "epoch": 1.805111821086262,
      "grad_norm": 1.9776372837776819,
      "learning_rate": 2e-05,
      "loss": 0.174,
      "step": 565
    },
    {
      "epoch": 1.8210862619808306,
      "grad_norm": 1.8570454488081323,
      "learning_rate": 2e-05,
      "loss": 0.1532,
      "step": 570
    },
    {
      "epoch": 1.8370607028753994,
      "grad_norm": 1.7205286991600581,
      "learning_rate": 2e-05,
      "loss": 0.1255,
      "step": 575
    },
    {
      "epoch": 1.8530351437699681,
      "grad_norm": 2.569662545324991,
      "learning_rate": 2e-05,
      "loss": 0.1272,
      "step": 580
    },
    {
      "epoch": 1.8690095846645367,
      "grad_norm": 4.940727873457326,
      "learning_rate": 2e-05,
      "loss": 0.1399,
      "step": 585
    },
    {
      "epoch": 1.8849840255591053,
      "grad_norm": 2.070744572599559,
      "learning_rate": 2e-05,
      "loss": 0.1711,
      "step": 590
    },
    {
      "epoch": 1.900958466453674,
      "grad_norm": 2.4277203303287727,
      "learning_rate": 2e-05,
      "loss": 0.1265,
      "step": 595
    },
    {
      "epoch": 1.9169329073482428,
      "grad_norm": 1.9498525535666777,
      "learning_rate": 2e-05,
      "loss": 0.1275,
      "step": 600
    },
    {
      "epoch": 1.9329073482428116,
      "grad_norm": 2.4996095434445063,
      "learning_rate": 2e-05,
      "loss": 0.1621,
      "step": 605
    },
    {
      "epoch": 1.9488817891373802,
      "grad_norm": 3.2603497712261538,
      "learning_rate": 2e-05,
      "loss": 0.1349,
      "step": 610
    },
    {
      "epoch": 1.9648562300319488,
      "grad_norm": 2.2118523751466324,
      "learning_rate": 2e-05,
      "loss": 0.1513,
      "step": 615
    },
    {
      "epoch": 1.9808306709265175,
      "grad_norm": 2.6925399127392624,
      "learning_rate": 2e-05,
      "loss": 0.1399,
      "step": 620
    },
    {
      "epoch": 1.9968051118210863,
      "grad_norm": 1.9429261986157622,
      "learning_rate": 2e-05,
      "loss": 0.1437,
      "step": 625
    }
  ],
  "logging_steps": 5,
  "max_steps": 626,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 313,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 8191979028480.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}