{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 313,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01597444089456869,
      "grad_norm": 478.69457004238836,
      "learning_rate": 2e-05,
      "loss": 5.0794,
      "step": 5
    },
    {
      "epoch": 0.03194888178913738,
      "grad_norm": 215.5472393836919,
      "learning_rate": 2e-05,
      "loss": 1.7552,
      "step": 10
    },
    {
      "epoch": 0.04792332268370607,
      "grad_norm": 88.51180202926707,
      "learning_rate": 2e-05,
      "loss": 0.861,
      "step": 15
    },
    {
      "epoch": 0.06389776357827476,
      "grad_norm": 15.867724265639525,
      "learning_rate": 2e-05,
      "loss": 0.7908,
      "step": 20
    },
    {
      "epoch": 0.07987220447284345,
      "grad_norm": 9.962226402862825,
      "learning_rate": 2e-05,
      "loss": 0.5627,
      "step": 25
    },
    {
      "epoch": 0.09584664536741214,
      "grad_norm": 11.323650461972006,
      "learning_rate": 2e-05,
      "loss": 0.4492,
      "step": 30
    },
    {
      "epoch": 0.11182108626198083,
      "grad_norm": 5.618908250561753,
      "learning_rate": 2e-05,
      "loss": 0.3863,
      "step": 35
    },
    {
      "epoch": 0.12779552715654952,
      "grad_norm": 8.639980902230302,
      "learning_rate": 2e-05,
      "loss": 0.3724,
      "step": 40
    },
    {
      "epoch": 0.14376996805111822,
      "grad_norm": 6.065581794373812,
      "learning_rate": 2e-05,
      "loss": 0.3305,
      "step": 45
    },
    {
      "epoch": 0.1597444089456869,
      "grad_norm": 6.487222993623944,
      "learning_rate": 2e-05,
      "loss": 0.3454,
      "step": 50
    },
    {
      "epoch": 0.1757188498402556,
      "grad_norm": 8.157982493659246,
      "learning_rate": 2e-05,
      "loss": 0.31,
      "step": 55
    },
    {
      "epoch": 0.19169329073482427,
      "grad_norm": 4.433439880366275,
      "learning_rate": 2e-05,
      "loss": 0.3233,
      "step": 60
    },
    {
      "epoch": 0.20766773162939298,
      "grad_norm": 8.704032511156715,
      "learning_rate": 2e-05,
      "loss": 0.3254,
      "step": 65
    },
    {
      "epoch": 0.22364217252396165,
      "grad_norm": 3.2354358181768,
      "learning_rate": 2e-05,
      "loss": 0.3028,
      "step": 70
    },
    {
      "epoch": 0.23961661341853036,
      "grad_norm": 3.927058406370219,
      "learning_rate": 2e-05,
      "loss": 0.2545,
      "step": 75
    },
    {
      "epoch": 0.25559105431309903,
      "grad_norm": 4.383347359544785,
      "learning_rate": 2e-05,
      "loss": 0.2766,
      "step": 80
    },
    {
      "epoch": 0.2715654952076677,
      "grad_norm": 4.755401718885403,
      "learning_rate": 2e-05,
      "loss": 0.2756,
      "step": 85
    },
    {
      "epoch": 0.28753993610223644,
      "grad_norm": 7.018973526139115,
      "learning_rate": 2e-05,
      "loss": 0.2579,
      "step": 90
    },
    {
      "epoch": 0.3035143769968051,
      "grad_norm": 6.272026448721462,
      "learning_rate": 2e-05,
      "loss": 0.2971,
      "step": 95
    },
    {
      "epoch": 0.3194888178913738,
      "grad_norm": 4.8079684113378365,
      "learning_rate": 2e-05,
      "loss": 0.3307,
      "step": 100
    },
    {
      "epoch": 0.3354632587859425,
      "grad_norm": 4.028493080280556,
      "learning_rate": 2e-05,
      "loss": 0.2727,
      "step": 105
    },
    {
      "epoch": 0.3514376996805112,
      "grad_norm": 5.388707606364108,
      "learning_rate": 2e-05,
      "loss": 0.2822,
      "step": 110
    },
    {
      "epoch": 0.36741214057507987,
      "grad_norm": 3.730845411810028,
      "learning_rate": 2e-05,
      "loss": 0.2816,
      "step": 115
    },
    {
      "epoch": 0.38338658146964855,
      "grad_norm": 5.819780875953061,
      "learning_rate": 2e-05,
      "loss": 0.2438,
      "step": 120
    },
    {
      "epoch": 0.3993610223642173,
      "grad_norm": 5.818771077307558,
      "learning_rate": 2e-05,
      "loss": 0.2764,
      "step": 125
    },
    {
      "epoch": 0.41533546325878595,
      "grad_norm": 5.674449251632924,
      "learning_rate": 2e-05,
      "loss": 0.2679,
      "step": 130
    },
    {
      "epoch": 0.43130990415335463,
      "grad_norm": 3.5139138000890564,
      "learning_rate": 2e-05,
      "loss": 0.266,
      "step": 135
    },
    {
      "epoch": 0.4472843450479233,
      "grad_norm": 3.6050594093343644,
      "learning_rate": 2e-05,
      "loss": 0.2558,
      "step": 140
    },
    {
      "epoch": 0.46325878594249204,
      "grad_norm": 3.7736226262761248,
      "learning_rate": 2e-05,
      "loss": 0.2747,
      "step": 145
    },
    {
      "epoch": 0.4792332268370607,
      "grad_norm": 3.3294463018044382,
      "learning_rate": 2e-05,
      "loss": 0.2124,
      "step": 150
    },
    {
      "epoch": 0.4952076677316294,
      "grad_norm": 3.978340934287849,
      "learning_rate": 2e-05,
      "loss": 0.2626,
      "step": 155
    },
    {
      "epoch": 0.5111821086261981,
      "grad_norm": 3.7733916384693997,
      "learning_rate": 2e-05,
      "loss": 0.3012,
      "step": 160
    },
    {
      "epoch": 0.5271565495207667,
      "grad_norm": 2.475405136211538,
      "learning_rate": 2e-05,
      "loss": 0.2506,
      "step": 165
    },
    {
      "epoch": 0.5431309904153354,
      "grad_norm": 2.623200763225571,
      "learning_rate": 2e-05,
      "loss": 0.2127,
      "step": 170
    },
    {
      "epoch": 0.5591054313099042,
      "grad_norm": 3.1075207472955797,
      "learning_rate": 2e-05,
      "loss": 0.2441,
      "step": 175
    },
    {
      "epoch": 0.5750798722044729,
      "grad_norm": 2.446477613149001,
      "learning_rate": 2e-05,
      "loss": 0.2124,
      "step": 180
    },
    {
      "epoch": 0.5910543130990416,
      "grad_norm": 4.2022279283216495,
      "learning_rate": 2e-05,
      "loss": 0.24,
      "step": 185
    },
    {
      "epoch": 0.6070287539936102,
      "grad_norm": 3.527771879306774,
      "learning_rate": 2e-05,
      "loss": 0.2458,
      "step": 190
    },
    {
      "epoch": 0.6230031948881789,
      "grad_norm": 3.5313927317162133,
      "learning_rate": 2e-05,
      "loss": 0.2714,
      "step": 195
    },
    {
      "epoch": 0.6389776357827476,
      "grad_norm": 3.6235305866137546,
      "learning_rate": 2e-05,
      "loss": 0.2653,
      "step": 200
    },
    {
      "epoch": 0.6549520766773163,
      "grad_norm": 4.876371447504886,
      "learning_rate": 2e-05,
      "loss": 0.2373,
      "step": 205
    },
    {
      "epoch": 0.670926517571885,
      "grad_norm": 3.5358993905726868,
      "learning_rate": 2e-05,
      "loss": 0.2205,
      "step": 210
    },
    {
      "epoch": 0.6869009584664537,
      "grad_norm": 2.4600844043540127,
      "learning_rate": 2e-05,
      "loss": 0.205,
      "step": 215
    },
    {
      "epoch": 0.7028753993610224,
      "grad_norm": 4.689947740869789,
      "learning_rate": 2e-05,
      "loss": 0.2497,
      "step": 220
    },
    {
      "epoch": 0.7188498402555911,
      "grad_norm": 3.8186352734247073,
      "learning_rate": 2e-05,
      "loss": 0.2624,
      "step": 225
    },
    {
      "epoch": 0.7348242811501597,
      "grad_norm": 4.186654907595584,
      "learning_rate": 2e-05,
      "loss": 0.2046,
      "step": 230
    },
    {
      "epoch": 0.7507987220447284,
      "grad_norm": 4.618434453667313,
      "learning_rate": 2e-05,
      "loss": 0.2297,
      "step": 235
    },
    {
      "epoch": 0.7667731629392971,
      "grad_norm": 1.6540359321412514,
      "learning_rate": 2e-05,
      "loss": 0.1976,
      "step": 240
    },
    {
      "epoch": 0.7827476038338658,
      "grad_norm": 2.966359474906274,
      "learning_rate": 2e-05,
      "loss": 0.2267,
      "step": 245
    },
    {
      "epoch": 0.7987220447284346,
      "grad_norm": 3.178498309301471,
      "learning_rate": 2e-05,
      "loss": 0.2015,
      "step": 250
    },
    {
      "epoch": 0.8146964856230032,
      "grad_norm": 3.0943406181806066,
      "learning_rate": 2e-05,
      "loss": 0.2088,
      "step": 255
    },
    {
      "epoch": 0.8306709265175719,
      "grad_norm": 2.601647495877313,
      "learning_rate": 2e-05,
      "loss": 0.1997,
      "step": 260
    },
    {
      "epoch": 0.8466453674121406,
      "grad_norm": 2.74734218285866,
      "learning_rate": 2e-05,
      "loss": 0.2271,
      "step": 265
    },
    {
      "epoch": 0.8626198083067093,
      "grad_norm": 4.600055126522387,
      "learning_rate": 2e-05,
      "loss": 0.2188,
      "step": 270
    },
    {
      "epoch": 0.8785942492012779,
      "grad_norm": 2.854778230115055,
      "learning_rate": 2e-05,
      "loss": 0.2136,
      "step": 275
    },
    {
      "epoch": 0.8945686900958466,
      "grad_norm": 5.6767551180163185,
      "learning_rate": 2e-05,
      "loss": 0.2362,
      "step": 280
    },
    {
      "epoch": 0.9105431309904153,
      "grad_norm": 2.4685062213282705,
      "learning_rate": 2e-05,
      "loss": 0.2108,
      "step": 285
    },
    {
      "epoch": 0.9265175718849841,
      "grad_norm": 4.1197310782397,
      "learning_rate": 2e-05,
      "loss": 0.2084,
      "step": 290
    },
    {
      "epoch": 0.9424920127795527,
      "grad_norm": 3.4714190539955085,
      "learning_rate": 2e-05,
      "loss": 0.2327,
      "step": 295
    },
    {
      "epoch": 0.9584664536741214,
      "grad_norm": 2.7324693594411613,
      "learning_rate": 2e-05,
      "loss": 0.2264,
      "step": 300
    },
    {
      "epoch": 0.9744408945686901,
      "grad_norm": 3.421741611446172,
      "learning_rate": 2e-05,
      "loss": 0.1995,
      "step": 305
    },
    {
      "epoch": 0.9904153354632588,
      "grad_norm": 2.9392575520935753,
      "learning_rate": 2e-05,
      "loss": 0.2168,
      "step": 310
    }
  ],
  "logging_steps": 5,
  "max_steps": 626,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 313,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 4095989514240.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}