{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 313, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01597444089456869, "grad_norm": 478.69457004238836, "learning_rate": 2e-05, "loss": 5.0794, "step": 5 }, { "epoch": 0.03194888178913738, "grad_norm": 215.5472393836919, "learning_rate": 2e-05, "loss": 1.7552, "step": 10 }, { "epoch": 0.04792332268370607, "grad_norm": 88.51180202926707, "learning_rate": 2e-05, "loss": 0.861, "step": 15 }, { "epoch": 0.06389776357827476, "grad_norm": 15.867724265639525, "learning_rate": 2e-05, "loss": 0.7908, "step": 20 }, { "epoch": 0.07987220447284345, "grad_norm": 9.962226402862825, "learning_rate": 2e-05, "loss": 0.5627, "step": 25 }, { "epoch": 0.09584664536741214, "grad_norm": 11.323650461972006, "learning_rate": 2e-05, "loss": 0.4492, "step": 30 }, { "epoch": 0.11182108626198083, "grad_norm": 5.618908250561753, "learning_rate": 2e-05, "loss": 0.3863, "step": 35 }, { "epoch": 0.12779552715654952, "grad_norm": 8.639980902230302, "learning_rate": 2e-05, "loss": 0.3724, "step": 40 }, { "epoch": 0.14376996805111822, "grad_norm": 6.065581794373812, "learning_rate": 2e-05, "loss": 0.3305, "step": 45 }, { "epoch": 0.1597444089456869, "grad_norm": 6.487222993623944, "learning_rate": 2e-05, "loss": 0.3454, "step": 50 }, { "epoch": 0.1757188498402556, "grad_norm": 8.157982493659246, "learning_rate": 2e-05, "loss": 0.31, "step": 55 }, { "epoch": 0.19169329073482427, "grad_norm": 4.433439880366275, "learning_rate": 2e-05, "loss": 0.3233, "step": 60 }, { "epoch": 0.20766773162939298, "grad_norm": 8.704032511156715, "learning_rate": 2e-05, "loss": 0.3254, "step": 65 }, { "epoch": 0.22364217252396165, "grad_norm": 3.2354358181768, "learning_rate": 2e-05, "loss": 0.3028, "step": 70 }, { "epoch": 0.23961661341853036, "grad_norm": 3.927058406370219, "learning_rate": 2e-05, "loss": 0.2545, "step": 75 }, { "epoch": 0.25559105431309903, "grad_norm": 4.383347359544785, "learning_rate": 2e-05, "loss": 0.2766, "step": 80 }, { "epoch": 0.2715654952076677, "grad_norm": 4.755401718885403, "learning_rate": 2e-05, "loss": 0.2756, "step": 85 }, { "epoch": 0.28753993610223644, "grad_norm": 7.018973526139115, "learning_rate": 2e-05, "loss": 0.2579, "step": 90 }, { "epoch": 0.3035143769968051, "grad_norm": 6.272026448721462, "learning_rate": 2e-05, "loss": 0.2971, "step": 95 }, { "epoch": 0.3194888178913738, "grad_norm": 4.8079684113378365, "learning_rate": 2e-05, "loss": 0.3307, "step": 100 }, { "epoch": 0.3354632587859425, "grad_norm": 4.028493080280556, "learning_rate": 2e-05, "loss": 0.2727, "step": 105 }, { "epoch": 0.3514376996805112, "grad_norm": 5.388707606364108, "learning_rate": 2e-05, "loss": 0.2822, "step": 110 }, { "epoch": 0.36741214057507987, "grad_norm": 3.730845411810028, "learning_rate": 2e-05, "loss": 0.2816, "step": 115 }, { "epoch": 0.38338658146964855, "grad_norm": 5.819780875953061, "learning_rate": 2e-05, "loss": 0.2438, "step": 120 }, { "epoch": 0.3993610223642173, "grad_norm": 5.818771077307558, "learning_rate": 2e-05, "loss": 0.2764, "step": 125 }, { "epoch": 0.41533546325878595, "grad_norm": 5.674449251632924, "learning_rate": 2e-05, "loss": 0.2679, "step": 130 }, { "epoch": 0.43130990415335463, "grad_norm": 3.5139138000890564, "learning_rate": 2e-05, "loss": 0.266, "step": 135 }, { "epoch": 0.4472843450479233, "grad_norm": 3.6050594093343644, "learning_rate": 2e-05, "loss": 0.2558, "step": 140 }, { "epoch": 0.46325878594249204, 
"grad_norm": 3.7736226262761248, "learning_rate": 2e-05, "loss": 0.2747, "step": 145 }, { "epoch": 0.4792332268370607, "grad_norm": 3.3294463018044382, "learning_rate": 2e-05, "loss": 0.2124, "step": 150 }, { "epoch": 0.4952076677316294, "grad_norm": 3.978340934287849, "learning_rate": 2e-05, "loss": 0.2626, "step": 155 }, { "epoch": 0.5111821086261981, "grad_norm": 3.7733916384693997, "learning_rate": 2e-05, "loss": 0.3012, "step": 160 }, { "epoch": 0.5271565495207667, "grad_norm": 2.475405136211538, "learning_rate": 2e-05, "loss": 0.2506, "step": 165 }, { "epoch": 0.5431309904153354, "grad_norm": 2.623200763225571, "learning_rate": 2e-05, "loss": 0.2127, "step": 170 }, { "epoch": 0.5591054313099042, "grad_norm": 3.1075207472955797, "learning_rate": 2e-05, "loss": 0.2441, "step": 175 }, { "epoch": 0.5750798722044729, "grad_norm": 2.446477613149001, "learning_rate": 2e-05, "loss": 0.2124, "step": 180 }, { "epoch": 0.5910543130990416, "grad_norm": 4.2022279283216495, "learning_rate": 2e-05, "loss": 0.24, "step": 185 }, { "epoch": 0.6070287539936102, "grad_norm": 3.527771879306774, "learning_rate": 2e-05, "loss": 0.2458, "step": 190 }, { "epoch": 0.6230031948881789, "grad_norm": 3.5313927317162133, "learning_rate": 2e-05, "loss": 0.2714, "step": 195 }, { "epoch": 0.6389776357827476, "grad_norm": 3.6235305866137546, "learning_rate": 2e-05, "loss": 0.2653, "step": 200 }, { "epoch": 0.6549520766773163, "grad_norm": 4.876371447504886, "learning_rate": 2e-05, "loss": 0.2373, "step": 205 }, { "epoch": 0.670926517571885, "grad_norm": 3.5358993905726868, "learning_rate": 2e-05, "loss": 0.2205, "step": 210 }, { "epoch": 0.6869009584664537, "grad_norm": 2.4600844043540127, "learning_rate": 2e-05, "loss": 0.205, "step": 215 }, { "epoch": 0.7028753993610224, "grad_norm": 4.689947740869789, "learning_rate": 2e-05, "loss": 0.2497, "step": 220 }, { "epoch": 0.7188498402555911, "grad_norm": 3.8186352734247073, "learning_rate": 2e-05, "loss": 0.2624, "step": 225 }, { "epoch": 0.7348242811501597, "grad_norm": 4.186654907595584, "learning_rate": 2e-05, "loss": 0.2046, "step": 230 }, { "epoch": 0.7507987220447284, "grad_norm": 4.618434453667313, "learning_rate": 2e-05, "loss": 0.2297, "step": 235 }, { "epoch": 0.7667731629392971, "grad_norm": 1.6540359321412514, "learning_rate": 2e-05, "loss": 0.1976, "step": 240 }, { "epoch": 0.7827476038338658, "grad_norm": 2.966359474906274, "learning_rate": 2e-05, "loss": 0.2267, "step": 245 }, { "epoch": 0.7987220447284346, "grad_norm": 3.178498309301471, "learning_rate": 2e-05, "loss": 0.2015, "step": 250 }, { "epoch": 0.8146964856230032, "grad_norm": 3.0943406181806066, "learning_rate": 2e-05, "loss": 0.2088, "step": 255 }, { "epoch": 0.8306709265175719, "grad_norm": 2.601647495877313, "learning_rate": 2e-05, "loss": 0.1997, "step": 260 }, { "epoch": 0.8466453674121406, "grad_norm": 2.74734218285866, "learning_rate": 2e-05, "loss": 0.2271, "step": 265 }, { "epoch": 0.8626198083067093, "grad_norm": 4.600055126522387, "learning_rate": 2e-05, "loss": 0.2188, "step": 270 }, { "epoch": 0.8785942492012779, "grad_norm": 2.854778230115055, "learning_rate": 2e-05, "loss": 0.2136, "step": 275 }, { "epoch": 0.8945686900958466, "grad_norm": 5.6767551180163185, "learning_rate": 2e-05, "loss": 0.2362, "step": 280 }, { "epoch": 0.9105431309904153, "grad_norm": 2.4685062213282705, "learning_rate": 2e-05, "loss": 0.2108, "step": 285 }, { "epoch": 0.9265175718849841, "grad_norm": 4.1197310782397, "learning_rate": 2e-05, "loss": 0.2084, "step": 290 }, { "epoch": 0.9424920127795527, 
"grad_norm": 3.4714190539955085, "learning_rate": 2e-05, "loss": 0.2327, "step": 295 }, { "epoch": 0.9584664536741214, "grad_norm": 2.7324693594411613, "learning_rate": 2e-05, "loss": 0.2264, "step": 300 }, { "epoch": 0.9744408945686901, "grad_norm": 3.421741611446172, "learning_rate": 2e-05, "loss": 0.1995, "step": 305 }, { "epoch": 0.9904153354632588, "grad_norm": 2.9392575520935753, "learning_rate": 2e-05, "loss": 0.2168, "step": 310 } ], "logging_steps": 5, "max_steps": 626, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 313, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4095989514240.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }