{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9528795811518327, "eval_steps": 500, "global_step": 141, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.041884816753926704, "grad_norm": 0.21733999252319336, "learning_rate": 1.3333333333333333e-05, "loss": 1.0015, "step": 2 }, { "epoch": 0.08376963350785341, "grad_norm": 0.21422453224658966, "learning_rate": 2.6666666666666667e-05, "loss": 0.9283, "step": 4 }, { "epoch": 0.1256544502617801, "grad_norm": 0.18293505907058716, "learning_rate": 4e-05, "loss": 0.8643, "step": 6 }, { "epoch": 0.16753926701570682, "grad_norm": 0.2287076711654663, "learning_rate": 5.333333333333333e-05, "loss": 1.0013, "step": 8 }, { "epoch": 0.2094240837696335, "grad_norm": 0.2509159445762634, "learning_rate": 6.666666666666667e-05, "loss": 0.9341, "step": 10 }, { "epoch": 0.2513089005235602, "grad_norm": 0.21400025486946106, "learning_rate": 8e-05, "loss": 0.9068, "step": 12 }, { "epoch": 0.2931937172774869, "grad_norm": 0.20343424379825592, "learning_rate": 9.333333333333334e-05, "loss": 0.8083, "step": 14 }, { "epoch": 0.33507853403141363, "grad_norm": 0.25129085779190063, "learning_rate": 9.998445910004082e-05, "loss": 0.8985, "step": 16 }, { "epoch": 0.3769633507853403, "grad_norm": 0.28799620270729065, "learning_rate": 9.986018985905901e-05, "loss": 0.9362, "step": 18 }, { "epoch": 0.418848167539267, "grad_norm": 0.29551103711128235, "learning_rate": 9.961196033000861e-05, "loss": 0.9174, "step": 20 }, { "epoch": 0.4607329842931937, "grad_norm": 0.31457847356796265, "learning_rate": 9.924038765061042e-05, "loss": 0.8732, "step": 22 }, { "epoch": 0.5026178010471204, "grad_norm": 0.2715758979320526, "learning_rate": 9.874639560909117e-05, "loss": 0.8653, "step": 24 }, { "epoch": 0.5445026178010471, "grad_norm": 0.25824904441833496, "learning_rate": 9.81312123475006e-05, "loss": 0.8592, "step": 26 }, { "epoch": 0.5863874345549738, "grad_norm": 0.22251802682876587, "learning_rate": 9.73963673083566e-05, "loss": 0.805, "step": 28 }, { "epoch": 0.6282722513089005, "grad_norm": 0.22290602326393127, "learning_rate": 9.654368743221022e-05, "loss": 0.7464, "step": 30 }, { "epoch": 0.6701570680628273, "grad_norm": 0.2132934331893921, "learning_rate": 9.557529261558367e-05, "loss": 0.7796, "step": 32 }, { "epoch": 0.7120418848167539, "grad_norm": 0.19897978007793427, "learning_rate": 9.449359044057345e-05, "loss": 0.7768, "step": 34 }, { "epoch": 0.7539267015706806, "grad_norm": 0.21434754133224487, "learning_rate": 9.330127018922194e-05, "loss": 0.8751, "step": 36 }, { "epoch": 0.7958115183246073, "grad_norm": 0.2214685082435608, "learning_rate": 9.200129615753859e-05, "loss": 0.7966, "step": 38 }, { "epoch": 0.837696335078534, "grad_norm": 0.20432740449905396, "learning_rate": 9.059690028579284e-05, "loss": 0.7963, "step": 40 }, { "epoch": 0.8795811518324608, "grad_norm": 0.23610389232635498, "learning_rate": 8.90915741234015e-05, "loss": 0.8332, "step": 42 }, { "epoch": 0.9214659685863874, "grad_norm": 0.23847244679927826, "learning_rate": 8.748906014838672e-05, "loss": 0.7869, "step": 44 }, { "epoch": 0.9633507853403142, "grad_norm": 0.23536360263824463, "learning_rate": 8.579334246298593e-05, "loss": 0.8517, "step": 46 }, { "epoch": 1.0052356020942408, "grad_norm": 0.24705368280410767, "learning_rate": 8.400863688854597e-05, "loss": 0.8694, "step": 48 }, { "epoch": 1.0471204188481675, "grad_norm": 0.22067400813102722, "learning_rate": 8.213938048432697e-05, "loss": 0.8002, "step": 50 }, { "epoch": 1.0890052356020943, "grad_norm": 0.21238695085048676, "learning_rate": 8.019022051627388e-05, "loss": 0.7798, "step": 52 }, { "epoch": 1.130890052356021, "grad_norm": 0.2280968427658081, "learning_rate": 7.81660029031811e-05, "loss": 0.8114, "step": 54 }, { "epoch": 1.1727748691099475, "grad_norm": 0.2327934205532074, "learning_rate": 7.60717601689749e-05, "loss": 0.8206, "step": 56 }, { "epoch": 1.2146596858638743, "grad_norm": 0.2437446266412735, "learning_rate": 7.391269893106592e-05, "loss": 0.8477, "step": 58 }, { "epoch": 1.256544502617801, "grad_norm": 0.2429206371307373, "learning_rate": 7.169418695587791e-05, "loss": 0.8213, "step": 60 }, { "epoch": 1.2984293193717278, "grad_norm": 0.23616257309913635, "learning_rate": 6.942173981373474e-05, "loss": 0.8287, "step": 62 }, { "epoch": 1.3403141361256545, "grad_norm": 0.2597900629043579, "learning_rate": 6.710100716628344e-05, "loss": 0.7643, "step": 64 }, { "epoch": 1.3821989528795813, "grad_norm": 0.2668094038963318, "learning_rate": 6.473775872054521e-05, "loss": 0.7315, "step": 66 }, { "epoch": 1.4240837696335078, "grad_norm": 0.2583785355091095, "learning_rate": 6.233786988451468e-05, "loss": 0.7811, "step": 68 }, { "epoch": 1.4659685863874345, "grad_norm": 0.2756960391998291, "learning_rate": 5.9907307159969884e-05, "loss": 0.754, "step": 70 }, { "epoch": 1.5078534031413613, "grad_norm": 0.28554120659828186, "learning_rate": 5.745211330880872e-05, "loss": 0.7809, "step": 72 }, { "epoch": 1.5497382198952878, "grad_norm": 0.3146248459815979, "learning_rate": 5.497839232979084e-05, "loss": 0.8319, "step": 74 }, { "epoch": 1.5916230366492146, "grad_norm": 0.2715282142162323, "learning_rate": 5.249229428303486e-05, "loss": 0.7745, "step": 76 }, { "epoch": 1.6335078534031413, "grad_norm": 0.28730642795562744, "learning_rate": 5e-05, "loss": 0.8002, "step": 78 }, { "epoch": 1.675392670157068, "grad_norm": 0.3208574652671814, "learning_rate": 4.750770571696514e-05, "loss": 0.7718, "step": 80 }, { "epoch": 1.7172774869109948, "grad_norm": 0.307959645986557, "learning_rate": 4.502160767020918e-05, "loss": 0.742, "step": 82 }, { "epoch": 1.7591623036649215, "grad_norm": 0.34215685725212097, "learning_rate": 4.254788669119127e-05, "loss": 0.7687, "step": 84 }, { "epoch": 1.8010471204188483, "grad_norm": 0.28537240624427795, "learning_rate": 4.0092692840030134e-05, "loss": 0.7066, "step": 86 }, { "epoch": 1.8429319371727748, "grad_norm": 0.31295880675315857, "learning_rate": 3.7662130115485314e-05, "loss": 0.7098, "step": 88 }, { "epoch": 1.8848167539267016, "grad_norm": 0.2994559705257416, "learning_rate": 3.5262241279454785e-05, "loss": 0.6961, "step": 90 }, { "epoch": 1.9267015706806283, "grad_norm": 0.3853859603404999, "learning_rate": 3.289899283371657e-05, "loss": 0.7948, "step": 92 }, { "epoch": 1.9685863874345548, "grad_norm": 0.3592422306537628, "learning_rate": 3.0578260186265265e-05, "loss": 0.7264, "step": 94 }, { "epoch": 2.0104712041884816, "grad_norm": 0.42988094687461853, "learning_rate": 2.8305813044122097e-05, "loss": 0.6696, "step": 96 }, { "epoch": 2.0523560209424083, "grad_norm": 0.3221912086009979, "learning_rate": 2.6087301068934106e-05, "loss": 0.7135, "step": 98 }, { "epoch": 2.094240837696335, "grad_norm": 0.3392656743526459, "learning_rate": 2.39282398310251e-05, "loss": 0.7865, "step": 100 }, { "epoch": 2.136125654450262, "grad_norm": 0.32586315274238586, "learning_rate": 2.1833997096818898e-05, "loss": 0.7411, "step": 102 }, { "epoch": 2.1780104712041886, "grad_norm": 0.36464083194732666, "learning_rate": 1.980977948372612e-05, "loss": 0.772, "step": 104 }, { "epoch": 2.2198952879581153, "grad_norm": 0.3270440697669983, "learning_rate": 1.7860619515673033e-05, "loss": 0.7096, "step": 106 }, { "epoch": 2.261780104712042, "grad_norm": 0.3391498923301697, "learning_rate": 1.599136311145402e-05, "loss": 0.7378, "step": 108 }, { "epoch": 2.303664921465969, "grad_norm": 0.3948444128036499, "learning_rate": 1.4206657537014079e-05, "loss": 0.6471, "step": 110 }, { "epoch": 2.345549738219895, "grad_norm": 0.38464757800102234, "learning_rate": 1.2510939851613285e-05, "loss": 0.7359, "step": 112 }, { "epoch": 2.387434554973822, "grad_norm": 0.38830411434173584, "learning_rate": 1.090842587659851e-05, "loss": 0.7572, "step": 114 }, { "epoch": 2.4293193717277486, "grad_norm": 0.37749606370925903, "learning_rate": 9.403099714207175e-06, "loss": 0.7411, "step": 116 }, { "epoch": 2.4712041884816753, "grad_norm": 0.3769814968109131, "learning_rate": 7.998703842461431e-06, "loss": 0.6692, "step": 118 }, { "epoch": 2.513089005235602, "grad_norm": 0.4309244453907013, "learning_rate": 6.698729810778065e-06, "loss": 0.7619, "step": 120 }, { "epoch": 2.554973821989529, "grad_norm": 0.37709513306617737, "learning_rate": 5.506409559426573e-06, "loss": 0.708, "step": 122 }, { "epoch": 2.5968586387434556, "grad_norm": 0.40647101402282715, "learning_rate": 4.424707384416344e-06, "loss": 0.7279, "step": 124 }, { "epoch": 2.6387434554973823, "grad_norm": 0.3731394112110138, "learning_rate": 3.4563125677897932e-06, "loss": 0.7629, "step": 126 }, { "epoch": 2.680628272251309, "grad_norm": 0.40869101881980896, "learning_rate": 2.603632691643415e-06, "loss": 0.7334, "step": 128 }, { "epoch": 2.7225130890052354, "grad_norm": 0.33611902594566345, "learning_rate": 1.8687876524993987e-06, "loss": 0.6643, "step": 130 }, { "epoch": 2.7643979057591626, "grad_norm": 0.38377344608306885, "learning_rate": 1.2536043909088191e-06, "loss": 0.7363, "step": 132 }, { "epoch": 2.806282722513089, "grad_norm": 0.38260236382484436, "learning_rate": 7.596123493895991e-07, "loss": 0.7018, "step": 134 }, { "epoch": 2.8481675392670156, "grad_norm": 0.3747893273830414, "learning_rate": 3.8803966999139684e-07, "loss": 0.8282, "step": 136 }, { "epoch": 2.8900523560209423, "grad_norm": 0.40073326230049133, "learning_rate": 1.3981014094099353e-07, "loss": 0.6387, "step": 138 }, { "epoch": 2.931937172774869, "grad_norm": 0.4021676480770111, "learning_rate": 1.5540899959187727e-08, "loss": 0.7076, "step": 140 }, { "epoch": 2.9528795811518327, "step": 141, "total_flos": 5.744674604829082e+16, "train_loss": 0.7884446775659602, "train_runtime": 510.788, "train_samples_per_second": 8.957, "train_steps_per_second": 0.276 } ], "logging_steps": 2, "max_steps": 141, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.744674604829082e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }