{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 1000, "global_step": 50000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 1.2623244524002075, "learning_rate": 0.00019994965423831854, "loss": 1.4022, "step": 1000 }, { "epoch": 0.02, "eval_loss": 1.4888113737106323, "eval_runtime": 118.6929, "eval_samples_per_second": 53.415, "eval_steps_per_second": 13.354, "step": 1000 }, { "epoch": 0.04, "grad_norm": 1.0886244773864746, "learning_rate": 0.00019954719225730847, "loss": 1.39, "step": 2000 }, { "epoch": 0.04, "eval_loss": 1.4779928922653198, "eval_runtime": 119.2893, "eval_samples_per_second": 53.148, "eval_steps_per_second": 13.287, "step": 2000 }, { "epoch": 0.06, "grad_norm": 1.0032905340194702, "learning_rate": 0.00019874388886763944, "loss": 1.3591, "step": 3000 }, { "epoch": 0.06, "eval_loss": 1.4647990465164185, "eval_runtime": 119.1474, "eval_samples_per_second": 53.211, "eval_steps_per_second": 13.303, "step": 3000 }, { "epoch": 0.08, "grad_norm": 1.1680448055267334, "learning_rate": 0.00019754297868854073, "loss": 1.3373, "step": 4000 }, { "epoch": 0.08, "eval_loss": 1.4628264904022217, "eval_runtime": 118.5565, "eval_samples_per_second": 53.477, "eval_steps_per_second": 13.369, "step": 4000 }, { "epoch": 0.1, "grad_norm": 1.1541577577590942, "learning_rate": 0.00019594929736144976, "loss": 1.3124, "step": 5000 }, { "epoch": 0.1, "eval_loss": 1.4525669813156128, "eval_runtime": 119.2106, "eval_samples_per_second": 53.183, "eval_steps_per_second": 13.296, "step": 5000 }, { "epoch": 0.12, "grad_norm": 1.252061367034912, "learning_rate": 0.00019396926207859084, "loss": 1.3781, "step": 6000 }, { "epoch": 0.12, "eval_loss": 1.4199129343032837, "eval_runtime": 119.1684, "eval_samples_per_second": 53.202, "eval_steps_per_second": 13.301, "step": 6000 }, { "epoch": 0.14, "grad_norm": 1.1096590757369995, "learning_rate": 0.00019161084574320696, "loss": 1.4167, "step": 7000 }, { "epoch": 0.14, "eval_loss": 1.399798035621643, "eval_runtime": 117.7451, "eval_samples_per_second": 53.845, "eval_steps_per_second": 13.461, "step": 7000 }, { "epoch": 0.16, "grad_norm": 1.014760971069336, "learning_rate": 0.00018888354486549237, "loss": 1.4106, "step": 8000 }, { "epoch": 0.16, "eval_loss": 1.3875452280044556, "eval_runtime": 117.8417, "eval_samples_per_second": 53.801, "eval_steps_per_second": 13.45, "step": 8000 }, { "epoch": 0.18, "grad_norm": 0.991398274898529, "learning_rate": 0.00018579834132349772, "loss": 1.3985, "step": 9000 }, { "epoch": 0.18, "eval_loss": 1.3735474348068237, "eval_runtime": 118.1423, "eval_samples_per_second": 53.664, "eval_steps_per_second": 13.416, "step": 9000 }, { "epoch": 0.2, "grad_norm": 0.9979888796806335, "learning_rate": 0.0001823676581429833, "loss": 1.3924, "step": 10000 }, { "epoch": 0.2, "eval_loss": 1.3612616062164307, "eval_runtime": 119.0319, "eval_samples_per_second": 53.263, "eval_steps_per_second": 13.316, "step": 10000 }, { "epoch": 0.22, "grad_norm": 1.1004694700241089, "learning_rate": 0.00017860530947427875, "loss": 1.3758, "step": 11000 }, { "epoch": 0.22, "eval_loss": 1.3603907823562622, "eval_runtime": 119.1776, "eval_samples_per_second": 53.198, "eval_steps_per_second": 13.299, "step": 11000 }, { "epoch": 0.24, "grad_norm": 1.1753897666931152, "learning_rate": 0.0001745264449675755, "loss": 1.3609, "step": 12000 }, { "epoch": 0.24, "eval_loss": 1.3465052843093872, "eval_runtime": 118.067, "eval_samples_per_second": 53.698, "eval_steps_per_second": 13.425, "step": 12000 }, { "epoch": 0.26, "grad_norm": 1.2619237899780273, "learning_rate": 0.00017014748877063214, "loss": 1.344, "step": 13000 }, { "epoch": 0.26, "eval_loss": 1.3324896097183228, "eval_runtime": 118.0485, "eval_samples_per_second": 53.707, "eval_steps_per_second": 13.427, "step": 13000 }, { "epoch": 0.28, "grad_norm": 0.9625117778778076, "learning_rate": 0.00016548607339452853, "loss": 1.3335, "step": 14000 }, { "epoch": 0.28, "eval_loss": 1.323933482170105, "eval_runtime": 117.505, "eval_samples_per_second": 53.955, "eval_steps_per_second": 13.489, "step": 14000 }, { "epoch": 0.3, "grad_norm": 1.3606470823287964, "learning_rate": 0.00016056096871376667, "loss": 1.3241, "step": 15000 }, { "epoch": 0.3, "eval_loss": 1.314150333404541, "eval_runtime": 117.8746, "eval_samples_per_second": 53.786, "eval_steps_per_second": 13.446, "step": 15000 }, { "epoch": 0.32, "grad_norm": 0.9183652400970459, "learning_rate": 0.00015539200638661104, "loss": 1.3029, "step": 16000 }, { "epoch": 0.32, "eval_loss": 1.3047397136688232, "eval_runtime": 119.2254, "eval_samples_per_second": 53.177, "eval_steps_per_second": 13.294, "step": 16000 }, { "epoch": 0.34, "grad_norm": 0.9226890206336975, "learning_rate": 0.00015000000000000001, "loss": 1.3005, "step": 17000 }, { "epoch": 0.34, "eval_loss": 1.297969102859497, "eval_runtime": 119.8633, "eval_samples_per_second": 52.894, "eval_steps_per_second": 13.223, "step": 17000 }, { "epoch": 0.36, "grad_norm": 0.9322838187217712, "learning_rate": 0.00014440666126057744, "loss": 1.2951, "step": 18000 }, { "epoch": 0.36, "eval_loss": 1.2912102937698364, "eval_runtime": 119.9122, "eval_samples_per_second": 52.872, "eval_steps_per_second": 13.218, "step": 18000 }, { "epoch": 0.38, "grad_norm": 1.102362871170044, "learning_rate": 0.00013863451256931287, "loss": 1.313, "step": 19000 }, { "epoch": 0.38, "eval_loss": 1.282011866569519, "eval_runtime": 117.9359, "eval_samples_per_second": 53.758, "eval_steps_per_second": 13.44, "step": 19000 }, { "epoch": 0.4, "grad_norm": 1.0014485120773315, "learning_rate": 0.00013270679633174218, "loss": 1.2773, "step": 20000 }, { "epoch": 0.4, "eval_loss": 1.2778606414794922, "eval_runtime": 120.0646, "eval_samples_per_second": 52.805, "eval_steps_per_second": 13.201, "step": 20000 }, { "epoch": 0.42, "grad_norm": 0.8490633964538574, "learning_rate": 0.00012664738136900348, "loss": 1.2734, "step": 21000 }, { "epoch": 0.42, "eval_loss": 1.2667981386184692, "eval_runtime": 117.9405, "eval_samples_per_second": 53.756, "eval_steps_per_second": 13.439, "step": 21000 }, { "epoch": 0.44, "grad_norm": 0.9456785321235657, "learning_rate": 0.00012048066680651908, "loss": 1.2656, "step": 22000 }, { "epoch": 0.44, "eval_loss": 1.258193850517273, "eval_runtime": 121.0875, "eval_samples_per_second": 52.359, "eval_steps_per_second": 13.09, "step": 22000 }, { "epoch": 0.46, "grad_norm": 1.1480209827423096, "learning_rate": 0.00011423148382732853, "loss": 1.2522, "step": 23000 }, { "epoch": 0.46, "eval_loss": 1.2538591623306274, "eval_runtime": 118.1025, "eval_samples_per_second": 53.682, "eval_steps_per_second": 13.421, "step": 23000 }, { "epoch": 0.48, "grad_norm": 0.8957574963569641, "learning_rate": 0.00010792499568567884, "loss": 1.2519, "step": 24000 }, { "epoch": 0.48, "eval_loss": 1.2467232942581177, "eval_runtime": 118.0714, "eval_samples_per_second": 53.696, "eval_steps_per_second": 13.424, "step": 24000 }, { "epoch": 0.5, "grad_norm": 1.2852896451950073, "learning_rate": 0.00010158659638348081, "loss": 1.24, "step": 25000 }, { "epoch": 0.5, "eval_loss": 1.2399760484695435, "eval_runtime": 118.556, "eval_samples_per_second": 53.477, "eval_steps_per_second": 13.369, "step": 25000 }, { "epoch": 0.52, "grad_norm": 0.9100021719932556, "learning_rate": 9.524180841762577e-05, "loss": 1.2653, "step": 26000 }, { "epoch": 0.52, "eval_loss": 1.2347520589828491, "eval_runtime": 118.0581, "eval_samples_per_second": 53.702, "eval_steps_per_second": 13.426, "step": 26000 }, { "epoch": 0.54, "grad_norm": 1.3319469690322876, "learning_rate": 8.891618000989891e-05, "loss": 1.2313, "step": 27000 }, { "epoch": 0.54, "eval_loss": 1.2284266948699951, "eval_runtime": 118.5886, "eval_samples_per_second": 53.462, "eval_steps_per_second": 13.366, "step": 27000 }, { "epoch": 0.56, "grad_norm": 1.1901592016220093, "learning_rate": 8.263518223330697e-05, "loss": 1.2218, "step": 28000 }, { "epoch": 0.56, "eval_loss": 1.2233468294143677, "eval_runtime": 118.009, "eval_samples_per_second": 53.725, "eval_steps_per_second": 13.431, "step": 28000 }, { "epoch": 0.58, "grad_norm": 1.026314616203308, "learning_rate": 7.642410644905726e-05, "loss": 1.2275, "step": 29000 }, { "epoch": 0.58, "eval_loss": 1.2184290885925293, "eval_runtime": 119.1308, "eval_samples_per_second": 53.219, "eval_steps_per_second": 13.305, "step": 29000 }, { "epoch": 0.6, "grad_norm": 1.3237501382827759, "learning_rate": 7.030796246717255e-05, "loss": 1.2395, "step": 30000 }, { "epoch": 0.6, "eval_loss": 1.213285207748413, "eval_runtime": 118.8571, "eval_samples_per_second": 53.341, "eval_steps_per_second": 13.335, "step": 30000 }, { "epoch": 0.62, "grad_norm": 1.1643166542053223, "learning_rate": 6.431137784081282e-05, "loss": 1.2064, "step": 31000 }, { "epoch": 0.62, "eval_loss": 1.2104227542877197, "eval_runtime": 120.4291, "eval_samples_per_second": 52.645, "eval_steps_per_second": 13.161, "step": 31000 }, { "epoch": 0.64, "grad_norm": 0.7978019118309021, "learning_rate": 5.845849869981137e-05, "loss": 1.2141, "step": 32000 }, { "epoch": 0.64, "eval_loss": 1.2048578262329102, "eval_runtime": 119.224, "eval_samples_per_second": 53.177, "eval_steps_per_second": 13.294, "step": 32000 }, { "epoch": 0.66, "grad_norm": 1.1302543878555298, "learning_rate": 5.277289252273174e-05, "loss": 1.2054, "step": 33000 }, { "epoch": 0.66, "eval_loss": 1.2011561393737793, "eval_runtime": 118.3268, "eval_samples_per_second": 53.58, "eval_steps_per_second": 13.395, "step": 33000 }, { "epoch": 0.68, "grad_norm": 1.2881643772125244, "learning_rate": 4.727745323894976e-05, "loss": 1.2136, "step": 34000 }, { "epoch": 0.68, "eval_loss": 1.1976003646850586, "eval_runtime": 118.2934, "eval_samples_per_second": 53.596, "eval_steps_per_second": 13.399, "step": 34000 }, { "epoch": 0.7, "grad_norm": 0.8197779655456543, "learning_rate": 4.19943090428802e-05, "loss": 1.1883, "step": 35000 }, { "epoch": 0.7, "eval_loss": 1.1930813789367676, "eval_runtime": 117.6868, "eval_samples_per_second": 53.872, "eval_steps_per_second": 13.468, "step": 35000 }, { "epoch": 0.72, "grad_norm": 0.9713582992553711, "learning_rate": 3.694473329154778e-05, "loss": 1.2058, "step": 36000 }, { "epoch": 0.72, "eval_loss": 1.1900451183319092, "eval_runtime": 119.439, "eval_samples_per_second": 53.081, "eval_steps_per_second": 13.27, "step": 36000 }, { "epoch": 0.74, "grad_norm": 1.4913629293441772, "learning_rate": 3.21490588442868e-05, "loss": 1.1864, "step": 37000 }, { "epoch": 0.74, "eval_loss": 1.1863234043121338, "eval_runtime": 117.5219, "eval_samples_per_second": 53.947, "eval_steps_per_second": 13.487, "step": 37000 }, { "epoch": 0.76, "grad_norm": 1.2373270988464355, "learning_rate": 2.7626596189492983e-05, "loss": 1.1854, "step": 38000 }, { "epoch": 0.76, "eval_loss": 1.1844661235809326, "eval_runtime": 117.3667, "eval_samples_per_second": 54.019, "eval_steps_per_second": 13.505, "step": 38000 }, { "epoch": 0.78, "grad_norm": 1.1346243619918823, "learning_rate": 2.339555568810221e-05, "loss": 1.1954, "step": 39000 }, { "epoch": 0.78, "eval_loss": 1.1816102266311646, "eval_runtime": 118.4828, "eval_samples_per_second": 53.51, "eval_steps_per_second": 13.377, "step": 39000 }, { "epoch": 0.8, "grad_norm": 1.3110140562057495, "learning_rate": 1.947297424689414e-05, "loss": 1.1663, "step": 40000 }, { "epoch": 0.8, "eval_loss": 1.178844690322876, "eval_runtime": 118.8225, "eval_samples_per_second": 53.357, "eval_steps_per_second": 13.339, "step": 40000 }, { "epoch": 0.82, "grad_norm": 1.2288557291030884, "learning_rate": 1.587464671688187e-05, "loss": 1.1912, "step": 41000 }, { "epoch": 0.82, "eval_loss": 1.177311897277832, "eval_runtime": 119.099, "eval_samples_per_second": 53.233, "eval_steps_per_second": 13.308, "step": 41000 }, { "epoch": 0.84, "grad_norm": 1.0926482677459717, "learning_rate": 1.2615062293021507e-05, "loss": 1.1855, "step": 42000 }, { "epoch": 0.84, "eval_loss": 1.1756287813186646, "eval_runtime": 118.243, "eval_samples_per_second": 53.618, "eval_steps_per_second": 13.405, "step": 42000 }, { "epoch": 0.86, "grad_norm": 1.0959358215332031, "learning_rate": 9.707346171337894e-06, "loss": 1.1773, "step": 43000 }, { "epoch": 0.86, "eval_loss": 1.1744451522827148, "eval_runtime": 118.507, "eval_samples_per_second": 53.499, "eval_steps_per_second": 13.375, "step": 43000 }, { "epoch": 0.88, "grad_norm": 0.8658304810523987, "learning_rate": 7.163206698392744e-06, "loss": 1.1874, "step": 44000 }, { "epoch": 0.88, "eval_loss": 1.1730809211730957, "eval_runtime": 118.6435, "eval_samples_per_second": 53.437, "eval_steps_per_second": 13.359, "step": 44000 }, { "epoch": 0.9, "grad_norm": 1.18503737449646, "learning_rate": 4.992888225905468e-06, "loss": 1.1679, "step": 45000 }, { "epoch": 0.9, "eval_loss": 1.171962857246399, "eval_runtime": 119.251, "eval_samples_per_second": 53.165, "eval_steps_per_second": 13.291, "step": 45000 }, { "epoch": 0.92, "grad_norm": 1.209384560585022, "learning_rate": 3.2051298603643753e-06, "loss": 1.1776, "step": 46000 }, { "epoch": 0.92, "eval_loss": 1.1715712547302246, "eval_runtime": 120.3589, "eval_samples_per_second": 52.676, "eval_steps_per_second": 13.169, "step": 46000 }, { "epoch": 0.94, "grad_norm": 1.157520055770874, "learning_rate": 1.8071302737293295e-06, "loss": 1.1708, "step": 47000 }, { "epoch": 0.94, "eval_loss": 1.1712528467178345, "eval_runtime": 118.7182, "eval_samples_per_second": 53.404, "eval_steps_per_second": 13.351, "step": 47000 }, { "epoch": 0.96, "grad_norm": 1.0842524766921997, "learning_rate": 8.04518716920466e-07, "loss": 1.2052, "step": 48000 }, { "epoch": 0.96, "eval_loss": 1.170965313911438, "eval_runtime": 118.8637, "eval_samples_per_second": 53.338, "eval_steps_per_second": 13.335, "step": 48000 }, { "epoch": 0.98, "grad_norm": 1.1883560419082642, "learning_rate": 2.0133235281156736e-07, "loss": 1.168, "step": 49000 }, { "epoch": 0.98, "eval_loss": 1.1708762645721436, "eval_runtime": 118.2509, "eval_samples_per_second": 53.615, "eval_steps_per_second": 13.404, "step": 49000 }, { "epoch": 1.0, "grad_norm": 1.4387354850769043, "learning_rate": 0.0, "loss": 1.1727, "step": 50000 }, { "epoch": 1.0, "eval_loss": 1.1708616018295288, "eval_runtime": 118.0864, "eval_samples_per_second": 53.689, "eval_steps_per_second": 13.422, "step": 50000 } ], "logging_steps": 1000, "max_steps": 50000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.132424105984e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }