{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.104816696762272, "global_step": 193000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.21, "learning_rate": 0.0002955626928115623, "loss": 1.1705, "step": 5000 }, { "epoch": 0.21, "eval_loss": 0.22145646810531616, "eval_runtime": 5082.7114, "eval_samples_per_second": 4.02, "step": 5000 }, { "epoch": 0.42, "learning_rate": 0.0002892236825423657, "loss": 0.2914, "step": 10000 }, { "epoch": 0.42, "eval_loss": 0.19062571227550507, "eval_runtime": 5086.3406, "eval_samples_per_second": 4.018, "step": 10000 }, { "epoch": 0.63, "learning_rate": 0.00028288467227316906, "loss": 0.2617, "step": 15000 }, { "epoch": 0.63, "eval_loss": 0.1784171611070633, "eval_runtime": 4895.8348, "eval_samples_per_second": 4.174, "step": 15000 }, { "epoch": 0.84, "learning_rate": 0.0002765456620039724, "loss": 0.2449, "step": 20000 }, { "epoch": 0.84, "eval_loss": 0.16641439497470856, "eval_runtime": 4945.3495, "eval_samples_per_second": 4.132, "step": 20000 }, { "epoch": 1.05, "learning_rate": 0.0002702066517347758, "loss": 0.2264, "step": 25000 }, { "epoch": 1.05, "eval_loss": 0.15650227665901184, "eval_runtime": 4999.732, "eval_samples_per_second": 4.087, "step": 25000 }, { "epoch": 1.26, "learning_rate": 0.00026386764146557915, "loss": 0.2093, "step": 30000 }, { "epoch": 1.26, "eval_loss": 0.15184776484966278, "eval_runtime": 4877.0685, "eval_samples_per_second": 4.19, "step": 30000 }, { "epoch": 1.47, "learning_rate": 0.0002575286311963825, "loss": 0.2003, "step": 35000 }, { "epoch": 1.47, "eval_loss": 0.14190027117729187, "eval_runtime": 4879.5538, "eval_samples_per_second": 4.188, "step": 35000 }, { "epoch": 1.68, "learning_rate": 0.0002511896209271859, "loss": 0.1962, "step": 40000 }, { "epoch": 1.68, "eval_loss": 0.14596430957317352, "eval_runtime": 4860.8654, "eval_samples_per_second": 4.204, "step": 40000 }, { "epoch": 1.89, "learning_rate": 0.00024485061065798925, "loss": 0.19, "step": 45000 }, { "epoch": 1.89, "eval_loss": 0.13479308784008026, "eval_runtime": 4872.4496, "eval_samples_per_second": 4.194, "step": 45000 }, { "epoch": 2.1, "learning_rate": 0.00023851160038879262, "loss": 0.1769, "step": 50000 }, { "epoch": 2.1, "eval_loss": 0.13018357753753662, "eval_runtime": 4872.3296, "eval_samples_per_second": 4.194, "step": 50000 }, { "epoch": 2.31, "learning_rate": 0.00023217259011959596, "loss": 0.1674, "step": 55000 }, { "epoch": 2.31, "eval_loss": 0.1303720772266388, "eval_runtime": 4874.4649, "eval_samples_per_second": 4.192, "step": 55000 }, { "epoch": 2.52, "learning_rate": 0.00022583357985039935, "loss": 0.1655, "step": 60000 }, { "epoch": 2.52, "eval_loss": 0.12317115068435669, "eval_runtime": 4882.2049, "eval_samples_per_second": 4.186, "step": 60000 }, { "epoch": 2.73, "learning_rate": 0.00021949456958120271, "loss": 0.1608, "step": 65000 }, { "epoch": 2.73, "eval_loss": 0.12056649476289749, "eval_runtime": 4877.3831, "eval_samples_per_second": 4.19, "step": 65000 }, { "epoch": 2.94, "learning_rate": 0.00021315555931200605, "loss": 0.1565, "step": 70000 }, { "epoch": 2.94, "eval_loss": 0.11486475169658661, "eval_runtime": 4876.7317, "eval_samples_per_second": 4.19, "step": 70000 }, { "epoch": 3.15, "learning_rate": 0.00020681654904280945, "loss": 0.146, "step": 75000 }, { "epoch": 3.15, "eval_loss": 0.11829441785812378, "eval_runtime": 4833.5153, "eval_samples_per_second": 4.228, "step": 75000 }, { "epoch": 3.36, "learning_rate": 0.00020047753877361279, "loss": 0.1403, "step": 80000 }, { "epoch": 3.36, "eval_loss": 0.11143175512552261, "eval_runtime": 4853.9814, "eval_samples_per_second": 4.21, "step": 80000 }, { "epoch": 3.57, "learning_rate": 0.00019413852850441618, "loss": 0.1376, "step": 85000 }, { "epoch": 3.57, "eval_loss": 0.11027190089225769, "eval_runtime": 4858.6043, "eval_samples_per_second": 4.206, "step": 85000 }, { "epoch": 3.78, "learning_rate": 0.00018779951823521952, "loss": 0.1337, "step": 90000 }, { "epoch": 3.78, "eval_loss": 0.10872453451156616, "eval_runtime": 4864.6388, "eval_samples_per_second": 4.201, "step": 90000 }, { "epoch": 3.99, "learning_rate": 0.00018146050796602288, "loss": 0.1325, "step": 95000 }, { "epoch": 3.99, "eval_loss": 0.10718829929828644, "eval_runtime": 4921.9134, "eval_samples_per_second": 4.152, "step": 95000 }, { "epoch": 4.2, "learning_rate": 0.00017512149769682625, "loss": 0.1195, "step": 100000 }, { "epoch": 4.2, "eval_loss": 0.10430513322353363, "eval_runtime": 4864.6263, "eval_samples_per_second": 4.201, "step": 100000 }, { "epoch": 4.41, "learning_rate": 0.00016878248742762961, "loss": 0.118, "step": 105000 }, { "epoch": 4.41, "eval_loss": 0.1070966124534607, "eval_runtime": 4879.2783, "eval_samples_per_second": 4.188, "step": 105000 }, { "epoch": 4.62, "learning_rate": 0.00016244347715843295, "loss": 0.1173, "step": 110000 }, { "epoch": 4.62, "eval_loss": 0.10433077067136765, "eval_runtime": 4876.4984, "eval_samples_per_second": 4.191, "step": 110000 }, { "epoch": 4.83, "learning_rate": 0.00015610446688923635, "loss": 0.115, "step": 115000 }, { "epoch": 4.83, "eval_loss": 0.09682977199554443, "eval_runtime": 4893.4652, "eval_samples_per_second": 4.176, "step": 115000 }, { "epoch": 5.04, "learning_rate": 0.0001497654566200397, "loss": 0.1102, "step": 120000 }, { "epoch": 5.04, "eval_loss": 0.09630288183689117, "eval_runtime": 4914.2049, "eval_samples_per_second": 4.158, "step": 120000 }, { "epoch": 5.25, "learning_rate": 0.00014342644635084308, "loss": 0.1019, "step": 125000 }, { "epoch": 5.25, "eval_loss": 0.0918075293302536, "eval_runtime": 4893.9499, "eval_samples_per_second": 4.176, "step": 125000 }, { "epoch": 5.46, "learning_rate": 0.00013708743608164644, "loss": 0.1014, "step": 130000 }, { "epoch": 5.46, "eval_loss": 0.09067174792289734, "eval_runtime": 4891.4795, "eval_samples_per_second": 4.178, "step": 130000 }, { "epoch": 5.67, "learning_rate": 0.0001307484258124498, "loss": 0.1, "step": 135000 }, { "epoch": 5.67, "eval_loss": 0.08851899951696396, "eval_runtime": 4884.1688, "eval_samples_per_second": 4.184, "step": 135000 }, { "epoch": 5.88, "learning_rate": 0.00012440941554325318, "loss": 0.0971, "step": 140000 }, { "epoch": 5.88, "eval_loss": 0.08720648288726807, "eval_runtime": 4896.4597, "eval_samples_per_second": 4.173, "step": 140000 }, { "epoch": 6.09, "learning_rate": 0.00011807040527405654, "loss": 0.0921, "step": 145000 }, { "epoch": 6.09, "eval_loss": 0.08666499704122543, "eval_runtime": 5117.0953, "eval_samples_per_second": 3.993, "step": 145000 }, { "epoch": 6.3, "learning_rate": 0.00011173139500485991, "loss": 0.0884, "step": 150000 }, { "epoch": 6.3, "eval_loss": 0.0831904485821724, "eval_runtime": 4879.0536, "eval_samples_per_second": 4.188, "step": 150000 }, { "epoch": 6.51, "learning_rate": 0.00010539238473566326, "loss": 0.0864, "step": 155000 }, { "epoch": 6.51, "eval_loss": 0.08337873965501785, "eval_runtime": 4900.7475, "eval_samples_per_second": 4.17, "step": 155000 }, { "epoch": 6.72, "learning_rate": 9.905337446646663e-05, "loss": 0.0861, "step": 160000 }, { "epoch": 6.72, "eval_loss": 0.08155979961156845, "eval_runtime": 4919.8246, "eval_samples_per_second": 4.154, "step": 160000 }, { "epoch": 6.93, "learning_rate": 9.271436419726999e-05, "loss": 0.083, "step": 165000 }, { "epoch": 6.93, "eval_loss": 0.08167865127325058, "eval_runtime": 4921.4971, "eval_samples_per_second": 4.152, "step": 165000 }, { "epoch": 7.14, "learning_rate": 8.637535392807336e-05, "loss": 0.0769, "step": 170000 }, { "epoch": 7.14, "eval_loss": 0.0775604099035263, "eval_runtime": 4893.4221, "eval_samples_per_second": 4.176, "step": 170000 }, { "epoch": 7.35, "learning_rate": 8.003634365887672e-05, "loss": 0.0749, "step": 175000 }, { "epoch": 7.35, "eval_loss": 0.07773936539888382, "eval_runtime": 4914.0163, "eval_samples_per_second": 4.159, "step": 175000 }, { "epoch": 7.56, "learning_rate": 7.369733338968009e-05, "loss": 0.0735, "step": 180000 }, { "epoch": 7.56, "eval_loss": 0.07420430332422256, "eval_runtime": 4934.0827, "eval_samples_per_second": 4.142, "step": 180000 }, { "epoch": 7.77, "learning_rate": 6.735832312048346e-05, "loss": 0.0715, "step": 185000 }, { "epoch": 7.77, "eval_loss": 0.07269106060266495, "eval_runtime": 4925.1046, "eval_samples_per_second": 4.149, "step": 185000 }, { "epoch": 7.98, "learning_rate": 6.1019312851286814e-05, "loss": 0.0702, "step": 190000 }, { "epoch": 7.98, "eval_loss": 0.07183075696229935, "eval_runtime": 4948.9933, "eval_samples_per_second": 4.129, "step": 190000 } ], "max_steps": 238130, "num_train_epochs": 10, "total_flos": 5.387421756388246e+20, "trial_name": null, "trial_params": null }