{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 1312, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007621951219512195, "grad_norm": 4.6875, "learning_rate": 1.5151515151515152e-06, "loss": 1.6918, "step": 10 }, { "epoch": 0.01524390243902439, "grad_norm": 5.0625, "learning_rate": 3.0303030303030305e-06, "loss": 1.6227, "step": 20 }, { "epoch": 0.022865853658536585, "grad_norm": 4.34375, "learning_rate": 4.5454545454545455e-06, "loss": 1.6367, "step": 30 }, { "epoch": 0.03048780487804878, "grad_norm": 4.21875, "learning_rate": 6.060606060606061e-06, "loss": 1.493, "step": 40 }, { "epoch": 0.038109756097560975, "grad_norm": 2.921875, "learning_rate": 7.5757575757575764e-06, "loss": 1.441, "step": 50 }, { "epoch": 0.04573170731707317, "grad_norm": 3.078125, "learning_rate": 9.090909090909091e-06, "loss": 1.3488, "step": 60 }, { "epoch": 0.053353658536585365, "grad_norm": 2.828125, "learning_rate": 1.0606060606060606e-05, "loss": 1.3289, "step": 70 }, { "epoch": 0.06097560975609756, "grad_norm": 2.765625, "learning_rate": 1.2121212121212122e-05, "loss": 1.2742, "step": 80 }, { "epoch": 0.06859756097560976, "grad_norm": 2.890625, "learning_rate": 1.3636363636363637e-05, "loss": 1.2758, "step": 90 }, { "epoch": 0.07621951219512195, "grad_norm": 2.859375, "learning_rate": 1.5151515151515153e-05, "loss": 1.2707, "step": 100 }, { "epoch": 0.08384146341463415, "grad_norm": 2.890625, "learning_rate": 1.6666666666666667e-05, "loss": 1.273, "step": 110 }, { "epoch": 0.09146341463414634, "grad_norm": 2.765625, "learning_rate": 1.8181818181818182e-05, "loss": 1.2414, "step": 120 }, { "epoch": 0.09908536585365854, "grad_norm": 2.75, "learning_rate": 1.96969696969697e-05, "loss": 1.275, "step": 130 }, { "epoch": 0.10670731707317073, "grad_norm": 2.78125, "learning_rate": 1.986440677966102e-05, "loss": 1.2547, "step": 140 }, { "epoch": 0.11432926829268293, "grad_norm": 2.578125, "learning_rate": 1.969491525423729e-05, "loss": 1.2391, "step": 150 }, { "epoch": 0.12195121951219512, "grad_norm": 2.75, "learning_rate": 1.9525423728813562e-05, "loss": 1.2379, "step": 160 }, { "epoch": 0.12957317073170732, "grad_norm": 2.65625, "learning_rate": 1.9355932203389832e-05, "loss": 1.2105, "step": 170 }, { "epoch": 0.13719512195121952, "grad_norm": 2.53125, "learning_rate": 1.9186440677966102e-05, "loss": 1.2359, "step": 180 }, { "epoch": 0.1448170731707317, "grad_norm": 2.75, "learning_rate": 1.9016949152542375e-05, "loss": 1.2867, "step": 190 }, { "epoch": 0.1524390243902439, "grad_norm": 2.8125, "learning_rate": 1.8847457627118645e-05, "loss": 1.1762, "step": 200 }, { "epoch": 0.1600609756097561, "grad_norm": 2.765625, "learning_rate": 1.8677966101694918e-05, "loss": 1.1992, "step": 210 }, { "epoch": 0.1676829268292683, "grad_norm": 2.75, "learning_rate": 1.8508474576271188e-05, "loss": 1.232, "step": 220 }, { "epoch": 0.17530487804878048, "grad_norm": 2.65625, "learning_rate": 1.8338983050847458e-05, "loss": 1.2172, "step": 230 }, { "epoch": 0.18292682926829268, "grad_norm": 2.734375, "learning_rate": 1.816949152542373e-05, "loss": 1.182, "step": 240 }, { "epoch": 0.19054878048780488, "grad_norm": 2.875, "learning_rate": 1.8e-05, "loss": 1.2492, "step": 250 }, { "epoch": 0.19817073170731708, "grad_norm": 2.71875, "learning_rate": 1.7830508474576274e-05, "loss": 1.2312, "step": 260 }, { "epoch": 0.20579268292682926, "grad_norm": 2.65625, "learning_rate": 1.7661016949152543e-05, "loss": 
1.241, "step": 270 }, { "epoch": 0.21341463414634146, "grad_norm": 2.78125, "learning_rate": 1.7491525423728813e-05, "loss": 1.1879, "step": 280 }, { "epoch": 0.22103658536585366, "grad_norm": 2.734375, "learning_rate": 1.7322033898305086e-05, "loss": 1.2211, "step": 290 }, { "epoch": 0.22865853658536586, "grad_norm": 2.734375, "learning_rate": 1.715254237288136e-05, "loss": 1.2152, "step": 300 }, { "epoch": 0.23628048780487804, "grad_norm": 2.8125, "learning_rate": 1.698305084745763e-05, "loss": 1.1938, "step": 310 }, { "epoch": 0.24390243902439024, "grad_norm": 2.71875, "learning_rate": 1.68135593220339e-05, "loss": 1.2266, "step": 320 }, { "epoch": 0.25152439024390244, "grad_norm": 2.84375, "learning_rate": 1.6644067796610172e-05, "loss": 1.1672, "step": 330 }, { "epoch": 0.25914634146341464, "grad_norm": 2.71875, "learning_rate": 1.6474576271186442e-05, "loss": 1.2104, "step": 340 }, { "epoch": 0.26676829268292684, "grad_norm": 2.640625, "learning_rate": 1.6305084745762715e-05, "loss": 1.2098, "step": 350 }, { "epoch": 0.27439024390243905, "grad_norm": 2.703125, "learning_rate": 1.6135593220338985e-05, "loss": 1.1684, "step": 360 }, { "epoch": 0.2820121951219512, "grad_norm": 2.765625, "learning_rate": 1.5966101694915255e-05, "loss": 1.1734, "step": 370 }, { "epoch": 0.2896341463414634, "grad_norm": 2.546875, "learning_rate": 1.5796610169491528e-05, "loss": 1.2008, "step": 380 }, { "epoch": 0.2972560975609756, "grad_norm": 2.78125, "learning_rate": 1.5627118644067798e-05, "loss": 1.1848, "step": 390 }, { "epoch": 0.3048780487804878, "grad_norm": 2.625, "learning_rate": 1.545762711864407e-05, "loss": 1.2033, "step": 400 }, { "epoch": 0.3125, "grad_norm": 2.625, "learning_rate": 1.528813559322034e-05, "loss": 1.1781, "step": 410 }, { "epoch": 0.3201219512195122, "grad_norm": 2.96875, "learning_rate": 1.511864406779661e-05, "loss": 1.2043, "step": 420 }, { "epoch": 0.3277439024390244, "grad_norm": 2.453125, "learning_rate": 1.4949152542372882e-05, "loss": 1.1699, "step": 430 }, { "epoch": 0.3353658536585366, "grad_norm": 2.5625, "learning_rate": 1.4779661016949153e-05, "loss": 1.1871, "step": 440 }, { "epoch": 0.3429878048780488, "grad_norm": 2.578125, "learning_rate": 1.4610169491525426e-05, "loss": 1.1418, "step": 450 }, { "epoch": 0.35060975609756095, "grad_norm": 2.640625, "learning_rate": 1.4440677966101698e-05, "loss": 1.1824, "step": 460 }, { "epoch": 0.35823170731707316, "grad_norm": 2.828125, "learning_rate": 1.4271186440677966e-05, "loss": 1.176, "step": 470 }, { "epoch": 0.36585365853658536, "grad_norm": 2.96875, "learning_rate": 1.4101694915254239e-05, "loss": 1.1785, "step": 480 }, { "epoch": 0.37347560975609756, "grad_norm": 2.796875, "learning_rate": 1.393220338983051e-05, "loss": 1.1758, "step": 490 }, { "epoch": 0.38109756097560976, "grad_norm": 2.75, "learning_rate": 1.3762711864406782e-05, "loss": 1.1988, "step": 500 }, { "epoch": 0.38871951219512196, "grad_norm": 2.578125, "learning_rate": 1.3593220338983053e-05, "loss": 1.1822, "step": 510 }, { "epoch": 0.39634146341463417, "grad_norm": 2.953125, "learning_rate": 1.3423728813559323e-05, "loss": 1.1801, "step": 520 }, { "epoch": 0.40396341463414637, "grad_norm": 2.640625, "learning_rate": 1.3254237288135595e-05, "loss": 1.2043, "step": 530 }, { "epoch": 0.4115853658536585, "grad_norm": 2.546875, "learning_rate": 1.3084745762711866e-05, "loss": 1.2098, "step": 540 }, { "epoch": 0.4192073170731707, "grad_norm": 2.625, "learning_rate": 1.2915254237288137e-05, "loss": 1.1973, "step": 550 }, { "epoch": 0.4268292682926829, 
"grad_norm": 2.546875, "learning_rate": 1.2745762711864407e-05, "loss": 1.2066, "step": 560 }, { "epoch": 0.4344512195121951, "grad_norm": 2.4375, "learning_rate": 1.2576271186440679e-05, "loss": 1.1543, "step": 570 }, { "epoch": 0.4420731707317073, "grad_norm": 2.640625, "learning_rate": 1.240677966101695e-05, "loss": 1.1703, "step": 580 }, { "epoch": 0.4496951219512195, "grad_norm": 3.09375, "learning_rate": 1.2237288135593222e-05, "loss": 1.1566, "step": 590 }, { "epoch": 0.4573170731707317, "grad_norm": 2.578125, "learning_rate": 1.2067796610169493e-05, "loss": 1.1623, "step": 600 }, { "epoch": 0.4649390243902439, "grad_norm": 2.734375, "learning_rate": 1.1898305084745763e-05, "loss": 1.198, "step": 610 }, { "epoch": 0.4725609756097561, "grad_norm": 2.78125, "learning_rate": 1.1728813559322034e-05, "loss": 1.1971, "step": 620 }, { "epoch": 0.4801829268292683, "grad_norm": 2.546875, "learning_rate": 1.1559322033898306e-05, "loss": 1.159, "step": 630 }, { "epoch": 0.4878048780487805, "grad_norm": 2.84375, "learning_rate": 1.1389830508474577e-05, "loss": 1.184, "step": 640 }, { "epoch": 0.4954268292682927, "grad_norm": 2.578125, "learning_rate": 1.1220338983050849e-05, "loss": 1.1859, "step": 650 }, { "epoch": 0.5030487804878049, "grad_norm": 2.609375, "learning_rate": 1.1050847457627118e-05, "loss": 1.1434, "step": 660 }, { "epoch": 0.510670731707317, "grad_norm": 2.671875, "learning_rate": 1.088135593220339e-05, "loss": 1.1775, "step": 670 }, { "epoch": 0.5182926829268293, "grad_norm": 2.515625, "learning_rate": 1.0711864406779661e-05, "loss": 1.15, "step": 680 }, { "epoch": 0.5259146341463414, "grad_norm": 2.609375, "learning_rate": 1.0542372881355933e-05, "loss": 1.1613, "step": 690 }, { "epoch": 0.5335365853658537, "grad_norm": 2.5625, "learning_rate": 1.0372881355932204e-05, "loss": 1.1602, "step": 700 }, { "epoch": 0.5411585365853658, "grad_norm": 2.671875, "learning_rate": 1.0203389830508474e-05, "loss": 1.1879, "step": 710 }, { "epoch": 0.5487804878048781, "grad_norm": 2.625, "learning_rate": 1.0033898305084746e-05, "loss": 1.1941, "step": 720 }, { "epoch": 0.5564024390243902, "grad_norm": 2.8125, "learning_rate": 9.864406779661017e-06, "loss": 1.1574, "step": 730 }, { "epoch": 0.5640243902439024, "grad_norm": 2.75, "learning_rate": 9.69491525423729e-06, "loss": 1.1793, "step": 740 }, { "epoch": 0.5716463414634146, "grad_norm": 2.703125, "learning_rate": 9.52542372881356e-06, "loss": 1.1711, "step": 750 }, { "epoch": 0.5792682926829268, "grad_norm": 2.703125, "learning_rate": 9.355932203389831e-06, "loss": 1.1875, "step": 760 }, { "epoch": 0.586890243902439, "grad_norm": 2.8125, "learning_rate": 9.186440677966101e-06, "loss": 1.19, "step": 770 }, { "epoch": 0.5945121951219512, "grad_norm": 2.9375, "learning_rate": 9.016949152542374e-06, "loss": 1.141, "step": 780 }, { "epoch": 0.6021341463414634, "grad_norm": 2.75, "learning_rate": 8.847457627118646e-06, "loss": 1.1699, "step": 790 }, { "epoch": 0.6097560975609756, "grad_norm": 2.65625, "learning_rate": 8.677966101694915e-06, "loss": 1.1809, "step": 800 }, { "epoch": 0.6173780487804879, "grad_norm": 2.703125, "learning_rate": 8.508474576271187e-06, "loss": 1.1824, "step": 810 }, { "epoch": 0.625, "grad_norm": 2.796875, "learning_rate": 8.338983050847458e-06, "loss": 1.1729, "step": 820 }, { "epoch": 0.6326219512195121, "grad_norm": 2.765625, "learning_rate": 8.16949152542373e-06, "loss": 1.1113, "step": 830 }, { "epoch": 0.6402439024390244, "grad_norm": 2.5, "learning_rate": 8.000000000000001e-06, "loss": 1.1434, "step": 840 }, { 
"epoch": 0.6478658536585366, "grad_norm": 2.515625, "learning_rate": 7.830508474576271e-06, "loss": 1.1643, "step": 850 }, { "epoch": 0.6554878048780488, "grad_norm": 2.65625, "learning_rate": 7.661016949152543e-06, "loss": 1.1738, "step": 860 }, { "epoch": 0.663109756097561, "grad_norm": 2.609375, "learning_rate": 7.491525423728814e-06, "loss": 1.125, "step": 870 }, { "epoch": 0.6707317073170732, "grad_norm": 2.5625, "learning_rate": 7.3220338983050855e-06, "loss": 1.1723, "step": 880 }, { "epoch": 0.6783536585365854, "grad_norm": 2.546875, "learning_rate": 7.152542372881357e-06, "loss": 1.1617, "step": 890 }, { "epoch": 0.6859756097560976, "grad_norm": 2.6875, "learning_rate": 6.9830508474576275e-06, "loss": 1.1445, "step": 900 }, { "epoch": 0.6935975609756098, "grad_norm": 2.546875, "learning_rate": 6.813559322033899e-06, "loss": 1.1807, "step": 910 }, { "epoch": 0.7012195121951219, "grad_norm": 2.65625, "learning_rate": 6.64406779661017e-06, "loss": 1.1865, "step": 920 }, { "epoch": 0.7088414634146342, "grad_norm": 2.734375, "learning_rate": 6.474576271186441e-06, "loss": 1.198, "step": 930 }, { "epoch": 0.7164634146341463, "grad_norm": 2.765625, "learning_rate": 6.3050847457627125e-06, "loss": 1.1734, "step": 940 }, { "epoch": 0.7240853658536586, "grad_norm": 2.609375, "learning_rate": 6.135593220338983e-06, "loss": 1.1633, "step": 950 }, { "epoch": 0.7317073170731707, "grad_norm": 2.578125, "learning_rate": 5.9661016949152555e-06, "loss": 1.1953, "step": 960 }, { "epoch": 0.739329268292683, "grad_norm": 2.5, "learning_rate": 5.796610169491525e-06, "loss": 1.1879, "step": 970 }, { "epoch": 0.7469512195121951, "grad_norm": 2.609375, "learning_rate": 5.6271186440677975e-06, "loss": 1.1664, "step": 980 }, { "epoch": 0.7545731707317073, "grad_norm": 2.765625, "learning_rate": 5.457627118644067e-06, "loss": 1.1789, "step": 990 }, { "epoch": 0.7621951219512195, "grad_norm": 2.890625, "learning_rate": 5.28813559322034e-06, "loss": 1.1738, "step": 1000 }, { "epoch": 0.7698170731707317, "grad_norm": 2.6875, "learning_rate": 5.118644067796611e-06, "loss": 1.1582, "step": 1010 }, { "epoch": 0.7774390243902439, "grad_norm": 2.9375, "learning_rate": 4.949152542372882e-06, "loss": 1.1391, "step": 1020 }, { "epoch": 0.7850609756097561, "grad_norm": 2.578125, "learning_rate": 4.779661016949153e-06, "loss": 1.1652, "step": 1030 }, { "epoch": 0.7926829268292683, "grad_norm": 2.703125, "learning_rate": 4.610169491525424e-06, "loss": 1.1426, "step": 1040 }, { "epoch": 0.8003048780487805, "grad_norm": 2.71875, "learning_rate": 4.440677966101695e-06, "loss": 1.1713, "step": 1050 }, { "epoch": 0.8079268292682927, "grad_norm": 2.515625, "learning_rate": 4.271186440677967e-06, "loss": 1.1727, "step": 1060 }, { "epoch": 0.8155487804878049, "grad_norm": 2.59375, "learning_rate": 4.101694915254237e-06, "loss": 1.1695, "step": 1070 }, { "epoch": 0.823170731707317, "grad_norm": 2.671875, "learning_rate": 3.932203389830509e-06, "loss": 1.1672, "step": 1080 }, { "epoch": 0.8307926829268293, "grad_norm": 2.734375, "learning_rate": 3.76271186440678e-06, "loss": 1.2016, "step": 1090 }, { "epoch": 0.8384146341463414, "grad_norm": 2.90625, "learning_rate": 3.5932203389830512e-06, "loss": 1.1727, "step": 1100 }, { "epoch": 0.8460365853658537, "grad_norm": 2.84375, "learning_rate": 3.4237288135593223e-06, "loss": 1.1799, "step": 1110 }, { "epoch": 0.8536585365853658, "grad_norm": 2.8125, "learning_rate": 3.2542372881355933e-06, "loss": 1.1555, "step": 1120 }, { "epoch": 0.8612804878048781, "grad_norm": 2.6875, 
"learning_rate": 3.0847457627118648e-06, "loss": 1.1678, "step": 1130 }, { "epoch": 0.8689024390243902, "grad_norm": 2.59375, "learning_rate": 2.915254237288136e-06, "loss": 1.1578, "step": 1140 }, { "epoch": 0.8765243902439024, "grad_norm": 2.375, "learning_rate": 2.745762711864407e-06, "loss": 1.1535, "step": 1150 }, { "epoch": 0.8841463414634146, "grad_norm": 2.703125, "learning_rate": 2.576271186440678e-06, "loss": 1.1602, "step": 1160 }, { "epoch": 0.8917682926829268, "grad_norm": 2.78125, "learning_rate": 2.4067796610169493e-06, "loss": 1.1852, "step": 1170 }, { "epoch": 0.899390243902439, "grad_norm": 2.59375, "learning_rate": 2.2372881355932204e-06, "loss": 1.1605, "step": 1180 }, { "epoch": 0.9070121951219512, "grad_norm": 2.59375, "learning_rate": 2.0677966101694914e-06, "loss": 1.1451, "step": 1190 }, { "epoch": 0.9146341463414634, "grad_norm": 2.5625, "learning_rate": 1.8983050847457629e-06, "loss": 1.1496, "step": 1200 }, { "epoch": 0.9222560975609756, "grad_norm": 2.859375, "learning_rate": 1.728813559322034e-06, "loss": 1.1582, "step": 1210 }, { "epoch": 0.9298780487804879, "grad_norm": 2.65625, "learning_rate": 1.5593220338983054e-06, "loss": 1.227, "step": 1220 }, { "epoch": 0.9375, "grad_norm": 2.703125, "learning_rate": 1.3898305084745764e-06, "loss": 1.1977, "step": 1230 }, { "epoch": 0.9451219512195121, "grad_norm": 2.671875, "learning_rate": 1.2203389830508477e-06, "loss": 1.1885, "step": 1240 }, { "epoch": 0.9527439024390244, "grad_norm": 2.6875, "learning_rate": 1.0508474576271187e-06, "loss": 1.1549, "step": 1250 }, { "epoch": 0.9603658536585366, "grad_norm": 2.53125, "learning_rate": 8.813559322033899e-07, "loss": 1.1781, "step": 1260 }, { "epoch": 0.9679878048780488, "grad_norm": 2.5625, "learning_rate": 7.118644067796611e-07, "loss": 1.1957, "step": 1270 }, { "epoch": 0.975609756097561, "grad_norm": 2.640625, "learning_rate": 5.423728813559322e-07, "loss": 1.1898, "step": 1280 }, { "epoch": 0.9832317073170732, "grad_norm": 2.625, "learning_rate": 3.7288135593220347e-07, "loss": 1.1531, "step": 1290 }, { "epoch": 0.9908536585365854, "grad_norm": 2.6875, "learning_rate": 2.0338983050847458e-07, "loss": 1.1773, "step": 1300 }, { "epoch": 0.9984756097560976, "grad_norm": 2.640625, "learning_rate": 3.3898305084745764e-08, "loss": 1.0754, "step": 1310 }, { "epoch": 1.0, "step": 1312, "total_flos": 4.7710598211200614e+17, "train_loss": 1.2026635146722562, "train_runtime": 2875.3707, "train_samples_per_second": 14.591, "train_steps_per_second": 0.456 } ], "logging_steps": 10, "max_steps": 1312, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.7710598211200614e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }