{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.955555555555556, "eval_steps": 1000, "global_step": 840, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.12, "grad_norm": 0.6363710165023804, "learning_rate": 2.5e-05, "loss": 2.9339, "step": 10 }, { "epoch": 0.24, "grad_norm": 0.7148603796958923, "learning_rate": 5e-05, "loss": 2.8737, "step": 20 }, { "epoch": 0.36, "grad_norm": 0.973531186580658, "learning_rate": 4.998165452627025e-05, "loss": 2.7756, "step": 30 }, { "epoch": 0.47, "grad_norm": 1.0926612615585327, "learning_rate": 4.993379398069312e-05, "loss": 2.6328, "step": 40 }, { "epoch": 0.59, "grad_norm": 1.2680305242538452, "learning_rate": 4.9845854359471646e-05, "loss": 2.5491, "step": 50 }, { "epoch": 0.71, "grad_norm": 1.4471039772033691, "learning_rate": 4.972145002077457e-05, "loss": 2.5006, "step": 60 }, { "epoch": 0.83, "grad_norm": 1.439332365989685, "learning_rate": 4.956076354512411e-05, "loss": 2.4194, "step": 70 }, { "epoch": 0.95, "grad_norm": 1.5911873579025269, "learning_rate": 4.9364030762081674e-05, "loss": 2.4886, "step": 80 }, { "epoch": 1.07, "grad_norm": 1.2575428485870361, "learning_rate": 4.913154040413551e-05, "loss": 2.3729, "step": 90 }, { "epoch": 1.19, "grad_norm": 1.4824930429458618, "learning_rate": 4.8863633682945956e-05, "loss": 2.3385, "step": 100 }, { "epoch": 1.3, "grad_norm": 1.5880047082901, "learning_rate": 4.856070378857025e-05, "loss": 2.2935, "step": 110 }, { "epoch": 1.42, "grad_norm": 2.0501768589019775, "learning_rate": 4.8223195312401935e-05, "loss": 2.2132, "step": 120 }, { "epoch": 1.54, "grad_norm": 1.5746724605560303, "learning_rate": 4.7851603594671653e-05, "loss": 2.3534, "step": 130 }, { "epoch": 1.66, "grad_norm": 1.7051725387573242, "learning_rate": 4.744647399746709e-05, "loss": 2.3176, "step": 140 }, { "epoch": 1.78, "grad_norm": 1.5449475049972534, "learning_rate": 4.700840110433883e-05, "loss": 2.3376, "step": 150 }, { "epoch": 1.9, "grad_norm": 3.40295147895813, "learning_rate": 4.65380278476671e-05, "loss": 2.3002, "step": 160 }, { "epoch": 2.01, "grad_norm": 1.291429877281189, "learning_rate": 4.603604456506977e-05, "loss": 2.3059, "step": 170 }, { "epoch": 2.13, "grad_norm": 1.512455701828003, "learning_rate": 4.550318798623673e-05, "loss": 2.2229, "step": 180 }, { "epoch": 2.25, "grad_norm": 1.698573112487793, "learning_rate": 4.4940240151677495e-05, "loss": 2.2402, "step": 190 }, { "epoch": 2.37, "grad_norm": 1.7456746101379395, "learning_rate": 4.434802726496884e-05, "loss": 2.2043, "step": 200 }, { "epoch": 2.49, "grad_norm": 1.8384778499603271, "learning_rate": 4.372741848018724e-05, "loss": 2.2061, "step": 210 }, { "epoch": 2.61, "grad_norm": 1.8294689655303955, "learning_rate": 4.3079324626305294e-05, "loss": 2.1849, "step": 220 }, { "epoch": 2.73, "grad_norm": 2.516619920730591, "learning_rate": 4.240469687042467e-05, "loss": 2.1896, "step": 230 }, { "epoch": 2.84, "grad_norm": 2.209205150604248, "learning_rate": 4.17045253218072e-05, "loss": 2.1794, "step": 240 }, { "epoch": 2.96, "grad_norm": 2.061786651611328, "learning_rate": 4.0979837578752986e-05, "loss": 2.2459, "step": 250 }, { "epoch": 3.08, "grad_norm": 2.4832069873809814, "learning_rate": 4.023169722045824e-05, "loss": 2.1904, "step": 260 }, { "epoch": 3.2, "grad_norm": 1.6924707889556885, "learning_rate": 3.946120224606611e-05, "loss": 2.1928, "step": 270 }, { "epoch": 3.32, "grad_norm": 1.4714900255203247, "learning_rate": 3.8669483463201505e-05, "loss": 2.1379, "step": 280 }, { "epoch": 3.44, "grad_norm": 1.8269611597061157, "learning_rate": 3.785770282835504e-05, "loss": 2.2067, "step": 290 }, { "epoch": 3.56, "grad_norm": 2.2453367710113525, "learning_rate": 3.702705174155156e-05, "loss": 2.1465, "step": 300 }, { "epoch": 3.67, "grad_norm": 2.802905797958374, "learning_rate": 3.617874929780641e-05, "loss": 2.2089, "step": 310 }, { "epoch": 3.79, "grad_norm": 1.8734266757965088, "learning_rate": 3.5314040497935315e-05, "loss": 2.2424, "step": 320 }, { "epoch": 3.91, "grad_norm": 1.577213168144226, "learning_rate": 3.443419442134402e-05, "loss": 2.1432, "step": 330 }, { "epoch": 4.03, "grad_norm": 2.034276008605957, "learning_rate": 3.3540502363479274e-05, "loss": 2.1789, "step": 340 }, { "epoch": 4.15, "grad_norm": 2.3561315536499023, "learning_rate": 3.263427594067468e-05, "loss": 2.1967, "step": 350 }, { "epoch": 4.27, "grad_norm": 3.1146442890167236, "learning_rate": 3.171684516517287e-05, "loss": 2.1886, "step": 360 }, { "epoch": 4.39, "grad_norm": 1.9383080005645752, "learning_rate": 3.0789556493149094e-05, "loss": 2.1295, "step": 370 }, { "epoch": 4.5, "grad_norm": 2.770754814147949, "learning_rate": 2.9853770848601164e-05, "loss": 2.12, "step": 380 }, { "epoch": 4.62, "grad_norm": 2.3625290393829346, "learning_rate": 2.8910861626005776e-05, "loss": 2.0874, "step": 390 }, { "epoch": 4.74, "grad_norm": 2.091344118118286, "learning_rate": 2.7962212674672738e-05, "loss": 2.1171, "step": 400 }, { "epoch": 4.86, "grad_norm": 2.646287679672241, "learning_rate": 2.7009216267755293e-05, "loss": 2.2145, "step": 410 }, { "epoch": 4.98, "grad_norm": 2.7173330783843994, "learning_rate": 2.6053271058897244e-05, "loss": 2.1575, "step": 420 }, { "epoch": 5.1, "grad_norm": 2.383789300918579, "learning_rate": 2.5095780029515896e-05, "loss": 2.1603, "step": 430 }, { "epoch": 5.21, "grad_norm": 1.9369804859161377, "learning_rate": 2.4138148429733323e-05, "loss": 2.2005, "step": 440 }, { "epoch": 5.33, "grad_norm": 4.025020122528076, "learning_rate": 2.3277321440960733e-05, "loss": 2.1382, "step": 450 }, { "epoch": 5.45, "grad_norm": 2.6233372688293457, "learning_rate": 2.2323293298151814e-05, "loss": 2.1584, "step": 460 }, { "epoch": 5.57, "grad_norm": 1.9993385076522827, "learning_rate": 2.1373193591541378e-05, "loss": 2.135, "step": 470 }, { "epoch": 5.69, "grad_norm": 2.5544209480285645, "learning_rate": 2.042841672346608e-05, "loss": 2.1118, "step": 480 }, { "epoch": 5.81, "grad_norm": 2.0512688159942627, "learning_rate": 1.9490349284263033e-05, "loss": 2.0913, "step": 490 }, { "epoch": 5.93, "grad_norm": 2.2882919311523438, "learning_rate": 1.8560368017257228e-05, "loss": 2.0777, "step": 500 }, { "epoch": 6.04, "grad_norm": 2.6118929386138916, "learning_rate": 1.7639837798200923e-05, "loss": 2.1112, "step": 510 }, { "epoch": 6.16, "grad_norm": 2.71341609954834, "learning_rate": 1.6730109632130198e-05, "loss": 2.1188, "step": 520 }, { "epoch": 6.28, "grad_norm": 2.200782299041748, "learning_rate": 1.5832518670578804e-05, "loss": 2.1232, "step": 530 }, { "epoch": 6.4, "grad_norm": 2.1322591304779053, "learning_rate": 1.4948382252059156e-05, "loss": 2.1321, "step": 540 }, { "epoch": 6.52, "grad_norm": 2.813279867172241, "learning_rate": 1.4078997968686425e-05, "loss": 2.13, "step": 550 }, { "epoch": 6.64, "grad_norm": 2.4658961296081543, "learning_rate": 1.3225641761783125e-05, "loss": 2.1187, "step": 560 }, { "epoch": 6.76, "grad_norm": 2.3500850200653076, "learning_rate": 1.2389566049259338e-05, "loss": 2.1212, "step": 570 }, { "epoch": 6.87, "grad_norm": 2.4541094303131104, "learning_rate": 1.1571997887516672e-05, "loss": 2.0698, "step": 580 }, { "epoch": 6.99, "grad_norm": 2.5647811889648438, "learning_rate": 1.0774137170573825e-05, "loss": 2.0031, "step": 590 }, { "epoch": 7.11, "grad_norm": 2.4078140258789062, "learning_rate": 9.997154869056589e-06, "loss": 2.1054, "step": 600 }, { "epoch": 7.23, "grad_norm": 2.722888946533203, "learning_rate": 9.24219131163705e-06, "loss": 2.0498, "step": 610 }, { "epoch": 7.35, "grad_norm": 2.400818347930908, "learning_rate": 8.510354511443974e-06, "loss": 2.0875, "step": 620 }, { "epoch": 7.47, "grad_norm": 2.3817691802978516, "learning_rate": 7.80271853990076e-06, "loss": 2.0564, "step": 630 }, { "epoch": 7.59, "grad_norm": 2.6328699588775635, "learning_rate": 7.1203219503774875e-06, "loss": 2.1291, "step": 640 }, { "epoch": 7.7, "grad_norm": 2.3237149715423584, "learning_rate": 6.464166253970671e-06, "loss": 2.0729, "step": 650 }, { "epoch": 7.82, "grad_norm": 2.3584659099578857, "learning_rate": 5.835214449647602e-06, "loss": 2.095, "step": 660 }, { "epoch": 7.94, "grad_norm": 2.5625736713409424, "learning_rate": 5.234389610912551e-06, "loss": 2.1288, "step": 670 }, { "epoch": 8.06, "grad_norm": 2.6210007667541504, "learning_rate": 4.662573531069139e-06, "loss": 2.1966, "step": 680 }, { "epoch": 8.18, "grad_norm": 2.7513816356658936, "learning_rate": 4.120605429067054e-06, "loss": 2.0396, "step": 690 }, { "epoch": 8.3, "grad_norm": 3.0069968700408936, "learning_rate": 3.6092807178324887e-06, "loss": 2.1039, "step": 700 }, { "epoch": 8.41, "grad_norm": 2.35355806350708, "learning_rate": 3.1293498368900417e-06, "loss": 2.1174, "step": 710 }, { "epoch": 8.53, "grad_norm": 2.615985155105591, "learning_rate": 2.6815171509891852e-06, "loss": 2.0799, "step": 720 }, { "epoch": 8.65, "grad_norm": 2.601727247238159, "learning_rate": 2.2664399163518786e-06, "loss": 2.0811, "step": 730 }, { "epoch": 8.77, "grad_norm": 2.4444971084594727, "learning_rate": 1.8847273160584378e-06, "loss": 2.0565, "step": 740 }, { "epoch": 8.89, "grad_norm": 3.528599977493286, "learning_rate": 1.5369395659873303e-06, "loss": 2.0009, "step": 750 }, { "epoch": 9.01, "grad_norm": 2.0438976287841797, "learning_rate": 1.2235870926211619e-06, "loss": 2.1047, "step": 760 }, { "epoch": 9.13, "grad_norm": 2.847148895263672, "learning_rate": 9.451297839253914e-07, "loss": 2.0975, "step": 770 }, { "epoch": 9.24, "grad_norm": 2.4994664192199707, "learning_rate": 7.019763143993441e-07, "loss": 2.0354, "step": 780 }, { "epoch": 9.36, "grad_norm": 2.7058956623077393, "learning_rate": 4.944835452900199e-07, "loss": 2.0424, "step": 790 }, { "epoch": 9.48, "grad_norm": 2.4226083755493164, "learning_rate": 3.229560008490007e-07, "loss": 2.1076, "step": 800 }, { "epoch": 9.6, "grad_norm": 2.2777678966522217, "learning_rate": 1.876454214011253e-07, "loss": 2.1511, "step": 810 }, { "epoch": 9.72, "grad_norm": 2.749253273010254, "learning_rate": 8.875039388084316e-08, "loss": 2.0381, "step": 820 }, { "epoch": 9.84, "grad_norm": 2.434145450592041, "learning_rate": 2.6416060378503527e-08, "loss": 2.1139, "step": 830 }, { "epoch": 9.96, "grad_norm": 2.8431334495544434, "learning_rate": 7.339051243254735e-10, "loss": 2.1265, "step": 840 }, { "epoch": 9.96, "step": 840, "total_flos": 1.0731730608083436e+18, "train_loss": 2.1995077201298305, "train_runtime": 13749.9632, "train_samples_per_second": 1.964, "train_steps_per_second": 0.061 } ], "logging_steps": 10, "max_steps": 840, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1000, "total_flos": 1.0731730608083436e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }