{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.113590263691684, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "grad_norm": 16.835479736328125, "learning_rate": 0.0002, "loss": 3.766, "step": 10 }, { "epoch": 0.16, "grad_norm": 15.441985130310059, "learning_rate": 0.0002, "loss": 1.4106, "step": 20 }, { "epoch": 0.24, "grad_norm": 13.51206111907959, "learning_rate": 0.0002, "loss": 1.1355, "step": 30 }, { "epoch": 0.32, "grad_norm": 14.920987129211426, "learning_rate": 0.0002, "loss": 0.9941, "step": 40 }, { "epoch": 0.41, "grad_norm": 8.289106369018555, "learning_rate": 0.0002, "loss": 0.8887, "step": 50 }, { "epoch": 0.49, "grad_norm": 8.003510475158691, "learning_rate": 0.0002, "loss": 0.8275, "step": 60 }, { "epoch": 0.57, "grad_norm": 7.877600193023682, "learning_rate": 0.0002, "loss": 0.798, "step": 70 }, { "epoch": 0.65, "grad_norm": 2.8833744525909424, "learning_rate": 0.0002, "loss": 0.8464, "step": 80 }, { "epoch": 0.73, "grad_norm": 8.206995964050293, "learning_rate": 0.0002, "loss": 0.7849, "step": 90 }, { "epoch": 0.81, "grad_norm": 7.678927421569824, "learning_rate": 0.0002, "loss": 0.7184, "step": 100 }, { "epoch": 0.89, "grad_norm": 7.089763164520264, "learning_rate": 0.0002, "loss": 0.7099, "step": 110 }, { "epoch": 0.97, "grad_norm": 5.643054485321045, "learning_rate": 0.0002, "loss": 0.6989, "step": 120 }, { "epoch": 1.05, "grad_norm": 7.985189914703369, "learning_rate": 0.0002, "loss": 0.6394, "step": 130 }, { "epoch": 1.14, "grad_norm": 7.676026344299316, "learning_rate": 0.0002, "loss": 0.5882, "step": 140 }, { "epoch": 1.22, "grad_norm": 8.714900016784668, "learning_rate": 0.0002, "loss": 0.5707, "step": 150 }, { "epoch": 1.3, "grad_norm": 5.221066474914551, "learning_rate": 0.0002, "loss": 0.5345, "step": 160 }, { "epoch": 1.38, "grad_norm": 9.008663177490234, "learning_rate": 0.0002, "loss": 0.5568, "step": 170 }, { "epoch": 1.46, "grad_norm": 6.81847620010376, "learning_rate": 0.0002, "loss": 0.5249, "step": 180 }, { "epoch": 1.54, "grad_norm": 0.8458152413368225, "learning_rate": 0.0002, "loss": 0.491, "step": 190 }, { "epoch": 1.62, "grad_norm": 7.401742935180664, "learning_rate": 0.0002, "loss": 0.514, "step": 200 }, { "epoch": 1.7, "grad_norm": 7.641871452331543, "learning_rate": 0.0002, "loss": 0.5557, "step": 210 }, { "epoch": 1.78, "grad_norm": 5.442512512207031, "learning_rate": 0.0002, "loss": 0.5314, "step": 220 }, { "epoch": 1.87, "grad_norm": 8.104277610778809, "learning_rate": 0.0002, "loss": 0.4907, "step": 230 }, { "epoch": 1.95, "grad_norm": 6.416436195373535, "learning_rate": 0.0002, "loss": 0.5092, "step": 240 }, { "epoch": 2.03, "grad_norm": 3.008178949356079, "learning_rate": 0.0002, "loss": 0.4633, "step": 250 }, { "epoch": 2.11, "grad_norm": 9.349288940429688, "learning_rate": 0.0002, "loss": 0.378, "step": 260 }, { "epoch": 2.19, "grad_norm": 6.459526062011719, "learning_rate": 0.0002, "loss": 0.3765, "step": 270 }, { "epoch": 2.27, "grad_norm": 7.634103775024414, "learning_rate": 0.0002, "loss": 0.3819, "step": 280 }, { "epoch": 2.35, "grad_norm": 9.084547996520996, "learning_rate": 0.0002, "loss": 0.3703, "step": 290 }, { "epoch": 2.43, "grad_norm": 2.535707473754883, "learning_rate": 0.0002, "loss": 0.3501, "step": 300 }, { "epoch": 2.52, "grad_norm": 7.023160457611084, "learning_rate": 0.0002, "loss": 0.3491, "step": 310 }, { "epoch": 2.6, "grad_norm": 9.15241527557373, "learning_rate": 0.0002, "loss": 0.3592, "step": 320 }, { "epoch": 2.68, "grad_norm": 3.0726027488708496, "learning_rate": 0.0002, "loss": 0.3641, "step": 330 }, { "epoch": 2.76, "grad_norm": 5.8857879638671875, "learning_rate": 0.0002, "loss": 0.3553, "step": 340 }, { "epoch": 2.84, "grad_norm": 8.316734313964844, "learning_rate": 0.0002, "loss": 0.36, "step": 350 }, { "epoch": 2.92, "grad_norm": 3.8568618297576904, "learning_rate": 0.0002, "loss": 0.3607, "step": 360 }, { "epoch": 3.0, "grad_norm": 6.664178848266602, "learning_rate": 0.0002, "loss": 0.3716, "step": 370 }, { "epoch": 3.08, "grad_norm": 9.07357406616211, "learning_rate": 0.0002, "loss": 0.242, "step": 380 }, { "epoch": 3.16, "grad_norm": 4.3498921394348145, "learning_rate": 0.0002, "loss": 0.2588, "step": 390 }, { "epoch": 3.25, "grad_norm": 6.539889812469482, "learning_rate": 0.0002, "loss": 0.2523, "step": 400 }, { "epoch": 3.33, "grad_norm": 8.80722427368164, "learning_rate": 0.0002, "loss": 0.2388, "step": 410 }, { "epoch": 3.41, "grad_norm": 5.052772521972656, "learning_rate": 0.0002, "loss": 0.2524, "step": 420 }, { "epoch": 3.49, "grad_norm": 5.22416877746582, "learning_rate": 0.0002, "loss": 0.2452, "step": 430 }, { "epoch": 3.57, "grad_norm": 8.989995956420898, "learning_rate": 0.0002, "loss": 0.2558, "step": 440 }, { "epoch": 3.65, "grad_norm": 2.628777027130127, "learning_rate": 0.0002, "loss": 0.2449, "step": 450 }, { "epoch": 3.73, "grad_norm": 5.890882968902588, "learning_rate": 0.0002, "loss": 0.2591, "step": 460 }, { "epoch": 3.81, "grad_norm": 8.577589988708496, "learning_rate": 0.0002, "loss": 0.2557, "step": 470 }, { "epoch": 3.89, "grad_norm": 5.092458724975586, "learning_rate": 0.0002, "loss": 0.2603, "step": 480 }, { "epoch": 3.98, "grad_norm": 5.97870397567749, "learning_rate": 0.0002, "loss": 0.2676, "step": 490 }, { "epoch": 4.06, "grad_norm": 8.259415626525879, "learning_rate": 0.0002, "loss": 0.2051, "step": 500 }, { "epoch": 4.14, "grad_norm": 7.2230072021484375, "learning_rate": 0.0002, "loss": 0.1467, "step": 510 }, { "epoch": 4.22, "grad_norm": 6.01918888092041, "learning_rate": 0.0002, "loss": 0.1713, "step": 520 }, { "epoch": 4.3, "grad_norm": 8.41965103149414, "learning_rate": 0.0002, "loss": 0.1803, "step": 530 }, { "epoch": 4.38, "grad_norm": 4.342984676361084, "learning_rate": 0.0002, "loss": 0.1769, "step": 540 }, { "epoch": 4.46, "grad_norm": 6.167740821838379, "learning_rate": 0.0002, "loss": 0.1763, "step": 550 }, { "epoch": 4.54, "grad_norm": 8.30461597442627, "learning_rate": 0.0002, "loss": 0.1716, "step": 560 }, { "epoch": 4.62, "grad_norm": 3.77854323387146, "learning_rate": 0.0002, "loss": 0.1679, "step": 570 }, { "epoch": 4.71, "grad_norm": 5.807821273803711, "learning_rate": 0.0002, "loss": 0.1775, "step": 580 }, { "epoch": 4.79, "grad_norm": 7.620412349700928, "learning_rate": 0.0002, "loss": 0.1769, "step": 590 }, { "epoch": 4.87, "grad_norm": 6.027496814727783, "learning_rate": 0.0002, "loss": 0.1828, "step": 600 }, { "epoch": 4.95, "grad_norm": 5.835544109344482, "learning_rate": 0.0002, "loss": 0.1783, "step": 610 }, { "epoch": 5.03, "grad_norm": 7.455280780792236, "learning_rate": 0.0002, "loss": 0.1545, "step": 620 }, { "epoch": 5.11, "grad_norm": 3.6531035900115967, "learning_rate": 0.0002, "loss": 0.1213, "step": 630 }, { "epoch": 5.19, "grad_norm": 6.005002498626709, "learning_rate": 0.0002, "loss": 0.1211, "step": 640 }, { "epoch": 5.27, "grad_norm": 8.0129976272583, "learning_rate": 0.0002, "loss": 0.1268, "step": 650 }, { "epoch": 5.35, "grad_norm": 4.5181169509887695, "learning_rate": 0.0002, "loss": 0.1215, "step": 660 }, { "epoch": 5.44, "grad_norm": 5.109263896942139, "learning_rate": 0.0002, "loss": 0.1264, "step": 670 }, { "epoch": 5.52, "grad_norm": 7.003955364227295, "learning_rate": 0.0002, "loss": 0.1332, "step": 680 }, { "epoch": 5.6, "grad_norm": 4.416803359985352, "learning_rate": 0.0002, "loss": 0.1333, "step": 690 }, { "epoch": 5.68, "grad_norm": 4.775448322296143, "learning_rate": 0.0002, "loss": 0.1384, "step": 700 }, { "epoch": 5.76, "grad_norm": 6.8711018562316895, "learning_rate": 0.0002, "loss": 0.1388, "step": 710 }, { "epoch": 5.84, "grad_norm": 4.3299760818481445, "learning_rate": 0.0002, "loss": 0.1392, "step": 720 }, { "epoch": 5.92, "grad_norm": 4.7512006759643555, "learning_rate": 0.0002, "loss": 0.1477, "step": 730 }, { "epoch": 6.0, "grad_norm": 6.701217174530029, "learning_rate": 0.0002, "loss": 0.1401, "step": 740 }, { "epoch": 6.09, "grad_norm": 4.163561820983887, "learning_rate": 0.0002, "loss": 0.099, "step": 750 }, { "epoch": 6.17, "grad_norm": 4.784251689910889, "learning_rate": 0.0002, "loss": 0.1025, "step": 760 }, { "epoch": 6.25, "grad_norm": 7.253482341766357, "learning_rate": 0.0002, "loss": 0.1019, "step": 770 }, { "epoch": 6.33, "grad_norm": 4.389810085296631, "learning_rate": 0.0002, "loss": 0.1052, "step": 780 }, { "epoch": 6.41, "grad_norm": 4.857180595397949, "learning_rate": 0.0002, "loss": 0.1065, "step": 790 }, { "epoch": 6.49, "grad_norm": 6.876203536987305, "learning_rate": 0.0002, "loss": 0.1092, "step": 800 }, { "epoch": 6.57, "grad_norm": 4.4614715576171875, "learning_rate": 0.0002, "loss": 0.112, "step": 810 }, { "epoch": 6.65, "grad_norm": 5.019026756286621, "learning_rate": 0.0002, "loss": 0.1127, "step": 820 }, { "epoch": 6.73, "grad_norm": 6.864689350128174, "learning_rate": 0.0002, "loss": 0.1131, "step": 830 }, { "epoch": 6.82, "grad_norm": 4.057292938232422, "learning_rate": 0.0002, "loss": 0.113, "step": 840 }, { "epoch": 6.9, "grad_norm": 4.780605792999268, "learning_rate": 0.0002, "loss": 0.1122, "step": 850 }, { "epoch": 6.98, "grad_norm": 6.377821445465088, "learning_rate": 0.0002, "loss": 0.1153, "step": 860 }, { "epoch": 7.06, "grad_norm": 3.6482489109039307, "learning_rate": 0.0002, "loss": 0.0962, "step": 870 }, { "epoch": 7.14, "grad_norm": 4.848484516143799, "learning_rate": 0.0002, "loss": 0.0892, "step": 880 }, { "epoch": 7.22, "grad_norm": 6.297725677490234, "learning_rate": 0.0002, "loss": 0.0913, "step": 890 }, { "epoch": 7.3, "grad_norm": 3.4698007106781006, "learning_rate": 0.0002, "loss": 0.0927, "step": 900 }, { "epoch": 7.38, "grad_norm": 4.614325523376465, "learning_rate": 0.0002, "loss": 0.0933, "step": 910 }, { "epoch": 7.46, "grad_norm": 6.8649163246154785, "learning_rate": 0.0002, "loss": 0.0951, "step": 920 }, { "epoch": 7.55, "grad_norm": 3.5742478370666504, "learning_rate": 0.0002, "loss": 0.0975, "step": 930 }, { "epoch": 7.63, "grad_norm": 4.66102409362793, "learning_rate": 0.0002, "loss": 0.0971, "step": 940 }, { "epoch": 7.71, "grad_norm": 6.615488052368164, "learning_rate": 0.0002, "loss": 0.0992, "step": 950 }, { "epoch": 7.79, "grad_norm": 4.218813419342041, "learning_rate": 0.0002, "loss": 0.1021, "step": 960 }, { "epoch": 7.87, "grad_norm": 4.6225905418396, "learning_rate": 0.0002, "loss": 0.1012, "step": 970 }, { "epoch": 7.95, "grad_norm": 6.005787372589111, "learning_rate": 0.0002, "loss": 0.1059, "step": 980 }, { "epoch": 8.03, "grad_norm": 3.2653019428253174, "learning_rate": 0.0002, "loss": 0.0929, "step": 990 }, { "epoch": 8.11, "grad_norm": 5.000682353973389, "learning_rate": 0.0002, "loss": 0.0787, "step": 1000 } ], "logging_steps": 10, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 9, "save_steps": 10, "total_flos": 1.9346683868368896e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }