{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 954, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06, "learning_rate": 5.208333333333333e-08, "loss": 5.1406, "step": 10 }, { "epoch": 0.13, "learning_rate": 1.0416666666666667e-07, "loss": 4.8324, "step": 20 }, { "epoch": 0.19, "learning_rate": 1.5624999999999999e-07, "loss": 3.282, "step": 30 }, { "epoch": 0.25, "learning_rate": 2.0833333333333333e-07, "loss": 1.166, "step": 40 }, { "epoch": 0.31, "learning_rate": 2.604166666666667e-07, "loss": 0.3744, "step": 50 }, { "epoch": 0.38, "learning_rate": 3.1249999999999997e-07, "loss": 0.2785, "step": 60 }, { "epoch": 0.44, "learning_rate": 3.645833333333333e-07, "loss": 0.2278, "step": 70 }, { "epoch": 0.5, "learning_rate": 4.1666666666666667e-07, "loss": 0.1459, "step": 80 }, { "epoch": 0.57, "learning_rate": 4.6874999999999996e-07, "loss": 0.0876, "step": 90 }, { "epoch": 0.63, "learning_rate": 4.999731868769026e-07, "loss": 0.071, "step": 100 }, { "epoch": 0.69, "learning_rate": 4.996716052911017e-07, "loss": 0.0626, "step": 110 }, { "epoch": 0.75, "learning_rate": 4.990353313429303e-07, "loss": 0.0586, "step": 120 }, { "epoch": 0.82, "learning_rate": 4.980652179769217e-07, "loss": 0.055, "step": 130 }, { "epoch": 0.88, "learning_rate": 4.967625656594781e-07, "loss": 0.0547, "step": 140 }, { "epoch": 0.94, "learning_rate": 4.951291206355559e-07, "loss": 0.0526, "step": 150 }, { "epoch": 1.0, "eval_loss": 0.054711490869522095, "eval_runtime": 21.9924, "eval_samples_per_second": 9.367, "eval_steps_per_second": 0.409, "step": 159 }, { "epoch": 1.01, "learning_rate": 4.93167072587771e-07, "loss": 0.0468, "step": 160 }, { "epoch": 1.07, "learning_rate": 4.908790517010636e-07, "loss": 0.0447, "step": 170 }, { "epoch": 1.13, "learning_rate": 4.882681251368548e-07, "loss": 0.0447, "step": 180 }, { "epoch": 1.19, "learning_rate": 4.853377929214243e-07, "loss": 0.0498, "step": 190 }, { "epoch": 1.26, "learning_rate": 4.820919832540181e-07, "loss": 0.0468, "step": 200 }, { "epoch": 1.32, "learning_rate": 4.785350472409791e-07, "loss": 0.0458, "step": 210 }, { "epoch": 1.38, "learning_rate": 4.7467175306295647e-07, "loss": 0.046, "step": 220 }, { "epoch": 1.45, "learning_rate": 4.70507279583015e-07, "loss": 0.043, "step": 230 }, { "epoch": 1.51, "learning_rate": 4.6604720940421207e-07, "loss": 0.0431, "step": 240 }, { "epoch": 1.57, "learning_rate": 4.612975213859487e-07, "loss": 0.0428, "step": 250 }, { "epoch": 1.64, "learning_rate": 4.5626458262912735e-07, "loss": 0.0468, "step": 260 }, { "epoch": 1.7, "learning_rate": 4.5095513994085974e-07, "loss": 0.0418, "step": 270 }, { "epoch": 1.76, "learning_rate": 4.453763107901675e-07, "loss": 0.0437, "step": 280 }, { "epoch": 1.82, "learning_rate": 4.395355737667985e-07, "loss": 0.0429, "step": 290 }, { "epoch": 1.89, "learning_rate": 4.3344075855595097e-07, "loss": 0.0408, "step": 300 }, { "epoch": 1.95, "learning_rate": 4.271000354423425e-07, "loss": 0.0421, "step": 310 }, { "epoch": 2.0, "eval_loss": 0.04473445564508438, "eval_runtime": 22.0105, "eval_samples_per_second": 9.359, "eval_steps_per_second": 0.409, "step": 318 }, { "epoch": 2.01, "learning_rate": 4.2052190435769554e-07, "loss": 0.0355, "step": 320 }, { "epoch": 2.08, "learning_rate": 4.137151834863213e-07, "loss": 0.0348, "step": 330 }, { "epoch": 2.14, "learning_rate": 4.0668899744407567e-07, "loss": 0.0326, "step": 340 }, { "epoch": 2.2, "learning_rate": 3.994527650465352e-07, "loss": 0.0304, "step": 350 }, { "epoch": 2.26, "learning_rate": 3.920161866827889e-07, "loss": 0.0258, "step": 360 }, { "epoch": 2.33, "learning_rate": 3.8438923131177237e-07, "loss": 0.0317, "step": 370 }, { "epoch": 2.39, "learning_rate": 3.765821230985757e-07, "loss": 0.0326, "step": 380 }, { "epoch": 2.45, "learning_rate": 3.6860532770864005e-07, "loss": 0.0295, "step": 390 }, { "epoch": 2.52, "learning_rate": 3.604695382782159e-07, "loss": 0.0275, "step": 400 }, { "epoch": 2.58, "learning_rate": 3.5218566107988867e-07, "loss": 0.0344, "step": 410 }, { "epoch": 2.64, "learning_rate": 3.4376480090239047e-07, "loss": 0.0341, "step": 420 }, { "epoch": 2.7, "learning_rate": 3.3521824616429284e-07, "loss": 0.0273, "step": 430 }, { "epoch": 2.77, "learning_rate": 3.265574537815398e-07, "loss": 0.0314, "step": 440 }, { "epoch": 2.83, "learning_rate": 3.1779403380910425e-07, "loss": 0.0354, "step": 450 }, { "epoch": 2.89, "learning_rate": 3.0893973387735683e-07, "loss": 0.0336, "step": 460 }, { "epoch": 2.96, "learning_rate": 3.000064234440111e-07, "loss": 0.0285, "step": 470 }, { "epoch": 3.0, "eval_loss": 0.03846818581223488, "eval_runtime": 22.5077, "eval_samples_per_second": 9.152, "eval_steps_per_second": 0.4, "step": 477 }, { "epoch": 3.02, "learning_rate": 2.910060778827554e-07, "loss": 0.0259, "step": 480 }, { "epoch": 3.08, "learning_rate": 2.8195076242990116e-07, "loss": 0.0098, "step": 490 }, { "epoch": 3.14, "learning_rate": 2.7285261601056697e-07, "loss": 0.0153, "step": 500 }, { "epoch": 3.21, "learning_rate": 2.6372383496608186e-07, "loss": 0.0162, "step": 510 }, { "epoch": 3.27, "learning_rate": 2.5457665670441937e-07, "loss": 0.0155, "step": 520 }, { "epoch": 3.33, "learning_rate": 2.454233432955807e-07, "loss": 0.0185, "step": 530 }, { "epoch": 3.4, "learning_rate": 2.3627616503391812e-07, "loss": 0.0138, "step": 540 }, { "epoch": 3.46, "learning_rate": 2.2714738398943308e-07, "loss": 0.0136, "step": 550 }, { "epoch": 3.52, "learning_rate": 2.1804923757009882e-07, "loss": 0.016, "step": 560 }, { "epoch": 3.58, "learning_rate": 2.089939221172446e-07, "loss": 0.0155, "step": 570 }, { "epoch": 3.65, "learning_rate": 1.9999357655598891e-07, "loss": 0.009, "step": 580 }, { "epoch": 3.71, "learning_rate": 1.9106026612264315e-07, "loss": 0.0186, "step": 590 }, { "epoch": 3.77, "learning_rate": 1.8220596619089573e-07, "loss": 0.0109, "step": 600 }, { "epoch": 3.84, "learning_rate": 1.7344254621846017e-07, "loss": 0.016, "step": 610 }, { "epoch": 3.9, "learning_rate": 1.647817538357072e-07, "loss": 0.0111, "step": 620 }, { "epoch": 3.96, "learning_rate": 1.562351990976095e-07, "loss": 0.0165, "step": 630 }, { "epoch": 4.0, "eval_loss": 0.04646956920623779, "eval_runtime": 22.0005, "eval_samples_per_second": 9.363, "eval_steps_per_second": 0.409, "step": 636 }, { "epoch": 4.03, "learning_rate": 1.478143389201113e-07, "loss": 0.0104, "step": 640 }, { "epoch": 4.09, "learning_rate": 1.3953046172178413e-07, "loss": 0.0041, "step": 650 }, { "epoch": 4.15, "learning_rate": 1.3139467229135998e-07, "loss": 0.0041, "step": 660 }, { "epoch": 4.21, "learning_rate": 1.2341787690142435e-07, "loss": 0.0011, "step": 670 }, { "epoch": 4.28, "learning_rate": 1.1561076868822755e-07, "loss": 0.0028, "step": 680 }, { "epoch": 4.34, "learning_rate": 1.0798381331721107e-07, "loss": 0.0024, "step": 690 }, { "epoch": 4.4, "learning_rate": 1.0054723495346482e-07, "loss": 0.0021, "step": 700 }, { "epoch": 4.47, "learning_rate": 9.331100255592436e-08, "loss": 0.0038, "step": 710 }, { "epoch": 4.53, "learning_rate": 8.628481651367875e-08, "loss": 0.0068, "step": 720 }, { "epoch": 4.59, "learning_rate": 7.947809564230445e-08, "loss": 0.0018, "step": 730 }, { "epoch": 4.65, "learning_rate": 7.289996455765748e-08, "loss": 0.0035, "step": 740 }, { "epoch": 4.72, "learning_rate": 6.655924144404906e-08, "loss": 0.0032, "step": 750 }, { "epoch": 4.78, "learning_rate": 6.046442623320145e-08, "loss": 0.0048, "step": 760 }, { "epoch": 4.84, "learning_rate": 5.4623689209832484e-08, "loss": 0.0034, "step": 770 }, { "epoch": 4.91, "learning_rate": 4.904486005914027e-08, "loss": 0.0031, "step": 780 }, { "epoch": 4.97, "learning_rate": 4.373541737087263e-08, "loss": 0.0021, "step": 790 }, { "epoch": 5.0, "eval_loss": 0.06586528569459915, "eval_runtime": 22.2256, "eval_samples_per_second": 9.269, "eval_steps_per_second": 0.405, "step": 795 }, { "epoch": 5.03, "learning_rate": 3.8702478614051345e-08, "loss": 0.0037, "step": 800 }, { "epoch": 5.09, "learning_rate": 3.3952790595787986e-08, "loss": 0.0015, "step": 810 }, { "epoch": 5.16, "learning_rate": 2.9492720416985e-08, "loss": 0.0007, "step": 820 }, { "epoch": 5.22, "learning_rate": 2.5328246937043525e-08, "loss": 0.002, "step": 830 }, { "epoch": 5.28, "learning_rate": 2.1464952759020856e-08, "loss": 0.0005, "step": 840 }, { "epoch": 5.35, "learning_rate": 1.7908016745981856e-08, "loss": 0.0012, "step": 850 }, { "epoch": 5.41, "learning_rate": 1.4662207078575684e-08, "loss": 0.0005, "step": 860 }, { "epoch": 5.47, "learning_rate": 1.1731874863145142e-08, "loss": 0.0024, "step": 870 }, { "epoch": 5.53, "learning_rate": 9.12094829893642e-09, "loss": 0.0011, "step": 880 }, { "epoch": 5.6, "learning_rate": 6.832927412229017e-09, "loss": 0.0006, "step": 890 }, { "epoch": 5.66, "learning_rate": 4.8708793644441086e-09, "loss": 0.0023, "step": 900 }, { "epoch": 5.72, "learning_rate": 3.2374343405217884e-09, "loss": 0.0023, "step": 910 }, { "epoch": 5.79, "learning_rate": 1.9347820230782295e-09, "loss": 0.0005, "step": 920 }, { "epoch": 5.85, "learning_rate": 9.64668657069706e-10, "loss": 0.0023, "step": 930 }, { "epoch": 5.91, "learning_rate": 3.2839470889836627e-10, "loss": 0.0011, "step": 940 }, { "epoch": 5.97, "learning_rate": 2.6813123097352287e-11, "loss": 0.0008, "step": 950 }, { "epoch": 6.0, "eval_loss": 0.07076797634363174, "eval_runtime": 21.958, "eval_samples_per_second": 9.382, "eval_steps_per_second": 0.41, "step": 954 }, { "epoch": 6.0, "step": 954, "total_flos": 126473593159680.0, "train_loss": 0.18272512886366676, "train_runtime": 9319.8502, "train_samples_per_second": 2.443, "train_steps_per_second": 0.102 } ], "logging_steps": 10, "max_steps": 954, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 500, "total_flos": 126473593159680.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }