{ "best_metric": null, "best_model_checkpoint": null, "epoch": 25.778732545649838, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.34, "grad_norm": 0.6707318425178528, "learning_rate": 1.97816091954023e-05, "loss": 2.2888, "step": 20 }, { "epoch": 0.69, "grad_norm": NaN, "learning_rate": 1.9574712643678162e-05, "loss": 2.0848, "step": 40 }, { "epoch": 1.03, "grad_norm": 0.7721680402755737, "learning_rate": 1.9344827586206897e-05, "loss": 2.0044, "step": 60 }, { "epoch": 1.37, "grad_norm": 1.1140433549880981, "learning_rate": 1.9126436781609195e-05, "loss": 1.8016, "step": 80 }, { "epoch": 1.72, "grad_norm": 0.7205075621604919, "learning_rate": 1.8896551724137934e-05, "loss": 1.7217, "step": 100 }, { "epoch": 2.06, "grad_norm": 0.8933233618736267, "learning_rate": 1.866666666666667e-05, "loss": 1.5705, "step": 120 }, { "epoch": 2.41, "grad_norm": 0.7114273905754089, "learning_rate": 1.8436781609195404e-05, "loss": 1.4006, "step": 140 }, { "epoch": 2.75, "grad_norm": 0.7229479551315308, "learning_rate": 1.820689655172414e-05, "loss": 1.3137, "step": 160 }, { "epoch": 3.09, "grad_norm": 0.9370490908622742, "learning_rate": 1.7977011494252874e-05, "loss": 1.1898, "step": 180 }, { "epoch": 3.44, "grad_norm": 0.6051978468894958, "learning_rate": 1.774712643678161e-05, "loss": 1.1229, "step": 200 }, { "epoch": 3.78, "grad_norm": 0.6857028007507324, "learning_rate": 1.7517241379310347e-05, "loss": 1.051, "step": 220 }, { "epoch": 4.12, "grad_norm": 0.6715748310089111, "learning_rate": 1.7287356321839082e-05, "loss": 0.9894, "step": 240 }, { "epoch": 4.47, "grad_norm": 0.5918118953704834, "learning_rate": 1.7057471264367817e-05, "loss": 0.9687, "step": 260 }, { "epoch": 4.81, "grad_norm": 0.6621690392494202, "learning_rate": 1.6827586206896552e-05, "loss": 0.9199, "step": 280 }, { "epoch": 5.16, "grad_norm": 0.6697206497192383, "learning_rate": 1.659770114942529e-05, "loss": 0.9303, "step": 300 }, { "epoch": 5.5, "grad_norm": 0.8184316158294678, "learning_rate": 1.6367816091954025e-05, "loss": 0.8898, "step": 320 }, { "epoch": 5.84, "grad_norm": 0.6429987549781799, "learning_rate": 1.613793103448276e-05, "loss": 0.8623, "step": 340 }, { "epoch": 6.19, "grad_norm": 0.7518043518066406, "learning_rate": 1.5908045977011495e-05, "loss": 0.8239, "step": 360 }, { "epoch": 6.53, "grad_norm": 0.6667824983596802, "learning_rate": 1.567816091954023e-05, "loss": 0.8119, "step": 380 }, { "epoch": 6.87, "grad_norm": 0.8569457530975342, "learning_rate": 1.5448275862068965e-05, "loss": 0.8139, "step": 400 }, { "epoch": 7.22, "grad_norm": 0.7754850387573242, "learning_rate": 1.5218390804597702e-05, "loss": 0.7835, "step": 420 }, { "epoch": 7.56, "grad_norm": 1.159196138381958, "learning_rate": 1.4988505747126439e-05, "loss": 0.7546, "step": 440 }, { "epoch": 7.91, "grad_norm": 1.119764804840088, "learning_rate": 1.4758620689655174e-05, "loss": 0.7571, "step": 460 }, { "epoch": 8.25, "grad_norm": 1.3600786924362183, "learning_rate": 1.452873563218391e-05, "loss": 0.7451, "step": 480 }, { "epoch": 8.59, "grad_norm": 0.7608994245529175, "learning_rate": 1.4298850574712644e-05, "loss": 0.7109, "step": 500 }, { "epoch": 8.94, "grad_norm": 1.0172290802001953, "learning_rate": 1.406896551724138e-05, "loss": 0.7228, "step": 520 }, { "epoch": 9.28, "grad_norm": 1.042607069015503, "learning_rate": 1.3839080459770115e-05, "loss": 0.6939, "step": 540 }, { "epoch": 9.62, "grad_norm": 0.8913071751594543, "learning_rate": 1.3609195402298852e-05, "loss": 0.6721, "step": 560 }, { "epoch": 9.97, "grad_norm": 1.4283536672592163, "learning_rate": 1.3379310344827587e-05, "loss": 0.681, "step": 580 }, { "epoch": 10.31, "grad_norm": 1.1445728540420532, "learning_rate": 1.3149425287356324e-05, "loss": 0.6484, "step": 600 }, { "epoch": 10.66, "grad_norm": 1.425697684288025, "learning_rate": 1.2919540229885059e-05, "loss": 0.6558, "step": 620 }, { "epoch": 11.0, "grad_norm": 0.8931305408477783, "learning_rate": 1.2689655172413795e-05, "loss": 0.6642, "step": 640 }, { "epoch": 11.34, "grad_norm": 1.0374151468276978, "learning_rate": 1.2459770114942529e-05, "loss": 0.6202, "step": 660 }, { "epoch": 11.69, "grad_norm": 1.628758430480957, "learning_rate": 1.2229885057471265e-05, "loss": 0.6163, "step": 680 }, { "epoch": 12.03, "grad_norm": 1.3881452083587646, "learning_rate": 1.2e-05, "loss": 0.6364, "step": 700 }, { "epoch": 12.37, "grad_norm": 1.0961302518844604, "learning_rate": 1.1770114942528737e-05, "loss": 0.5963, "step": 720 }, { "epoch": 12.72, "grad_norm": 1.1812736988067627, "learning_rate": 1.1540229885057472e-05, "loss": 0.6102, "step": 740 }, { "epoch": 13.06, "grad_norm": 1.103151559829712, "learning_rate": 1.1310344827586209e-05, "loss": 0.5965, "step": 760 }, { "epoch": 13.4, "grad_norm": 1.108560562133789, "learning_rate": 1.1080459770114944e-05, "loss": 0.58, "step": 780 }, { "epoch": 13.75, "grad_norm": 1.322364091873169, "learning_rate": 1.085057471264368e-05, "loss": 0.5707, "step": 800 }, { "epoch": 14.09, "grad_norm": 1.2036404609680176, "learning_rate": 1.0620689655172414e-05, "loss": 0.5781, "step": 820 }, { "epoch": 14.44, "grad_norm": 1.46902596950531, "learning_rate": 1.039080459770115e-05, "loss": 0.5413, "step": 840 }, { "epoch": 14.78, "grad_norm": 0.9223589301109314, "learning_rate": 1.0160919540229885e-05, "loss": 0.5686, "step": 860 }, { "epoch": 15.12, "grad_norm": 1.7452529668807983, "learning_rate": 9.931034482758622e-06, "loss": 0.5538, "step": 880 }, { "epoch": 15.47, "grad_norm": 1.0680702924728394, "learning_rate": 9.701149425287357e-06, "loss": 0.5402, "step": 900 }, { "epoch": 15.81, "grad_norm": 1.4106242656707764, "learning_rate": 9.471264367816094e-06, "loss": 0.5629, "step": 920 }, { "epoch": 16.15, "grad_norm": 1.7341551780700684, "learning_rate": 9.241379310344829e-06, "loss": 0.5538, "step": 940 }, { "epoch": 16.5, "grad_norm": 2.115643262863159, "learning_rate": 9.011494252873564e-06, "loss": 0.5481, "step": 960 }, { "epoch": 16.84, "grad_norm": 1.1589787006378174, "learning_rate": 8.7816091954023e-06, "loss": 0.4981, "step": 980 }, { "epoch": 17.19, "grad_norm": 1.0696042776107788, "learning_rate": 8.551724137931035e-06, "loss": 0.5041, "step": 1000 }, { "epoch": 17.53, "grad_norm": 1.892269253730774, "learning_rate": 8.32183908045977e-06, "loss": 0.4956, "step": 1020 }, { "epoch": 17.87, "grad_norm": 2.214688301086426, "learning_rate": 8.091954022988507e-06, "loss": 0.5363, "step": 1040 }, { "epoch": 18.22, "grad_norm": 1.271893858909607, "learning_rate": 7.862068965517242e-06, "loss": 0.5195, "step": 1060 }, { "epoch": 18.56, "grad_norm": 0.9383485913276672, "learning_rate": 7.632183908045979e-06, "loss": 0.5002, "step": 1080 }, { "epoch": 18.9, "grad_norm": 1.749745488166809, "learning_rate": 7.402298850574713e-06, "loss": 0.4841, "step": 1100 }, { "epoch": 19.25, "grad_norm": 1.2551345825195312, "learning_rate": 7.172413793103449e-06, "loss": 0.5033, "step": 1120 }, { "epoch": 19.59, "grad_norm": 1.0362412929534912, "learning_rate": 6.9425287356321845e-06, "loss": 0.5138, "step": 1140 }, { "epoch": 19.94, "grad_norm": 1.483361840248108, "learning_rate": 6.71264367816092e-06, "loss": 0.4826, "step": 1160 }, { "epoch": 20.28, "grad_norm": 1.792438268661499, "learning_rate": 6.482758620689655e-06, "loss": 0.4544, "step": 1180 }, { "epoch": 20.62, "grad_norm": 1.1103723049163818, "learning_rate": 6.252873563218391e-06, "loss": 0.5097, "step": 1200 }, { "epoch": 20.97, "grad_norm": 1.3724958896636963, "learning_rate": 6.022988505747127e-06, "loss": 0.4827, "step": 1220 }, { "epoch": 21.31, "grad_norm": 1.9351897239685059, "learning_rate": 5.793103448275863e-06, "loss": 0.4772, "step": 1240 }, { "epoch": 21.65, "grad_norm": 1.3644013404846191, "learning_rate": 5.563218390804598e-06, "loss": 0.485, "step": 1260 }, { "epoch": 22.0, "grad_norm": 1.4544987678527832, "learning_rate": 5.333333333333334e-06, "loss": 0.468, "step": 1280 }, { "epoch": 22.34, "grad_norm": 1.7704741954803467, "learning_rate": 5.1034482758620695e-06, "loss": 0.4653, "step": 1300 }, { "epoch": 22.69, "grad_norm": 1.2633339166641235, "learning_rate": 4.873563218390805e-06, "loss": 0.477, "step": 1320 }, { "epoch": 23.03, "grad_norm": 1.43886399269104, "learning_rate": 4.643678160919541e-06, "loss": 0.4724, "step": 1340 }, { "epoch": 23.37, "grad_norm": 1.1346815824508667, "learning_rate": 4.413793103448276e-06, "loss": 0.4712, "step": 1360 }, { "epoch": 23.72, "grad_norm": 1.289389967918396, "learning_rate": 4.183908045977012e-06, "loss": 0.4661, "step": 1380 }, { "epoch": 24.06, "grad_norm": 1.477211833000183, "learning_rate": 3.954022988505747e-06, "loss": 0.4443, "step": 1400 }, { "epoch": 24.4, "grad_norm": 2.224083185195923, "learning_rate": 3.7241379310344832e-06, "loss": 0.4445, "step": 1420 }, { "epoch": 24.75, "grad_norm": 1.6573207378387451, "learning_rate": 3.4942528735632187e-06, "loss": 0.4756, "step": 1440 }, { "epoch": 25.09, "grad_norm": 2.269866466522217, "learning_rate": 3.2643678160919545e-06, "loss": 0.4535, "step": 1460 }, { "epoch": 25.44, "grad_norm": 1.2528423070907593, "learning_rate": 3.03448275862069e-06, "loss": 0.4881, "step": 1480 }, { "epoch": 25.78, "grad_norm": 1.2892448902130127, "learning_rate": 2.8045977011494257e-06, "loss": 0.4226, "step": 1500 } ], "logging_steps": 20, "max_steps": 1740, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 500, "total_flos": 3.8986916806656e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }