{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 10580, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1890359168241966, "grad_norm": 0.2537194490432739, "learning_rate": 5e-05, "loss": 1.9783, "step": 100 }, { "epoch": 0.3780718336483932, "grad_norm": 0.3122023940086365, "learning_rate": 5e-05, "loss": 1.8915, "step": 200 }, { "epoch": 0.5671077504725898, "grad_norm": 0.2867104113101959, "learning_rate": 5e-05, "loss": 1.8649, "step": 300 }, { "epoch": 0.7561436672967864, "grad_norm": 0.33248788118362427, "learning_rate": 5e-05, "loss": 1.8688, "step": 400 }, { "epoch": 0.945179584120983, "grad_norm": 0.3251153230667114, "learning_rate": 5e-05, "loss": 1.8478, "step": 500 }, { "epoch": 1.0, "eval_accuracy": 0.6079730941704036, "eval_loss": 1.6698505878448486, "eval_runtime": 5.9262, "eval_samples_per_second": 84.371, "eval_steps_per_second": 10.631, "step": 529 }, { "epoch": 1.1342155009451795, "grad_norm": 0.421694278717041, "learning_rate": 5e-05, "loss": 1.8373, "step": 600 }, { "epoch": 1.3232514177693762, "grad_norm": 0.5188952088356018, "learning_rate": 5e-05, "loss": 1.7876, "step": 700 }, { "epoch": 1.5122873345935728, "grad_norm": 0.5327262282371521, "learning_rate": 5e-05, "loss": 1.7863, "step": 800 }, { "epoch": 1.7013232514177694, "grad_norm": 0.5316148400306702, "learning_rate": 5e-05, "loss": 1.7781, "step": 900 }, { "epoch": 1.8903591682419658, "grad_norm": 0.5517605543136597, "learning_rate": 5e-05, "loss": 1.7862, "step": 1000 }, { "epoch": 2.0, "eval_accuracy": 0.6164035874439462, "eval_loss": 1.600331425666809, "eval_runtime": 6.1626, "eval_samples_per_second": 81.135, "eval_steps_per_second": 10.223, "step": 1058 }, { "epoch": 2.0793950850661624, "grad_norm": 0.584515392780304, "learning_rate": 5e-05, "loss": 1.7382, "step": 1100 }, { "epoch": 2.268431001890359, "grad_norm": 0.7216724157333374, "learning_rate": 5e-05, "loss": 1.6959, "step": 1200 }, { "epoch": 2.4574669187145557, "grad_norm": 0.7533963918685913, "learning_rate": 5e-05, "loss": 1.6826, "step": 1300 }, { "epoch": 2.6465028355387523, "grad_norm": 0.766959011554718, "learning_rate": 5e-05, "loss": 1.6754, "step": 1400 }, { "epoch": 2.835538752362949, "grad_norm": 0.7920446395874023, "learning_rate": 5e-05, "loss": 1.6531, "step": 1500 }, { "epoch": 3.0, "eval_accuracy": 0.6250941704035874, "eval_loss": 1.5362746715545654, "eval_runtime": 6.1771, "eval_samples_per_second": 80.945, "eval_steps_per_second": 10.199, "step": 1587 }, { "epoch": 3.0245746691871456, "grad_norm": 0.7897095084190369, "learning_rate": 5e-05, "loss": 1.634, "step": 1600 }, { "epoch": 3.213610586011342, "grad_norm": 0.8669045567512512, "learning_rate": 5e-05, "loss": 1.5462, "step": 1700 }, { "epoch": 3.402646502835539, "grad_norm": 0.9692543745040894, "learning_rate": 5e-05, "loss": 1.5512, "step": 1800 }, { "epoch": 3.5916824196597354, "grad_norm": 1.0659756660461426, "learning_rate": 5e-05, "loss": 1.5494, "step": 1900 }, { "epoch": 3.780718336483932, "grad_norm": 0.9194815158843994, "learning_rate": 5e-05, "loss": 1.5615, "step": 2000 }, { "epoch": 3.9697542533081287, "grad_norm": 0.9011955857276917, "learning_rate": 5e-05, "loss": 1.5515, "step": 2100 }, { "epoch": 4.0, "eval_accuracy": 0.6343408071748879, "eval_loss": 1.460845708847046, "eval_runtime": 7.046, "eval_samples_per_second": 70.962, "eval_steps_per_second": 8.941, "step": 2116 }, { "epoch": 4.158790170132325, "grad_norm": 
1.1321868896484375, "learning_rate": 5e-05, "loss": 1.4492, "step": 2200 }, { "epoch": 4.3478260869565215, "grad_norm": 1.1087547540664673, "learning_rate": 5e-05, "loss": 1.4271, "step": 2300 }, { "epoch": 4.536862003780718, "grad_norm": 1.0516396760940552, "learning_rate": 5e-05, "loss": 1.412, "step": 2400 }, { "epoch": 4.725897920604915, "grad_norm": 1.1536446809768677, "learning_rate": 5e-05, "loss": 1.4104, "step": 2500 }, { "epoch": 4.914933837429111, "grad_norm": 1.0451328754425049, "learning_rate": 5e-05, "loss": 1.4038, "step": 2600 }, { "epoch": 5.0, "eval_accuracy": 0.6456233183856502, "eval_loss": 1.3876169919967651, "eval_runtime": 6.1935, "eval_samples_per_second": 80.73, "eval_steps_per_second": 10.172, "step": 2645 }, { "epoch": 5.103969754253308, "grad_norm": 1.1491235494613647, "learning_rate": 5e-05, "loss": 1.3354, "step": 2700 }, { "epoch": 5.293005671077505, "grad_norm": 1.222063422203064, "learning_rate": 5e-05, "loss": 1.2777, "step": 2800 }, { "epoch": 5.482041587901701, "grad_norm": 1.2400953769683838, "learning_rate": 5e-05, "loss": 1.2798, "step": 2900 }, { "epoch": 5.671077504725898, "grad_norm": 1.3202099800109863, "learning_rate": 5e-05, "loss": 1.2817, "step": 3000 }, { "epoch": 5.8601134215500945, "grad_norm": 1.4666662216186523, "learning_rate": 5e-05, "loss": 1.2751, "step": 3100 }, { "epoch": 6.0, "eval_accuracy": 0.6552914798206279, "eval_loss": 1.3185516595840454, "eval_runtime": 6.5131, "eval_samples_per_second": 76.768, "eval_steps_per_second": 9.673, "step": 3174 }, { "epoch": 6.049149338374291, "grad_norm": 1.4949547052383423, "learning_rate": 5e-05, "loss": 1.2454, "step": 3200 }, { "epoch": 6.238185255198488, "grad_norm": 1.3395828008651733, "learning_rate": 5e-05, "loss": 1.1458, "step": 3300 }, { "epoch": 6.427221172022684, "grad_norm": 1.3782073259353638, "learning_rate": 5e-05, "loss": 1.1494, "step": 3400 }, { "epoch": 6.616257088846881, "grad_norm": 1.5405601263046265, "learning_rate": 5e-05, "loss": 1.1666, "step": 3500 }, { "epoch": 6.805293005671078, "grad_norm": 1.455475926399231, "learning_rate": 5e-05, "loss": 1.1447, "step": 3600 }, { "epoch": 6.994328922495274, "grad_norm": 1.4599090814590454, "learning_rate": 5e-05, "loss": 1.1475, "step": 3700 }, { "epoch": 7.0, "eval_accuracy": 0.6637130044843049, "eval_loss": 1.2514457702636719, "eval_runtime": 6.3374, "eval_samples_per_second": 78.896, "eval_steps_per_second": 9.941, "step": 3703 }, { "epoch": 7.183364839319471, "grad_norm": 1.7142447233200073, "learning_rate": 5e-05, "loss": 1.0271, "step": 3800 }, { "epoch": 7.3724007561436675, "grad_norm": 1.6401056051254272, "learning_rate": 5e-05, "loss": 1.0335, "step": 3900 }, { "epoch": 7.561436672967864, "grad_norm": 1.576233148574829, "learning_rate": 5e-05, "loss": 1.0275, "step": 4000 }, { "epoch": 7.750472589792061, "grad_norm": 1.6928788423538208, "learning_rate": 5e-05, "loss": 1.0088, "step": 4100 }, { "epoch": 7.939508506616257, "grad_norm": 1.960358738899231, "learning_rate": 5e-05, "loss": 1.0282, "step": 4200 }, { "epoch": 8.0, "eval_accuracy": 0.676, "eval_loss": 1.1740168333053589, "eval_runtime": 5.8382, "eval_samples_per_second": 85.643, "eval_steps_per_second": 10.791, "step": 4232 }, { "epoch": 8.128544423440454, "grad_norm": 1.8560665845870972, "learning_rate": 5e-05, "loss": 0.9459, "step": 4300 }, { "epoch": 8.31758034026465, "grad_norm": 1.623260498046875, "learning_rate": 5e-05, "loss": 0.8942, "step": 4400 }, { "epoch": 8.506616257088847, "grad_norm": 1.7087422609329224, "learning_rate": 5e-05, "loss": 0.9248, 
"step": 4500 }, { "epoch": 8.695652173913043, "grad_norm": 1.7795131206512451, "learning_rate": 5e-05, "loss": 0.9064, "step": 4600 }, { "epoch": 8.88468809073724, "grad_norm": 2.175858497619629, "learning_rate": 5e-05, "loss": 0.9067, "step": 4700 }, { "epoch": 9.0, "eval_accuracy": 0.6869865470852018, "eval_loss": 1.1004419326782227, "eval_runtime": 6.7202, "eval_samples_per_second": 74.402, "eval_steps_per_second": 9.375, "step": 4761 }, { "epoch": 9.073724007561436, "grad_norm": 1.9058854579925537, "learning_rate": 5e-05, "loss": 0.8774, "step": 4800 }, { "epoch": 9.262759924385634, "grad_norm": 1.9498395919799805, "learning_rate": 5e-05, "loss": 0.7785, "step": 4900 }, { "epoch": 9.45179584120983, "grad_norm": 2.0595734119415283, "learning_rate": 5e-05, "loss": 0.7948, "step": 5000 }, { "epoch": 9.640831758034027, "grad_norm": 2.126279592514038, "learning_rate": 5e-05, "loss": 0.8019, "step": 5100 }, { "epoch": 9.829867674858223, "grad_norm": 2.0013153553009033, "learning_rate": 5e-05, "loss": 0.8202, "step": 5200 }, { "epoch": 10.0, "eval_accuracy": 0.6964484304932735, "eval_loss": 1.040834665298462, "eval_runtime": 6.8934, "eval_samples_per_second": 72.533, "eval_steps_per_second": 9.139, "step": 5290 }, { "epoch": 10.01890359168242, "grad_norm": 1.985369324684143, "learning_rate": 5e-05, "loss": 0.7969, "step": 5300 }, { "epoch": 10.207939508506616, "grad_norm": 2.0791215896606445, "learning_rate": 5e-05, "loss": 0.7051, "step": 5400 }, { "epoch": 10.396975425330814, "grad_norm": 1.9755679368972778, "learning_rate": 5e-05, "loss": 0.6925, "step": 5500 }, { "epoch": 10.58601134215501, "grad_norm": 2.2225089073181152, "learning_rate": 5e-05, "loss": 0.7082, "step": 5600 }, { "epoch": 10.775047258979207, "grad_norm": 2.0549418926239014, "learning_rate": 5e-05, "loss": 0.7139, "step": 5700 }, { "epoch": 10.964083175803403, "grad_norm": 2.1886773109436035, "learning_rate": 5e-05, "loss": 0.7007, "step": 5800 }, { "epoch": 11.0, "eval_accuracy": 0.7083677130044843, "eval_loss": 0.9591866135597229, "eval_runtime": 6.2581, "eval_samples_per_second": 79.896, "eval_steps_per_second": 10.067, "step": 5819 }, { "epoch": 11.1531190926276, "grad_norm": 2.0527970790863037, "learning_rate": 5e-05, "loss": 0.6231, "step": 5900 }, { "epoch": 11.342155009451796, "grad_norm": 2.2655019760131836, "learning_rate": 5e-05, "loss": 0.6101, "step": 6000 }, { "epoch": 11.531190926275993, "grad_norm": 2.3228235244750977, "learning_rate": 5e-05, "loss": 0.6231, "step": 6100 }, { "epoch": 11.720226843100189, "grad_norm": 2.1348612308502197, "learning_rate": 5e-05, "loss": 0.6226, "step": 6200 }, { "epoch": 11.909262759924385, "grad_norm": 2.008643865585327, "learning_rate": 5e-05, "loss": 0.6259, "step": 6300 }, { "epoch": 12.0, "eval_accuracy": 0.7190941704035875, "eval_loss": 0.8998307585716248, "eval_runtime": 6.5934, "eval_samples_per_second": 75.834, "eval_steps_per_second": 9.555, "step": 6348 }, { "epoch": 12.098298676748582, "grad_norm": 1.998384714126587, "learning_rate": 5e-05, "loss": 0.5586, "step": 6400 }, { "epoch": 12.287334593572778, "grad_norm": 2.522263526916504, "learning_rate": 5e-05, "loss": 0.5371, "step": 6500 }, { "epoch": 12.476370510396976, "grad_norm": 2.5135159492492676, "learning_rate": 5e-05, "loss": 0.5468, "step": 6600 }, { "epoch": 12.665406427221171, "grad_norm": 2.0871999263763428, "learning_rate": 5e-05, "loss": 0.5516, "step": 6700 }, { "epoch": 12.854442344045369, "grad_norm": 2.2135109901428223, "learning_rate": 5e-05, "loss": 0.553, "step": 6800 }, { "epoch": 13.0, 
"eval_accuracy": 0.7294708520179373, "eval_loss": 0.8331915140151978, "eval_runtime": 6.5807, "eval_samples_per_second": 75.98, "eval_steps_per_second": 9.573, "step": 6877 }, { "epoch": 13.043478260869565, "grad_norm": 2.2245984077453613, "learning_rate": 5e-05, "loss": 0.5368, "step": 6900 }, { "epoch": 13.232514177693762, "grad_norm": 1.9655486345291138, "learning_rate": 5e-05, "loss": 0.4653, "step": 7000 }, { "epoch": 13.421550094517958, "grad_norm": 2.403517007827759, "learning_rate": 5e-05, "loss": 0.4686, "step": 7100 }, { "epoch": 13.610586011342155, "grad_norm": 2.637559652328491, "learning_rate": 5e-05, "loss": 0.4875, "step": 7200 }, { "epoch": 13.799621928166351, "grad_norm": 2.706881523132324, "learning_rate": 5e-05, "loss": 0.4882, "step": 7300 }, { "epoch": 13.988657844990549, "grad_norm": 2.1948862075805664, "learning_rate": 5e-05, "loss": 0.4948, "step": 7400 }, { "epoch": 14.0, "eval_accuracy": 0.7387443946188341, "eval_loss": 0.7799230217933655, "eval_runtime": 6.562, "eval_samples_per_second": 76.196, "eval_steps_per_second": 9.601, "step": 7406 }, { "epoch": 14.177693761814744, "grad_norm": 2.3989648818969727, "learning_rate": 5e-05, "loss": 0.4209, "step": 7500 }, { "epoch": 14.366729678638942, "grad_norm": 2.2252283096313477, "learning_rate": 5e-05, "loss": 0.4168, "step": 7600 }, { "epoch": 14.555765595463138, "grad_norm": 2.0487372875213623, "learning_rate": 5e-05, "loss": 0.4262, "step": 7700 }, { "epoch": 14.744801512287335, "grad_norm": 2.0398826599121094, "learning_rate": 5e-05, "loss": 0.4411, "step": 7800 }, { "epoch": 14.93383742911153, "grad_norm": 2.3320512771606445, "learning_rate": 5e-05, "loss": 0.4221, "step": 7900 }, { "epoch": 15.0, "eval_accuracy": 0.7465650224215247, "eval_loss": 0.7330228686332703, "eval_runtime": 6.2463, "eval_samples_per_second": 80.048, "eval_steps_per_second": 10.086, "step": 7935 }, { "epoch": 15.122873345935728, "grad_norm": 2.58713698387146, "learning_rate": 5e-05, "loss": 0.3905, "step": 8000 }, { "epoch": 15.311909262759924, "grad_norm": 2.578805685043335, "learning_rate": 5e-05, "loss": 0.3697, "step": 8100 }, { "epoch": 15.500945179584122, "grad_norm": 2.377256155014038, "learning_rate": 5e-05, "loss": 0.3753, "step": 8200 }, { "epoch": 15.689981096408317, "grad_norm": 2.177788734436035, "learning_rate": 5e-05, "loss": 0.3808, "step": 8300 }, { "epoch": 15.879017013232515, "grad_norm": 2.4307467937469482, "learning_rate": 5e-05, "loss": 0.3911, "step": 8400 }, { "epoch": 16.0, "eval_accuracy": 0.7551031390134529, "eval_loss": 0.6804938912391663, "eval_runtime": 6.6537, "eval_samples_per_second": 75.146, "eval_steps_per_second": 9.468, "step": 8464 }, { "epoch": 16.068052930056712, "grad_norm": 2.087111711502075, "learning_rate": 5e-05, "loss": 0.3663, "step": 8500 }, { "epoch": 16.257088846880908, "grad_norm": 2.232668876647949, "learning_rate": 5e-05, "loss": 0.3279, "step": 8600 }, { "epoch": 16.446124763705104, "grad_norm": 2.0968847274780273, "learning_rate": 5e-05, "loss": 0.337, "step": 8700 }, { "epoch": 16.6351606805293, "grad_norm": 2.2750778198242188, "learning_rate": 5e-05, "loss": 0.3487, "step": 8800 }, { "epoch": 16.8241965973535, "grad_norm": 2.314293622970581, "learning_rate": 5e-05, "loss": 0.3377, "step": 8900 }, { "epoch": 17.0, "eval_accuracy": 0.7619730941704036, "eval_loss": 0.6475254893302917, "eval_runtime": 6.8675, "eval_samples_per_second": 72.806, "eval_steps_per_second": 9.174, "step": 8993 }, { "epoch": 17.013232514177695, "grad_norm": 1.7701884508132935, "learning_rate": 5e-05, "loss": 
0.345, "step": 9000 }, { "epoch": 17.20226843100189, "grad_norm": 2.2703754901885986, "learning_rate": 5e-05, "loss": 0.291, "step": 9100 }, { "epoch": 17.391304347826086, "grad_norm": 2.4970645904541016, "learning_rate": 5e-05, "loss": 0.2978, "step": 9200 }, { "epoch": 17.58034026465028, "grad_norm": 3.104184865951538, "learning_rate": 5e-05, "loss": 0.3045, "step": 9300 }, { "epoch": 17.76937618147448, "grad_norm": 2.147677183151245, "learning_rate": 5e-05, "loss": 0.3165, "step": 9400 }, { "epoch": 17.958412098298677, "grad_norm": 2.1485955715179443, "learning_rate": 5e-05, "loss": 0.3179, "step": 9500 }, { "epoch": 18.0, "eval_accuracy": 0.7679730941704036, "eval_loss": 0.6194583773612976, "eval_runtime": 7.1412, "eval_samples_per_second": 70.016, "eval_steps_per_second": 8.822, "step": 9522 }, { "epoch": 18.147448015122873, "grad_norm": 2.142909049987793, "learning_rate": 5e-05, "loss": 0.2745, "step": 9600 }, { "epoch": 18.33648393194707, "grad_norm": 2.3208634853363037, "learning_rate": 5e-05, "loss": 0.2708, "step": 9700 }, { "epoch": 18.525519848771268, "grad_norm": 2.5693838596343994, "learning_rate": 5e-05, "loss": 0.2764, "step": 9800 }, { "epoch": 18.714555765595463, "grad_norm": 2.5446975231170654, "learning_rate": 5e-05, "loss": 0.2786, "step": 9900 }, { "epoch": 18.90359168241966, "grad_norm": 2.087402582168579, "learning_rate": 5e-05, "loss": 0.288, "step": 10000 }, { "epoch": 19.0, "eval_accuracy": 0.7723318385650224, "eval_loss": 0.5961839556694031, "eval_runtime": 6.2662, "eval_samples_per_second": 79.793, "eval_steps_per_second": 10.054, "step": 10051 }, { "epoch": 19.092627599243855, "grad_norm": 1.9526759386062622, "learning_rate": 5e-05, "loss": 0.2627, "step": 10100 }, { "epoch": 19.281663516068054, "grad_norm": 1.8512705564498901, "learning_rate": 5e-05, "loss": 0.2401, "step": 10200 }, { "epoch": 19.47069943289225, "grad_norm": 2.320798635482788, "learning_rate": 5e-05, "loss": 0.2511, "step": 10300 }, { "epoch": 19.659735349716446, "grad_norm": 2.2155604362487793, "learning_rate": 5e-05, "loss": 0.2601, "step": 10400 }, { "epoch": 19.84877126654064, "grad_norm": 2.0877740383148193, "learning_rate": 5e-05, "loss": 0.2605, "step": 10500 }, { "epoch": 20.0, "eval_accuracy": 0.7753632286995515, "eval_loss": 0.5804058909416199, "eval_runtime": 6.6016, "eval_samples_per_second": 75.74, "eval_steps_per_second": 9.543, "step": 10580 }, { "epoch": 20.0, "step": 10580, "total_flos": 9.64242245391745e+17, "train_loss": 0.8903949523017176, "train_runtime": 23422.0707, "train_samples_per_second": 14.452, "train_steps_per_second": 0.452 } ], "logging_steps": 100, "max_steps": 10580, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "total_flos": 9.64242245391745e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }