{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 10580, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1890359168241966, "grad_norm": 0.45108333230018616, "learning_rate": 0.0005, "loss": 1.9159, "step": 100 }, { "epoch": 0.3780718336483932, "grad_norm": 0.6339304447174072, "learning_rate": 0.0005, "loss": 1.8756, "step": 200 }, { "epoch": 0.5671077504725898, "grad_norm": 0.5402090549468994, "learning_rate": 0.0005, "loss": 1.8511, "step": 300 }, { "epoch": 0.7561436672967864, "grad_norm": 0.43492260575294495, "learning_rate": 0.0005, "loss": 1.8581, "step": 400 }, { "epoch": 0.945179584120983, "grad_norm": 0.5317010879516602, "learning_rate": 0.0005, "loss": 1.8583, "step": 500 }, { "epoch": 1.0, "eval_accuracy": 0.5726153846153846, "eval_loss": 1.6375669240951538, "eval_runtime": 5.9206, "eval_samples_per_second": 84.451, "eval_steps_per_second": 10.641, "step": 529 }, { "epoch": 1.1342155009451795, "grad_norm": 0.5848847031593323, "learning_rate": 0.0005, "loss": 1.6655, "step": 600 }, { "epoch": 1.3232514177693762, "grad_norm": 0.642599880695343, "learning_rate": 0.0005, "loss": 1.581, "step": 700 }, { "epoch": 1.5122873345935728, "grad_norm": 0.5563580393791199, "learning_rate": 0.0005, "loss": 1.5945, "step": 800 }, { "epoch": 1.7013232514177694, "grad_norm": 0.576246976852417, "learning_rate": 0.0005, "loss": 1.5962, "step": 900 }, { "epoch": 1.8903591682419658, "grad_norm": 0.6526830196380615, "learning_rate": 0.0005, "loss": 1.6329, "step": 1000 }, { "epoch": 2.0, "eval_accuracy": 0.5712820512820512, "eval_loss": 1.6881264448165894, "eval_runtime": 6.002, "eval_samples_per_second": 83.305, "eval_steps_per_second": 10.496, "step": 1058 }, { "epoch": 2.0793950850661624, "grad_norm": 0.614975094795227, "learning_rate": 0.0005, "loss": 1.4729, "step": 1100 }, { "epoch": 2.268431001890359, "grad_norm": 0.7056101560592651, "learning_rate": 0.0005, "loss": 1.2845, "step": 1200 }, { "epoch": 2.4574669187145557, "grad_norm": 0.8371734619140625, "learning_rate": 0.0005, "loss": 1.3214, "step": 1300 }, { "epoch": 2.6465028355387523, "grad_norm": 0.8234328627586365, "learning_rate": 0.0005, "loss": 1.3326, "step": 1400 }, { "epoch": 2.835538752362949, "grad_norm": 0.7623058557510376, "learning_rate": 0.0005, "loss": 1.3464, "step": 1500 }, { "epoch": 3.0, "eval_accuracy": 0.5663076923076923, "eval_loss": 1.8256189823150635, "eval_runtime": 5.9355, "eval_samples_per_second": 84.239, "eval_steps_per_second": 10.614, "step": 1587 }, { "epoch": 3.0245746691871456, "grad_norm": 0.8697965145111084, "learning_rate": 0.0005, "loss": 1.3199, "step": 1600 }, { "epoch": 3.213610586011342, "grad_norm": 0.7469469308853149, "learning_rate": 0.0005, "loss": 1.0191, "step": 1700 }, { "epoch": 3.402646502835539, "grad_norm": 0.8800496459007263, "learning_rate": 0.0005, "loss": 1.0606, "step": 1800 }, { "epoch": 3.5916824196597354, "grad_norm": 0.7127139568328857, "learning_rate": 0.0005, "loss": 1.0988, "step": 1900 }, { "epoch": 3.780718336483932, "grad_norm": 0.9053574204444885, "learning_rate": 0.0005, "loss": 1.1355, "step": 2000 }, { "epoch": 3.9697542533081287, "grad_norm": 0.9217483997344971, "learning_rate": 0.0005, "loss": 1.1624, "step": 2100 }, { "epoch": 4.0, "eval_accuracy": 0.5651794871794872, "eval_loss": 1.9222664833068848, "eval_runtime": 5.7781, "eval_samples_per_second": 86.533, "eval_steps_per_second": 10.903, "step": 2116 }, { "epoch": 4.158790170132325, "grad_norm": 0.8502592444419861, "learning_rate": 0.0005, "loss": 0.8795, "step": 2200 }, { "epoch": 4.3478260869565215, "grad_norm": 0.8051011562347412, "learning_rate": 0.0005, "loss": 0.8713, "step": 2300 }, { "epoch": 4.536862003780718, "grad_norm": 1.0111184120178223, "learning_rate": 0.0005, "loss": 0.8974, "step": 2400 }, { "epoch": 4.725897920604915, "grad_norm": 0.9804710745811462, "learning_rate": 0.0005, "loss": 0.9396, "step": 2500 }, { "epoch": 4.914933837429111, "grad_norm": 1.1629979610443115, "learning_rate": 0.0005, "loss": 0.964, "step": 2600 }, { "epoch": 5.0, "eval_accuracy": 0.5642564102564103, "eval_loss": 1.9719713926315308, "eval_runtime": 6.1201, "eval_samples_per_second": 81.698, "eval_steps_per_second": 10.294, "step": 2645 }, { "epoch": 5.103969754253308, "grad_norm": 0.8145145177841187, "learning_rate": 0.0005, "loss": 0.8208, "step": 2700 }, { "epoch": 5.293005671077505, "grad_norm": 0.7975667715072632, "learning_rate": 0.0005, "loss": 0.7022, "step": 2800 }, { "epoch": 5.482041587901701, "grad_norm": 0.8971696496009827, "learning_rate": 0.0005, "loss": 0.7435, "step": 2900 }, { "epoch": 5.671077504725898, "grad_norm": 1.0234569311141968, "learning_rate": 0.0005, "loss": 0.7798, "step": 3000 }, { "epoch": 5.8601134215500945, "grad_norm": 1.0058211088180542, "learning_rate": 0.0005, "loss": 0.8117, "step": 3100 }, { "epoch": 6.0, "eval_accuracy": 0.5647179487179487, "eval_loss": 2.001645565032959, "eval_runtime": 6.1282, "eval_samples_per_second": 81.59, "eval_steps_per_second": 10.28, "step": 3174 }, { "epoch": 6.049149338374291, "grad_norm": 0.9231036901473999, "learning_rate": 0.0005, "loss": 0.7673, "step": 3200 }, { "epoch": 6.238185255198488, "grad_norm": 0.9058907628059387, "learning_rate": 0.0005, "loss": 0.5828, "step": 3300 }, { "epoch": 6.427221172022684, "grad_norm": 0.8709693551063538, "learning_rate": 0.0005, "loss": 0.6272, "step": 3400 }, { "epoch": 6.616257088846881, "grad_norm": 0.9873590469360352, "learning_rate": 0.0005, "loss": 0.6599, "step": 3500 }, { "epoch": 6.805293005671078, "grad_norm": 0.9257177114486694, "learning_rate": 0.0005, "loss": 0.6918, "step": 3600 }, { "epoch": 6.994328922495274, "grad_norm": 1.0284956693649292, "learning_rate": 0.0005, "loss": 0.7242, "step": 3700 }, { "epoch": 7.0, "eval_accuracy": 0.5638974358974359, "eval_loss": 2.0784664154052734, "eval_runtime": 6.1613, "eval_samples_per_second": 81.152, "eval_steps_per_second": 10.225, "step": 3703 }, { "epoch": 7.183364839319471, "grad_norm": 1.0036342144012451, "learning_rate": 0.0005, "loss": 0.5099, "step": 3800 }, { "epoch": 7.3724007561436675, "grad_norm": 0.9070968627929688, "learning_rate": 0.0005, "loss": 0.541, "step": 3900 }, { "epoch": 7.561436672967864, "grad_norm": 1.035317063331604, "learning_rate": 0.0005, "loss": 0.5681, "step": 4000 }, { "epoch": 7.750472589792061, "grad_norm": 1.0337880849838257, "learning_rate": 0.0005, "loss": 0.5991, "step": 4100 }, { "epoch": 7.939508506616257, "grad_norm": 0.9707123637199402, "learning_rate": 0.0005, "loss": 0.6381, "step": 4200 }, { "epoch": 8.0, "eval_accuracy": 0.5644615384615385, "eval_loss": 2.0954012870788574, "eval_runtime": 5.9732, "eval_samples_per_second": 83.708, "eval_steps_per_second": 10.547, "step": 4232 }, { "epoch": 8.128544423440454, "grad_norm": 0.9024428129196167, "learning_rate": 0.0005, "loss": 0.5113, "step": 4300 }, { "epoch": 8.31758034026465, "grad_norm": 0.8392314314842224, "learning_rate": 0.0005, "loss": 0.4752, "step": 4400 }, { "epoch": 8.506616257088847, "grad_norm": 0.9310020208358765, "learning_rate": 0.0005, "loss": 0.5088, "step": 4500 }, { "epoch": 8.695652173913043, "grad_norm": 1.115989327430725, "learning_rate": 0.0005, "loss": 0.5358, "step": 4600 }, { "epoch": 8.88468809073724, "grad_norm": 1.1282142400741577, "learning_rate": 0.0005, "loss": 0.573, "step": 4700 }, { "epoch": 9.0, "eval_accuracy": 0.5623076923076923, "eval_loss": 2.106734275817871, "eval_runtime": 5.9846, "eval_samples_per_second": 83.548, "eval_steps_per_second": 10.527, "step": 4761 }, { "epoch": 9.073724007561436, "grad_norm": 0.7718724012374878, "learning_rate": 0.0005, "loss": 0.5165, "step": 4800 }, { "epoch": 9.262759924385634, "grad_norm": 1.0482347011566162, "learning_rate": 0.0005, "loss": 0.4291, "step": 4900 }, { "epoch": 9.45179584120983, "grad_norm": 1.1258679628372192, "learning_rate": 0.0005, "loss": 0.467, "step": 5000 }, { "epoch": 9.640831758034027, "grad_norm": 1.0340867042541504, "learning_rate": 0.0005, "loss": 0.4992, "step": 5100 }, { "epoch": 9.829867674858223, "grad_norm": 0.9999015927314758, "learning_rate": 0.0005, "loss": 0.5269, "step": 5200 }, { "epoch": 10.0, "eval_accuracy": 0.5646153846153846, "eval_loss": 2.1355865001678467, "eval_runtime": 5.6591, "eval_samples_per_second": 88.353, "eval_steps_per_second": 11.132, "step": 5290 }, { "epoch": 10.01890359168242, "grad_norm": 0.8213431239128113, "learning_rate": 0.0005, "loss": 0.5325, "step": 5300 }, { "epoch": 10.207939508506616, "grad_norm": 1.023992896080017, "learning_rate": 0.0005, "loss": 0.3966, "step": 5400 }, { "epoch": 10.396975425330814, "grad_norm": 0.8211169242858887, "learning_rate": 0.0005, "loss": 0.4307, "step": 5500 }, { "epoch": 10.58601134215501, "grad_norm": 0.9888399839401245, "learning_rate": 0.0005, "loss": 0.4604, "step": 5600 }, { "epoch": 10.775047258979207, "grad_norm": 0.9565384387969971, "learning_rate": 0.0005, "loss": 0.4888, "step": 5700 }, { "epoch": 10.964083175803403, "grad_norm": 1.0463844537734985, "learning_rate": 0.0005, "loss": 0.5144, "step": 5800 }, { "epoch": 11.0, "eval_accuracy": 0.5616410256410257, "eval_loss": 2.195107936859131, "eval_runtime": 5.722, "eval_samples_per_second": 87.382, "eval_steps_per_second": 11.01, "step": 5819 }, { "epoch": 11.1531190926276, "grad_norm": 0.9159551858901978, "learning_rate": 0.0005, "loss": 0.4094, "step": 5900 }, { "epoch": 11.342155009451796, "grad_norm": 0.9027566909790039, "learning_rate": 0.0005, "loss": 0.4083, "step": 6000 }, { "epoch": 11.531190926275993, "grad_norm": 0.9325262904167175, "learning_rate": 0.0005, "loss": 0.4339, "step": 6100 }, { "epoch": 11.720226843100189, "grad_norm": 1.0173691511154175, "learning_rate": 0.0005, "loss": 0.464, "step": 6200 }, { "epoch": 11.909262759924385, "grad_norm": 0.9742277264595032, "learning_rate": 0.0005, "loss": 0.4887, "step": 6300 }, { "epoch": 12.0, "eval_accuracy": 0.5631282051282052, "eval_loss": 2.1778857707977295, "eval_runtime": 6.0538, "eval_samples_per_second": 82.593, "eval_steps_per_second": 10.407, "step": 6348 }, { "epoch": 12.098298676748582, "grad_norm": 1.0347404479980469, "learning_rate": 0.0005, "loss": 0.438, "step": 6400 }, { "epoch": 12.287334593572778, "grad_norm": 0.8733549118041992, "learning_rate": 0.0005, "loss": 0.3893, "step": 6500 }, { "epoch": 12.476370510396976, "grad_norm": 0.8693263530731201, "learning_rate": 0.0005, "loss": 0.4142, "step": 6600 }, { "epoch": 12.665406427221171, "grad_norm": 1.1597702503204346, "learning_rate": 0.0005, "loss": 0.4379, "step": 6700 }, { "epoch": 12.854442344045369, "grad_norm": 1.018873929977417, "learning_rate": 0.0005, "loss": 0.4636, "step": 6800 }, { "epoch": 13.0, "eval_accuracy": 0.561076923076923, "eval_loss": 2.175729274749756, "eval_runtime": 5.683, "eval_samples_per_second": 87.981, "eval_steps_per_second": 11.086, "step": 6877 }, { "epoch": 13.043478260869565, "grad_norm": 0.8772552013397217, "learning_rate": 0.0005, "loss": 0.4552, "step": 6900 }, { "epoch": 13.232514177693762, "grad_norm": 0.8089916110038757, "learning_rate": 0.0005, "loss": 0.3698, "step": 7000 }, { "epoch": 13.421550094517958, "grad_norm": 1.0659642219543457, "learning_rate": 0.0005, "loss": 0.3971, "step": 7100 }, { "epoch": 13.610586011342155, "grad_norm": 1.2195113897323608, "learning_rate": 0.0005, "loss": 0.4213, "step": 7200 }, { "epoch": 13.799621928166351, "grad_norm": 1.1026618480682373, "learning_rate": 0.0005, "loss": 0.4487, "step": 7300 }, { "epoch": 13.988657844990549, "grad_norm": 1.120952844619751, "learning_rate": 0.0005, "loss": 0.467, "step": 7400 }, { "epoch": 14.0, "eval_accuracy": 0.5624102564102564, "eval_loss": 2.1781177520751953, "eval_runtime": 5.7664, "eval_samples_per_second": 86.709, "eval_steps_per_second": 10.925, "step": 7406 }, { "epoch": 14.177693761814744, "grad_norm": 0.9397806525230408, "learning_rate": 0.0005, "loss": 0.3685, "step": 7500 }, { "epoch": 14.366729678638942, "grad_norm": 0.8353122472763062, "learning_rate": 0.0005, "loss": 0.3828, "step": 7600 }, { "epoch": 14.555765595463138, "grad_norm": 1.0425219535827637, "learning_rate": 0.0005, "loss": 0.4026, "step": 7700 }, { "epoch": 14.744801512287335, "grad_norm": 1.0660319328308105, "learning_rate": 0.0005, "loss": 0.4287, "step": 7800 }, { "epoch": 14.93383742911153, "grad_norm": 1.131705403327942, "learning_rate": 0.0005, "loss": 0.4613, "step": 7900 }, { "epoch": 15.0, "eval_accuracy": 0.5611794871794872, "eval_loss": 2.231194019317627, "eval_runtime": 5.8033, "eval_samples_per_second": 86.158, "eval_steps_per_second": 10.856, "step": 7935 }, { "epoch": 15.122873345935728, "grad_norm": 0.8532615303993225, "learning_rate": 0.0005, "loss": 0.3889, "step": 8000 }, { "epoch": 15.311909262759924, "grad_norm": 0.9682427048683167, "learning_rate": 0.0005, "loss": 0.3675, "step": 8100 }, { "epoch": 15.500945179584122, "grad_norm": 0.9799471497535706, "learning_rate": 0.0005, "loss": 0.3968, "step": 8200 }, { "epoch": 15.689981096408317, "grad_norm": 0.9278863072395325, "learning_rate": 0.0005, "loss": 0.4198, "step": 8300 }, { "epoch": 15.879017013232515, "grad_norm": 1.0345618724822998, "learning_rate": 0.0005, "loss": 0.4405, "step": 8400 }, { "epoch": 16.0, "eval_accuracy": 0.562923076923077, "eval_loss": 2.1799755096435547, "eval_runtime": 5.8871, "eval_samples_per_second": 84.931, "eval_steps_per_second": 10.701, "step": 8464 }, { "epoch": 16.068052930056712, "grad_norm": 0.9081653356552124, "learning_rate": 0.0005, "loss": 0.4241, "step": 8500 }, { "epoch": 16.257088846880908, "grad_norm": 0.8579433560371399, "learning_rate": 0.0005, "loss": 0.3634, "step": 8600 }, { "epoch": 16.446124763705104, "grad_norm": 1.042361855506897, "learning_rate": 0.0005, "loss": 0.3837, "step": 8700 }, { "epoch": 16.6351606805293, "grad_norm": 1.032463550567627, "learning_rate": 0.0005, "loss": 0.4071, "step": 8800 }, { "epoch": 16.8241965973535, "grad_norm": 1.0003693103790283, "learning_rate": 0.0005, "loss": 0.4308, "step": 8900 }, { "epoch": 17.0, "eval_accuracy": 0.5627692307692308, "eval_loss": 2.1960062980651855, "eval_runtime": 6.0711, "eval_samples_per_second": 82.358, "eval_steps_per_second": 10.377, "step": 8993 }, { "epoch": 17.013232514177695, "grad_norm": 0.793249249458313, "learning_rate": 0.0005, "loss": 0.4471, "step": 9000 }, { "epoch": 17.20226843100189, "grad_norm": 0.9027940630912781, "learning_rate": 0.0005, "loss": 0.3499, "step": 9100 }, { "epoch": 17.391304347826086, "grad_norm": 0.9174290299415588, "learning_rate": 0.0005, "loss": 0.3698, "step": 9200 }, { "epoch": 17.58034026465028, "grad_norm": 1.135392665863037, "learning_rate": 0.0005, "loss": 0.4027, "step": 9300 }, { "epoch": 17.76937618147448, "grad_norm": 0.9852614998817444, "learning_rate": 0.0005, "loss": 0.4179, "step": 9400 }, { "epoch": 17.958412098298677, "grad_norm": 0.9487043619155884, "learning_rate": 0.0005, "loss": 0.4401, "step": 9500 }, { "epoch": 18.0, "eval_accuracy": 0.560974358974359, "eval_loss": 2.2354702949523926, "eval_runtime": 5.7571, "eval_samples_per_second": 86.85, "eval_steps_per_second": 10.943, "step": 9522 }, { "epoch": 18.147448015122873, "grad_norm": 0.7868739366531372, "learning_rate": 0.0005, "loss": 0.3637, "step": 9600 }, { "epoch": 18.33648393194707, "grad_norm": 0.962329626083374, "learning_rate": 0.0005, "loss": 0.3642, "step": 9700 }, { "epoch": 18.525519848771268, "grad_norm": 0.9594590663909912, "learning_rate": 0.0005, "loss": 0.3865, "step": 9800 }, { "epoch": 18.714555765595463, "grad_norm": 1.0574795007705688, "learning_rate": 0.0005, "loss": 0.4081, "step": 9900 }, { "epoch": 18.90359168241966, "grad_norm": 1.0566685199737549, "learning_rate": 0.0005, "loss": 0.4334, "step": 10000 }, { "epoch": 19.0, "eval_accuracy": 0.5608205128205128, "eval_loss": 2.2380332946777344, "eval_runtime": 5.7808, "eval_samples_per_second": 86.493, "eval_steps_per_second": 10.898, "step": 10051 }, { "epoch": 19.092627599243855, "grad_norm": 0.6632242798805237, "learning_rate": 0.0005, "loss": 0.393, "step": 10100 }, { "epoch": 19.281663516068054, "grad_norm": 0.8786609768867493, "learning_rate": 0.0005, "loss": 0.3528, "step": 10200 }, { "epoch": 19.47069943289225, "grad_norm": 1.0305085182189941, "learning_rate": 0.0005, "loss": 0.3752, "step": 10300 }, { "epoch": 19.659735349716446, "grad_norm": 0.9528347849845886, "learning_rate": 0.0005, "loss": 0.4023, "step": 10400 }, { "epoch": 19.84877126654064, "grad_norm": 0.9051763415336609, "learning_rate": 0.0005, "loss": 0.4218, "step": 10500 }, { "epoch": 20.0, "eval_accuracy": 0.560974358974359, "eval_loss": 2.2417104244232178, "eval_runtime": 5.7479, "eval_samples_per_second": 86.989, "eval_steps_per_second": 10.961, "step": 10580 }, { "epoch": 20.0, "step": 10580, "total_flos": 6.517631969856061e+17, "train_loss": 0.6977483630405708, "train_runtime": 24342.3236, "train_samples_per_second": 13.906, "train_steps_per_second": 0.435 } ], "logging_steps": 100, "max_steps": 10580, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "total_flos": 6.517631969856061e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }