{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 10580, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1890359168241966, "grad_norm": 0.24623066186904907, "learning_rate": 3e-05, "loss": 2.0175, "step": 100 }, { "epoch": 0.3780718336483932, "grad_norm": 0.3154822587966919, "learning_rate": 3e-05, "loss": 1.9025, "step": 200 }, { "epoch": 0.5671077504725898, "grad_norm": 0.2886754274368286, "learning_rate": 3e-05, "loss": 1.8754, "step": 300 }, { "epoch": 0.7561436672967864, "grad_norm": 0.33380141854286194, "learning_rate": 3e-05, "loss": 1.8795, "step": 400 }, { "epoch": 0.945179584120983, "grad_norm": 0.3052491247653961, "learning_rate": 3e-05, "loss": 1.86, "step": 500 }, { "epoch": 1.0, "eval_accuracy": 0.605677130044843, "eval_loss": 1.6920065879821777, "eval_runtime": 6.0828, "eval_samples_per_second": 82.199, "eval_steps_per_second": 10.357, "step": 529 }, { "epoch": 1.1342155009451795, "grad_norm": 0.38703393936157227, "learning_rate": 3e-05, "loss": 1.8651, "step": 600 }, { "epoch": 1.3232514177693762, "grad_norm": 0.5257138609886169, "learning_rate": 3e-05, "loss": 1.8238, "step": 700 }, { "epoch": 1.5122873345935728, "grad_norm": 0.5151342749595642, "learning_rate": 3e-05, "loss": 1.8252, "step": 800 }, { "epoch": 1.7013232514177694, "grad_norm": 0.4788012206554413, "learning_rate": 3e-05, "loss": 1.8178, "step": 900 }, { "epoch": 1.8903591682419658, "grad_norm": 0.506370484828949, "learning_rate": 3e-05, "loss": 1.8271, "step": 1000 }, { "epoch": 2.0, "eval_accuracy": 0.6118744394618834, "eval_loss": 1.6425946950912476, "eval_runtime": 6.042, "eval_samples_per_second": 82.754, "eval_steps_per_second": 10.427, "step": 1058 }, { "epoch": 2.0793950850661624, "grad_norm": 0.537693440914154, "learning_rate": 3e-05, "loss": 1.795, "step": 1100 }, { "epoch": 2.268431001890359, "grad_norm": 0.6806929111480713, "learning_rate": 3e-05, "loss": 1.773, "step": 1200 }, { "epoch": 2.4574669187145557, "grad_norm": 0.7064961194992065, "learning_rate": 3e-05, "loss": 1.7609, "step": 1300 }, { "epoch": 2.6465028355387523, "grad_norm": 0.7571425437927246, "learning_rate": 3e-05, "loss": 1.7536, "step": 1400 }, { "epoch": 2.835538752362949, "grad_norm": 0.8069006204605103, "learning_rate": 3e-05, "loss": 1.7324, "step": 1500 }, { "epoch": 3.0, "eval_accuracy": 0.6163408071748879, "eval_loss": 1.59978187084198, "eval_runtime": 6.4372, "eval_samples_per_second": 77.673, "eval_steps_per_second": 9.787, "step": 1587 }, { "epoch": 3.0245746691871456, "grad_norm": 0.7909080982208252, "learning_rate": 3e-05, "loss": 1.7206, "step": 1600 }, { "epoch": 3.213610586011342, "grad_norm": 0.8461490869522095, "learning_rate": 3e-05, "loss": 1.6696, "step": 1700 }, { "epoch": 3.402646502835539, "grad_norm": 0.9591927528381348, "learning_rate": 3e-05, "loss": 1.672, "step": 1800 }, { "epoch": 3.5916824196597354, "grad_norm": 1.0703200101852417, "learning_rate": 3e-05, "loss": 1.6712, "step": 1900 }, { "epoch": 3.780718336483932, "grad_norm": 0.9409723281860352, "learning_rate": 3e-05, "loss": 1.6877, "step": 2000 }, { "epoch": 3.9697542533081287, "grad_norm": 0.9087811708450317, "learning_rate": 3e-05, "loss": 1.6818, "step": 2100 }, { "epoch": 4.0, "eval_accuracy": 0.621982062780269, "eval_loss": 1.5579897165298462, "eval_runtime": 6.0535, "eval_samples_per_second": 82.597, "eval_steps_per_second": 10.407, "step": 2116 }, { "epoch": 4.158790170132325, "grad_norm": 1.0507543087005615, "learning_rate": 3e-05, "loss": 1.6195, "step": 2200 }, { "epoch": 4.3478260869565215, "grad_norm": 1.1318222284317017, "learning_rate": 3e-05, "loss": 1.6039, "step": 2300 }, { "epoch": 4.536862003780718, "grad_norm": 1.1028773784637451, "learning_rate": 3e-05, "loss": 1.5924, "step": 2400 }, { "epoch": 4.725897920604915, "grad_norm": 1.1365679502487183, "learning_rate": 3e-05, "loss": 1.5903, "step": 2500 }, { "epoch": 4.914933837429111, "grad_norm": 1.0776621103286743, "learning_rate": 3e-05, "loss": 1.5864, "step": 2600 }, { "epoch": 5.0, "eval_accuracy": 0.6278206278026905, "eval_loss": 1.5211377143859863, "eval_runtime": 6.0103, "eval_samples_per_second": 83.19, "eval_steps_per_second": 10.482, "step": 2645 }, { "epoch": 5.103969754253308, "grad_norm": 1.1829136610031128, "learning_rate": 3e-05, "loss": 1.5478, "step": 2700 }, { "epoch": 5.293005671077505, "grad_norm": 1.1871124505996704, "learning_rate": 3e-05, "loss": 1.5151, "step": 2800 }, { "epoch": 5.482041587901701, "grad_norm": 1.2089450359344482, "learning_rate": 3e-05, "loss": 1.5159, "step": 2900 }, { "epoch": 5.671077504725898, "grad_norm": 1.3170595169067383, "learning_rate": 3e-05, "loss": 1.5241, "step": 3000 }, { "epoch": 5.8601134215500945, "grad_norm": 1.439405083656311, "learning_rate": 3e-05, "loss": 1.5204, "step": 3100 }, { "epoch": 6.0, "eval_accuracy": 0.6326726457399103, "eval_loss": 1.4863040447235107, "eval_runtime": 6.0604, "eval_samples_per_second": 82.502, "eval_steps_per_second": 10.395, "step": 3174 }, { "epoch": 6.049149338374291, "grad_norm": 1.3573708534240723, "learning_rate": 3e-05, "loss": 1.506, "step": 3200 }, { "epoch": 6.238185255198488, "grad_norm": 1.3852615356445312, "learning_rate": 3e-05, "loss": 1.4401, "step": 3300 }, { "epoch": 6.427221172022684, "grad_norm": 1.391204595565796, "learning_rate": 3e-05, "loss": 1.4469, "step": 3400 }, { "epoch": 6.616257088846881, "grad_norm": 1.5318756103515625, "learning_rate": 3e-05, "loss": 1.4608, "step": 3500 }, { "epoch": 6.805293005671078, "grad_norm": 1.4233124256134033, "learning_rate": 3e-05, "loss": 1.4419, "step": 3600 }, { "epoch": 6.994328922495274, "grad_norm": 1.4444003105163574, "learning_rate": 3e-05, "loss": 1.4481, "step": 3700 }, { "epoch": 7.0, "eval_accuracy": 0.6371928251121076, "eval_loss": 1.4516844749450684, "eval_runtime": 6.0218, "eval_samples_per_second": 83.032, "eval_steps_per_second": 10.462, "step": 3703 }, { "epoch": 7.183364839319471, "grad_norm": 1.607857346534729, "learning_rate": 3e-05, "loss": 1.3742, "step": 3800 }, { "epoch": 7.3724007561436675, "grad_norm": 1.5741080045700073, "learning_rate": 3e-05, "loss": 1.383, "step": 3900 }, { "epoch": 7.561436672967864, "grad_norm": 1.6515594720840454, "learning_rate": 3e-05, "loss": 1.3734, "step": 4000 }, { "epoch": 7.750472589792061, "grad_norm": 1.704078197479248, "learning_rate": 3e-05, "loss": 1.3527, "step": 4100 }, { "epoch": 7.939508506616257, "grad_norm": 1.7636491060256958, "learning_rate": 3e-05, "loss": 1.3768, "step": 4200 }, { "epoch": 8.0, "eval_accuracy": 0.6429058295964125, "eval_loss": 1.4120872020721436, "eval_runtime": 6.7429, "eval_samples_per_second": 74.152, "eval_steps_per_second": 9.343, "step": 4232 }, { "epoch": 8.128544423440454, "grad_norm": 1.691927433013916, "learning_rate": 3e-05, "loss": 1.3254, "step": 4300 }, { "epoch": 8.31758034026465, "grad_norm": 1.695786476135254, "learning_rate": 3e-05, "loss": 1.2819, "step": 4400 }, { "epoch": 8.506616257088847, "grad_norm": 1.8343569040298462, "learning_rate": 3e-05, "loss": 1.3225, "step": 4500 }, { "epoch": 8.695652173913043, "grad_norm": 1.678654670715332, "learning_rate": 3e-05, "loss": 1.2961, "step": 4600 }, { "epoch": 8.88468809073724, "grad_norm": 2.0212690830230713, "learning_rate": 3e-05, "loss": 1.2946, "step": 4700 }, { "epoch": 9.0, "eval_accuracy": 0.6481973094170403, "eval_loss": 1.3739463090896606, "eval_runtime": 6.0696, "eval_samples_per_second": 82.377, "eval_steps_per_second": 10.38, "step": 4761 }, { "epoch": 9.073724007561436, "grad_norm": 1.8944847583770752, "learning_rate": 3e-05, "loss": 1.2831, "step": 4800 }, { "epoch": 9.262759924385634, "grad_norm": 1.9275157451629639, "learning_rate": 3e-05, "loss": 1.2022, "step": 4900 }, { "epoch": 9.45179584120983, "grad_norm": 1.927822470664978, "learning_rate": 3e-05, "loss": 1.2259, "step": 5000 }, { "epoch": 9.640831758034027, "grad_norm": 1.8706145286560059, "learning_rate": 3e-05, "loss": 1.2231, "step": 5100 }, { "epoch": 9.829867674858223, "grad_norm": 1.858546257019043, "learning_rate": 3e-05, "loss": 1.243, "step": 5200 }, { "epoch": 10.0, "eval_accuracy": 0.653237668161435, "eval_loss": 1.3363806009292603, "eval_runtime": 6.6274, "eval_samples_per_second": 75.444, "eval_steps_per_second": 9.506, "step": 5290 }, { "epoch": 10.01890359168242, "grad_norm": 2.2799806594848633, "learning_rate": 3e-05, "loss": 1.2223, "step": 5300 }, { "epoch": 10.207939508506616, "grad_norm": 2.0888822078704834, "learning_rate": 3e-05, "loss": 1.1567, "step": 5400 }, { "epoch": 10.396975425330814, "grad_norm": 2.1389222145080566, "learning_rate": 3e-05, "loss": 1.1421, "step": 5500 }, { "epoch": 10.58601134215501, "grad_norm": 2.253117322921753, "learning_rate": 3e-05, "loss": 1.1525, "step": 5600 }, { "epoch": 10.775047258979207, "grad_norm": 2.3171472549438477, "learning_rate": 3e-05, "loss": 1.168, "step": 5700 }, { "epoch": 10.964083175803403, "grad_norm": 2.1451621055603027, "learning_rate": 3e-05, "loss": 1.1425, "step": 5800 }, { "epoch": 11.0, "eval_accuracy": 0.6593991031390134, "eval_loss": 1.2968207597732544, "eval_runtime": 5.9969, "eval_samples_per_second": 83.377, "eval_steps_per_second": 10.506, "step": 5819 }, { "epoch": 11.1531190926276, "grad_norm": 2.0139310359954834, "learning_rate": 3e-05, "loss": 1.0814, "step": 5900 }, { "epoch": 11.342155009451796, "grad_norm": 2.2693676948547363, "learning_rate": 3e-05, "loss": 1.0731, "step": 6000 }, { "epoch": 11.531190926275993, "grad_norm": 2.4319376945495605, "learning_rate": 3e-05, "loss": 1.0842, "step": 6100 }, { "epoch": 11.720226843100189, "grad_norm": 2.103640079498291, "learning_rate": 3e-05, "loss": 1.0839, "step": 6200 }, { "epoch": 11.909262759924385, "grad_norm": 2.309508800506592, "learning_rate": 3e-05, "loss": 1.0847, "step": 6300 }, { "epoch": 12.0, "eval_accuracy": 0.6652017937219731, "eval_loss": 1.2538820505142212, "eval_runtime": 6.0994, "eval_samples_per_second": 81.975, "eval_steps_per_second": 10.329, "step": 6348 }, { "epoch": 12.098298676748582, "grad_norm": 2.184951066970825, "learning_rate": 3e-05, "loss": 1.0154, "step": 6400 }, { "epoch": 12.287334593572778, "grad_norm": 2.6142184734344482, "learning_rate": 3e-05, "loss": 1.0129, "step": 6500 }, { "epoch": 12.476370510396976, "grad_norm": 2.377812147140503, "learning_rate": 3e-05, "loss": 1.01, "step": 6600 }, { "epoch": 12.665406427221171, "grad_norm": 2.252695083618164, "learning_rate": 3e-05, "loss": 1.0132, "step": 6700 }, { "epoch": 12.854442344045369, "grad_norm": 2.4628734588623047, "learning_rate": 3e-05, "loss": 1.0152, "step": 6800 }, { "epoch": 13.0, "eval_accuracy": 0.670609865470852, "eval_loss": 1.216369390487671, "eval_runtime": 6.115, "eval_samples_per_second": 81.766, "eval_steps_per_second": 10.302, "step": 6877 }, { "epoch": 13.043478260869565, "grad_norm": 2.2027320861816406, "learning_rate": 3e-05, "loss": 1.0047, "step": 6900 }, { "epoch": 13.232514177693762, "grad_norm": 2.1473910808563232, "learning_rate": 3e-05, "loss": 0.9308, "step": 7000 }, { "epoch": 13.421550094517958, "grad_norm": 2.633467435836792, "learning_rate": 3e-05, "loss": 0.9265, "step": 7100 }, { "epoch": 13.610586011342155, "grad_norm": 2.5691654682159424, "learning_rate": 3e-05, "loss": 0.9485, "step": 7200 }, { "epoch": 13.799621928166351, "grad_norm": 2.6028811931610107, "learning_rate": 3e-05, "loss": 0.9502, "step": 7300 }, { "epoch": 13.988657844990549, "grad_norm": 2.6901166439056396, "learning_rate": 3e-05, "loss": 0.9498, "step": 7400 }, { "epoch": 14.0, "eval_accuracy": 0.6757937219730942, "eval_loss": 1.176974892616272, "eval_runtime": 6.0622, "eval_samples_per_second": 82.479, "eval_steps_per_second": 10.392, "step": 7406 }, { "epoch": 14.177693761814744, "grad_norm": 2.624717950820923, "learning_rate": 3e-05, "loss": 0.8708, "step": 7500 }, { "epoch": 14.366729678638942, "grad_norm": 2.5228078365325928, "learning_rate": 3e-05, "loss": 0.8685, "step": 7600 }, { "epoch": 14.555765595463138, "grad_norm": 2.8440725803375244, "learning_rate": 3e-05, "loss": 0.88, "step": 7700 }, { "epoch": 14.744801512287335, "grad_norm": 2.5001230239868164, "learning_rate": 3e-05, "loss": 0.8968, "step": 7800 }, { "epoch": 14.93383742911153, "grad_norm": 2.868446111679077, "learning_rate": 3e-05, "loss": 0.8652, "step": 7900 }, { "epoch": 15.0, "eval_accuracy": 0.6820717488789237, "eval_loss": 1.1323232650756836, "eval_runtime": 6.0411, "eval_samples_per_second": 82.767, "eval_steps_per_second": 10.429, "step": 7935 }, { "epoch": 15.122873345935728, "grad_norm": 3.236562728881836, "learning_rate": 3e-05, "loss": 0.8354, "step": 8000 }, { "epoch": 15.311909262759924, "grad_norm": 3.0811691284179688, "learning_rate": 3e-05, "loss": 0.8042, "step": 8100 }, { "epoch": 15.500945179584122, "grad_norm": 2.6125237941741943, "learning_rate": 3e-05, "loss": 0.8097, "step": 8200 }, { "epoch": 15.689981096408317, "grad_norm": 2.8925766944885254, "learning_rate": 3e-05, "loss": 0.8133, "step": 8300 }, { "epoch": 15.879017013232515, "grad_norm": 2.927067518234253, "learning_rate": 3e-05, "loss": 0.8265, "step": 8400 }, { "epoch": 16.0, "eval_accuracy": 0.690134529147982, "eval_loss": 1.0825910568237305, "eval_runtime": 6.0821, "eval_samples_per_second": 82.209, "eval_steps_per_second": 10.358, "step": 8464 }, { "epoch": 16.068052930056712, "grad_norm": 2.8401355743408203, "learning_rate": 3e-05, "loss": 0.7933, "step": 8500 }, { "epoch": 16.257088846880908, "grad_norm": 3.2201480865478516, "learning_rate": 3e-05, "loss": 0.7449, "step": 8600 }, { "epoch": 16.446124763705104, "grad_norm": 3.2903711795806885, "learning_rate": 3e-05, "loss": 0.7565, "step": 8700 }, { "epoch": 16.6351606805293, "grad_norm": 3.153130292892456, "learning_rate": 3e-05, "loss": 0.773, "step": 8800 }, { "epoch": 16.8241965973535, "grad_norm": 2.8367865085601807, "learning_rate": 3e-05, "loss": 0.7432, "step": 8900 }, { "epoch": 17.0, "eval_accuracy": 0.6959910313901345, "eval_loss": 1.04633367061615, "eval_runtime": 6.1125, "eval_samples_per_second": 81.8, "eval_steps_per_second": 10.307, "step": 8993 }, { "epoch": 17.013232514177695, "grad_norm": 3.268256664276123, "learning_rate": 3e-05, "loss": 0.7576, "step": 9000 }, { "epoch": 17.20226843100189, "grad_norm": 3.0888004302978516, "learning_rate": 3e-05, "loss": 0.6879, "step": 9100 }, { "epoch": 17.391304347826086, "grad_norm": 2.908324718475342, "learning_rate": 3e-05, "loss": 0.6869, "step": 9200 }, { "epoch": 17.58034026465028, "grad_norm": 4.358062267303467, "learning_rate": 3e-05, "loss": 0.7027, "step": 9300 }, { "epoch": 17.76937618147448, "grad_norm": 3.1521716117858887, "learning_rate": 3e-05, "loss": 0.7188, "step": 9400 }, { "epoch": 17.958412098298677, "grad_norm": 3.1939432621002197, "learning_rate": 3e-05, "loss": 0.7106, "step": 9500 }, { "epoch": 18.0, "eval_accuracy": 0.7021973094170404, "eval_loss": 1.0097612142562866, "eval_runtime": 5.6404, "eval_samples_per_second": 88.646, "eval_steps_per_second": 11.169, "step": 9522 }, { "epoch": 18.147448015122873, "grad_norm": 3.3058857917785645, "learning_rate": 3e-05, "loss": 0.6526, "step": 9600 }, { "epoch": 18.33648393194707, "grad_norm": 2.845630645751953, "learning_rate": 3e-05, "loss": 0.6375, "step": 9700 }, { "epoch": 18.525519848771268, "grad_norm": 3.524501323699951, "learning_rate": 3e-05, "loss": 0.65, "step": 9800 }, { "epoch": 18.714555765595463, "grad_norm": 3.5869174003601074, "learning_rate": 3e-05, "loss": 0.6529, "step": 9900 }, { "epoch": 18.90359168241966, "grad_norm": 2.8486292362213135, "learning_rate": 3e-05, "loss": 0.669, "step": 10000 }, { "epoch": 19.0, "eval_accuracy": 0.7077937219730942, "eval_loss": 0.9695693254470825, "eval_runtime": 6.1993, "eval_samples_per_second": 80.654, "eval_steps_per_second": 10.162, "step": 10051 }, { "epoch": 19.092627599243855, "grad_norm": 2.8552756309509277, "learning_rate": 3e-05, "loss": 0.6183, "step": 10100 }, { "epoch": 19.281663516068054, "grad_norm": 2.949084758758545, "learning_rate": 3e-05, "loss": 0.5824, "step": 10200 }, { "epoch": 19.47069943289225, "grad_norm": 3.137747049331665, "learning_rate": 3e-05, "loss": 0.6093, "step": 10300 }, { "epoch": 19.659735349716446, "grad_norm": 2.7827863693237305, "learning_rate": 3e-05, "loss": 0.6155, "step": 10400 }, { "epoch": 19.84877126654064, "grad_norm": 3.2223191261291504, "learning_rate": 3e-05, "loss": 0.6043, "step": 10500 }, { "epoch": 20.0, "eval_accuracy": 0.7134618834080717, "eval_loss": 0.9358564019203186, "eval_runtime": 6.1149, "eval_samples_per_second": 81.767, "eval_steps_per_second": 10.303, "step": 10580 }, { "epoch": 20.0, "step": 10580, "total_flos": 9.64242245391745e+17, "train_loss": 1.2100651977192927, "train_runtime": 22841.5406, "train_samples_per_second": 14.819, "train_steps_per_second": 0.463 } ], "logging_steps": 100, "max_steps": 10580, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "total_flos": 9.64242245391745e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }