{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 10580, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1890359168241966, "grad_norm": 0.37213513255119324, "learning_rate": 0.0003, "loss": 1.9201, "step": 100 }, { "epoch": 0.3780718336483932, "grad_norm": 0.5696528553962708, "learning_rate": 0.0003, "loss": 1.8659, "step": 200 }, { "epoch": 0.5671077504725898, "grad_norm": 0.44589683413505554, "learning_rate": 0.0003, "loss": 1.8382, "step": 300 }, { "epoch": 0.7561436672967864, "grad_norm": 0.4036942720413208, "learning_rate": 0.0003, "loss": 1.8431, "step": 400 }, { "epoch": 0.945179584120983, "grad_norm": 0.45892009139060974, "learning_rate": 0.0003, "loss": 1.8369, "step": 500 }, { "epoch": 1.0, "eval_accuracy": 0.575076923076923, "eval_loss": 1.6031557321548462, "eval_runtime": 5.71, "eval_samples_per_second": 87.566, "eval_steps_per_second": 11.033, "step": 529 }, { "epoch": 1.1342155009451795, "grad_norm": 0.4805261194705963, "learning_rate": 0.0003, "loss": 1.7006, "step": 600 }, { "epoch": 1.3232514177693762, "grad_norm": 0.4189806878566742, "learning_rate": 0.0003, "loss": 1.6276, "step": 700 }, { "epoch": 1.5122873345935728, "grad_norm": 0.4777927100658417, "learning_rate": 0.0003, "loss": 1.6294, "step": 800 }, { "epoch": 1.7013232514177694, "grad_norm": 0.4425320327281952, "learning_rate": 0.0003, "loss": 1.6196, "step": 900 }, { "epoch": 1.8903591682419658, "grad_norm": 0.5979380011558533, "learning_rate": 0.0003, "loss": 1.6451, "step": 1000 }, { "epoch": 2.0, "eval_accuracy": 0.5745641025641026, "eval_loss": 1.635698676109314, "eval_runtime": 5.9951, "eval_samples_per_second": 83.401, "eval_steps_per_second": 10.509, "step": 1058 }, { "epoch": 2.0793950850661624, "grad_norm": 0.5178828239440918, "learning_rate": 0.0003, "loss": 1.514, "step": 1100 }, { "epoch": 2.268431001890359, "grad_norm": 0.7963550090789795, "learning_rate": 0.0003, "loss": 1.3597, "step": 1200 }, { "epoch": 2.4574669187145557, "grad_norm": 0.7210434675216675, "learning_rate": 0.0003, "loss": 1.3774, "step": 1300 }, { "epoch": 2.6465028355387523, "grad_norm": 0.6484540700912476, "learning_rate": 0.0003, "loss": 1.374, "step": 1400 }, { "epoch": 2.835538752362949, "grad_norm": 0.6798349618911743, "learning_rate": 0.0003, "loss": 1.3703, "step": 1500 }, { "epoch": 3.0, "eval_accuracy": 0.5716410256410256, "eval_loss": 1.7677161693572998, "eval_runtime": 5.9596, "eval_samples_per_second": 83.899, "eval_steps_per_second": 10.571, "step": 1587 }, { "epoch": 3.0245746691871456, "grad_norm": 0.6527106761932373, "learning_rate": 0.0003, "loss": 1.3419, "step": 1600 }, { "epoch": 3.213610586011342, "grad_norm": 0.6613947749137878, "learning_rate": 0.0003, "loss": 1.0932, "step": 1700 }, { "epoch": 3.402646502835539, "grad_norm": 0.7362242341041565, "learning_rate": 0.0003, "loss": 1.1188, "step": 1800 }, { "epoch": 3.5916824196597354, "grad_norm": 0.6629425287246704, "learning_rate": 0.0003, "loss": 1.142, "step": 1900 }, { "epoch": 3.780718336483932, "grad_norm": 0.789070725440979, "learning_rate": 0.0003, "loss": 1.1661, "step": 2000 }, { "epoch": 3.9697542533081287, "grad_norm": 0.7567113637924194, "learning_rate": 0.0003, "loss": 1.1817, "step": 2100 }, { "epoch": 4.0, "eval_accuracy": 0.5718461538461539, "eval_loss": 1.8587489128112793, "eval_runtime": 5.7528, "eval_samples_per_second": 86.914, "eval_steps_per_second": 10.951, "step": 2116 }, { "epoch": 4.158790170132325, 
"grad_norm": 0.7041072249412537, "learning_rate": 0.0003, "loss": 0.9335, "step": 2200 }, { "epoch": 4.3478260869565215, "grad_norm": 0.7739288210868835, "learning_rate": 0.0003, "loss": 0.9257, "step": 2300 }, { "epoch": 4.536862003780718, "grad_norm": 0.9699936509132385, "learning_rate": 0.0003, "loss": 0.9234, "step": 2400 }, { "epoch": 4.725897920604915, "grad_norm": 0.8464515209197998, "learning_rate": 0.0003, "loss": 0.9547, "step": 2500 }, { "epoch": 4.914933837429111, "grad_norm": 0.7238239049911499, "learning_rate": 0.0003, "loss": 0.9674, "step": 2600 }, { "epoch": 5.0, "eval_accuracy": 0.5712820512820512, "eval_loss": 1.931915283203125, "eval_runtime": 5.7956, "eval_samples_per_second": 86.272, "eval_steps_per_second": 10.87, "step": 2645 }, { "epoch": 5.103969754253308, "grad_norm": 0.8031799793243408, "learning_rate": 0.0003, "loss": 0.8353, "step": 2700 }, { "epoch": 5.293005671077505, "grad_norm": 0.862354576587677, "learning_rate": 0.0003, "loss": 0.731, "step": 2800 }, { "epoch": 5.482041587901701, "grad_norm": 0.9067243337631226, "learning_rate": 0.0003, "loss": 0.7515, "step": 2900 }, { "epoch": 5.671077504725898, "grad_norm": 0.9791676998138428, "learning_rate": 0.0003, "loss": 0.7769, "step": 3000 }, { "epoch": 5.8601134215500945, "grad_norm": 0.9806828498840332, "learning_rate": 0.0003, "loss": 0.7936, "step": 3100 }, { "epoch": 6.0, "eval_accuracy": 0.5704102564102564, "eval_loss": 1.993375301361084, "eval_runtime": 5.8166, "eval_samples_per_second": 85.96, "eval_steps_per_second": 10.831, "step": 3174 }, { "epoch": 6.049149338374291, "grad_norm": 0.8496273159980774, "learning_rate": 0.0003, "loss": 0.749, "step": 3200 }, { "epoch": 6.238185255198488, "grad_norm": 0.8800780177116394, "learning_rate": 0.0003, "loss": 0.5822, "step": 3300 }, { "epoch": 6.427221172022684, "grad_norm": 0.8136658668518066, "learning_rate": 0.0003, "loss": 0.6114, "step": 3400 }, { "epoch": 6.616257088846881, "grad_norm": 1.0112674236297607, "learning_rate": 0.0003, "loss": 0.6329, "step": 3500 }, { "epoch": 6.805293005671078, "grad_norm": 0.8546850681304932, "learning_rate": 0.0003, "loss": 0.6499, "step": 3600 }, { "epoch": 6.994328922495274, "grad_norm": 0.947127640247345, "learning_rate": 0.0003, "loss": 0.67, "step": 3700 }, { "epoch": 7.0, "eval_accuracy": 0.5683589743589743, "eval_loss": 2.046682357788086, "eval_runtime": 5.9678, "eval_samples_per_second": 83.784, "eval_steps_per_second": 10.557, "step": 3703 }, { "epoch": 7.183364839319471, "grad_norm": 1.0099776983261108, "learning_rate": 0.0003, "loss": 0.477, "step": 3800 }, { "epoch": 7.3724007561436675, "grad_norm": 0.8788864016532898, "learning_rate": 0.0003, "loss": 0.4951, "step": 3900 }, { "epoch": 7.561436672967864, "grad_norm": 0.9243162274360657, "learning_rate": 0.0003, "loss": 0.5141, "step": 4000 }, { "epoch": 7.750472589792061, "grad_norm": 1.0089187622070312, "learning_rate": 0.0003, "loss": 0.5317, "step": 4100 }, { "epoch": 7.939508506616257, "grad_norm": 0.9674586057662964, "learning_rate": 0.0003, "loss": 0.5604, "step": 4200 }, { "epoch": 8.0, "eval_accuracy": 0.5692820512820512, "eval_loss": 2.121832847595215, "eval_runtime": 5.6926, "eval_samples_per_second": 87.833, "eval_steps_per_second": 11.067, "step": 4232 }, { "epoch": 8.128544423440454, "grad_norm": 0.8500985503196716, "learning_rate": 0.0003, "loss": 0.4532, "step": 4300 }, { "epoch": 8.31758034026465, "grad_norm": 0.8404316902160645, "learning_rate": 0.0003, "loss": 0.4173, "step": 4400 }, { "epoch": 8.506616257088847, "grad_norm": 
0.9724924564361572, "learning_rate": 0.0003, "loss": 0.4386, "step": 4500 }, { "epoch": 8.695652173913043, "grad_norm": 0.9974843859672546, "learning_rate": 0.0003, "loss": 0.453, "step": 4600 }, { "epoch": 8.88468809073724, "grad_norm": 0.9835983514785767, "learning_rate": 0.0003, "loss": 0.4747, "step": 4700 }, { "epoch": 9.0, "eval_accuracy": 0.5682051282051283, "eval_loss": 2.1342432498931885, "eval_runtime": 5.6644, "eval_samples_per_second": 88.271, "eval_steps_per_second": 11.122, "step": 4761 }, { "epoch": 9.073724007561436, "grad_norm": 0.7832907438278198, "learning_rate": 0.0003, "loss": 0.4297, "step": 4800 }, { "epoch": 9.262759924385634, "grad_norm": 0.9495198130607605, "learning_rate": 0.0003, "loss": 0.3607, "step": 4900 }, { "epoch": 9.45179584120983, "grad_norm": 0.8801999688148499, "learning_rate": 0.0003, "loss": 0.3836, "step": 5000 }, { "epoch": 9.640831758034027, "grad_norm": 0.9312089681625366, "learning_rate": 0.0003, "loss": 0.4013, "step": 5100 }, { "epoch": 9.829867674858223, "grad_norm": 1.0668907165527344, "learning_rate": 0.0003, "loss": 0.4191, "step": 5200 }, { "epoch": 10.0, "eval_accuracy": 0.5673846153846154, "eval_loss": 2.1679139137268066, "eval_runtime": 5.6062, "eval_samples_per_second": 89.186, "eval_steps_per_second": 11.237, "step": 5290 }, { "epoch": 10.01890359168242, "grad_norm": 0.8078038096427917, "learning_rate": 0.0003, "loss": 0.42, "step": 5300 }, { "epoch": 10.207939508506616, "grad_norm": 0.9549182653427124, "learning_rate": 0.0003, "loss": 0.329, "step": 5400 }, { "epoch": 10.396975425330814, "grad_norm": 1.0073105096817017, "learning_rate": 0.0003, "loss": 0.3521, "step": 5500 }, { "epoch": 10.58601134215501, "grad_norm": 1.0606422424316406, "learning_rate": 0.0003, "loss": 0.3671, "step": 5600 }, { "epoch": 10.775047258979207, "grad_norm": 0.8719451427459717, "learning_rate": 0.0003, "loss": 0.3818, "step": 5700 }, { "epoch": 10.964083175803403, "grad_norm": 1.0333566665649414, "learning_rate": 0.0003, "loss": 0.3971, "step": 5800 }, { "epoch": 11.0, "eval_accuracy": 0.5657948717948718, "eval_loss": 2.2080512046813965, "eval_runtime": 5.6645, "eval_samples_per_second": 88.27, "eval_steps_per_second": 11.122, "step": 5819 }, { "epoch": 11.1531190926276, "grad_norm": 0.84638911485672, "learning_rate": 0.0003, "loss": 0.3323, "step": 5900 }, { "epoch": 11.342155009451796, "grad_norm": 0.8573931455612183, "learning_rate": 0.0003, "loss": 0.3304, "step": 6000 }, { "epoch": 11.531190926275993, "grad_norm": 1.0611106157302856, "learning_rate": 0.0003, "loss": 0.3432, "step": 6100 }, { "epoch": 11.720226843100189, "grad_norm": 0.8618783950805664, "learning_rate": 0.0003, "loss": 0.3601, "step": 6200 }, { "epoch": 11.909262759924385, "grad_norm": 1.0319652557373047, "learning_rate": 0.0003, "loss": 0.3753, "step": 6300 }, { "epoch": 12.0, "eval_accuracy": 0.5663589743589743, "eval_loss": 2.184002637863159, "eval_runtime": 5.7032, "eval_samples_per_second": 87.67, "eval_steps_per_second": 11.046, "step": 6348 }, { "epoch": 12.098298676748582, "grad_norm": 0.9860585331916809, "learning_rate": 0.0003, "loss": 0.3454, "step": 6400 }, { "epoch": 12.287334593572778, "grad_norm": 0.8168879747390747, "learning_rate": 0.0003, "loss": 0.317, "step": 6500 }, { "epoch": 12.476370510396976, "grad_norm": 0.8994132876396179, "learning_rate": 0.0003, "loss": 0.3286, "step": 6600 }, { "epoch": 12.665406427221171, "grad_norm": 0.9378811120986938, "learning_rate": 0.0003, "loss": 0.34, "step": 6700 }, { "epoch": 12.854442344045369, "grad_norm": 
0.8742515444755554, "learning_rate": 0.0003, "loss": 0.3571, "step": 6800 }, { "epoch": 13.0, "eval_accuracy": 0.5633846153846154, "eval_loss": 2.232431411743164, "eval_runtime": 5.6334, "eval_samples_per_second": 88.756, "eval_steps_per_second": 11.183, "step": 6877 }, { "epoch": 13.043478260869565, "grad_norm": 0.8000278472900391, "learning_rate": 0.0003, "loss": 0.3518, "step": 6900 }, { "epoch": 13.232514177693762, "grad_norm": 0.7633320689201355, "learning_rate": 0.0003, "loss": 0.3046, "step": 7000 }, { "epoch": 13.421550094517958, "grad_norm": 0.9149838089942932, "learning_rate": 0.0003, "loss": 0.3217, "step": 7100 }, { "epoch": 13.610586011342155, "grad_norm": 0.9108119010925293, "learning_rate": 0.0003, "loss": 0.3315, "step": 7200 }, { "epoch": 13.799621928166351, "grad_norm": 0.8181828260421753, "learning_rate": 0.0003, "loss": 0.3454, "step": 7300 }, { "epoch": 13.988657844990549, "grad_norm": 0.9064919352531433, "learning_rate": 0.0003, "loss": 0.3526, "step": 7400 }, { "epoch": 14.0, "eval_accuracy": 0.5631794871794872, "eval_loss": 2.2189676761627197, "eval_runtime": 5.8178, "eval_samples_per_second": 85.943, "eval_steps_per_second": 10.829, "step": 7406 }, { "epoch": 14.177693761814744, "grad_norm": 0.8295121788978577, "learning_rate": 0.0003, "loss": 0.3011, "step": 7500 }, { "epoch": 14.366729678638942, "grad_norm": 0.78739994764328, "learning_rate": 0.0003, "loss": 0.3091, "step": 7600 }, { "epoch": 14.555765595463138, "grad_norm": 0.9230947494506836, "learning_rate": 0.0003, "loss": 0.3178, "step": 7700 }, { "epoch": 14.744801512287335, "grad_norm": 0.8687440752983093, "learning_rate": 0.0003, "loss": 0.3298, "step": 7800 }, { "epoch": 14.93383742911153, "grad_norm": 0.7883646488189697, "learning_rate": 0.0003, "loss": 0.35, "step": 7900 }, { "epoch": 15.0, "eval_accuracy": 0.5638974358974359, "eval_loss": 2.208575487136841, "eval_runtime": 5.8338, "eval_samples_per_second": 85.707, "eval_steps_per_second": 10.799, "step": 7935 }, { "epoch": 15.122873345935728, "grad_norm": 0.8580202460289001, "learning_rate": 0.0003, "loss": 0.3089, "step": 8000 }, { "epoch": 15.311909262759924, "grad_norm": 1.0185339450836182, "learning_rate": 0.0003, "loss": 0.2966, "step": 8100 }, { "epoch": 15.500945179584122, "grad_norm": 1.0962390899658203, "learning_rate": 0.0003, "loss": 0.3135, "step": 8200 }, { "epoch": 15.689981096408317, "grad_norm": 0.9543150663375854, "learning_rate": 0.0003, "loss": 0.3258, "step": 8300 }, { "epoch": 15.879017013232515, "grad_norm": 0.9073179364204407, "learning_rate": 0.0003, "loss": 0.3323, "step": 8400 }, { "epoch": 16.0, "eval_accuracy": 0.5653846153846154, "eval_loss": 2.2654531002044678, "eval_runtime": 5.7427, "eval_samples_per_second": 87.067, "eval_steps_per_second": 10.97, "step": 8464 }, { "epoch": 16.068052930056712, "grad_norm": 0.9761775135993958, "learning_rate": 0.0003, "loss": 0.3298, "step": 8500 }, { "epoch": 16.257088846880908, "grad_norm": 0.8946503400802612, "learning_rate": 0.0003, "loss": 0.2945, "step": 8600 }, { "epoch": 16.446124763705104, "grad_norm": 0.8116426467895508, "learning_rate": 0.0003, "loss": 0.3051, "step": 8700 }, { "epoch": 16.6351606805293, "grad_norm": 0.8611814379692078, "learning_rate": 0.0003, "loss": 0.3145, "step": 8800 }, { "epoch": 16.8241965973535, "grad_norm": 0.9202536940574646, "learning_rate": 0.0003, "loss": 0.3281, "step": 8900 }, { "epoch": 17.0, "eval_accuracy": 0.5667179487179487, "eval_loss": 2.244356870651245, "eval_runtime": 5.9955, "eval_samples_per_second": 83.396, 
"eval_steps_per_second": 10.508, "step": 8993 }, { "epoch": 17.013232514177695, "grad_norm": 0.7778891324996948, "learning_rate": 0.0003, "loss": 0.3344, "step": 9000 }, { "epoch": 17.20226843100189, "grad_norm": 0.7352868914604187, "learning_rate": 0.0003, "loss": 0.2843, "step": 9100 }, { "epoch": 17.391304347826086, "grad_norm": 0.7922298908233643, "learning_rate": 0.0003, "loss": 0.2941, "step": 9200 }, { "epoch": 17.58034026465028, "grad_norm": 1.0191751718521118, "learning_rate": 0.0003, "loss": 0.3164, "step": 9300 }, { "epoch": 17.76937618147448, "grad_norm": 1.0436335802078247, "learning_rate": 0.0003, "loss": 0.321, "step": 9400 }, { "epoch": 17.958412098298677, "grad_norm": 0.8792089223861694, "learning_rate": 0.0003, "loss": 0.3328, "step": 9500 }, { "epoch": 18.0, "eval_accuracy": 0.5626153846153846, "eval_loss": 2.2596943378448486, "eval_runtime": 6.0856, "eval_samples_per_second": 82.161, "eval_steps_per_second": 10.352, "step": 9522 }, { "epoch": 18.147448015122873, "grad_norm": 0.6146367788314819, "learning_rate": 0.0003, "loss": 0.2967, "step": 9600 }, { "epoch": 18.33648393194707, "grad_norm": 0.8163793683052063, "learning_rate": 0.0003, "loss": 0.2958, "step": 9700 }, { "epoch": 18.525519848771268, "grad_norm": 0.8955701589584351, "learning_rate": 0.0003, "loss": 0.3065, "step": 9800 }, { "epoch": 18.714555765595463, "grad_norm": 0.7934162020683289, "learning_rate": 0.0003, "loss": 0.3121, "step": 9900 }, { "epoch": 18.90359168241966, "grad_norm": 1.07945716381073, "learning_rate": 0.0003, "loss": 0.3305, "step": 10000 }, { "epoch": 19.0, "eval_accuracy": 0.5633333333333334, "eval_loss": 2.2682220935821533, "eval_runtime": 5.8794, "eval_samples_per_second": 85.042, "eval_steps_per_second": 10.715, "step": 10051 }, { "epoch": 19.092627599243855, "grad_norm": 0.694956362247467, "learning_rate": 0.0003, "loss": 0.3069, "step": 10100 }, { "epoch": 19.281663516068054, "grad_norm": 0.8144003748893738, "learning_rate": 0.0003, "loss": 0.2879, "step": 10200 }, { "epoch": 19.47069943289225, "grad_norm": 0.7943779826164246, "learning_rate": 0.0003, "loss": 0.3002, "step": 10300 }, { "epoch": 19.659735349716446, "grad_norm": 0.7781754732131958, "learning_rate": 0.0003, "loss": 0.3107, "step": 10400 }, { "epoch": 19.84877126654064, "grad_norm": 0.7696000933647156, "learning_rate": 0.0003, "loss": 0.3228, "step": 10500 }, { "epoch": 20.0, "eval_accuracy": 0.5640512820512821, "eval_loss": 2.253227472305298, "eval_runtime": 5.7574, "eval_samples_per_second": 86.845, "eval_steps_per_second": 10.943, "step": 10580 }, { "epoch": 20.0, "step": 10580, "total_flos": 6.517631969856061e+17, "train_loss": 0.6476908660340625, "train_runtime": 24281.4349, "train_samples_per_second": 13.941, "train_steps_per_second": 0.436 } ], "logging_steps": 100, "max_steps": 10580, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "total_flos": 6.517631969856061e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }