{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.4, "grad_norm": 0.46826276183128357, "learning_rate": 0.0003, "loss": 2.3159, "step": 100 }, { "epoch": 0.8, "grad_norm": 0.45000985264778137, "learning_rate": 0.0003, "loss": 2.248, "step": 200 }, { "epoch": 1.0, "eval_accuracy": 0.5172698412698413, "eval_loss": 2.3110275268554688, "eval_runtime": 6.3254, "eval_samples_per_second": 79.047, "eval_steps_per_second": 9.96, "step": 250 }, { "epoch": 1.2, "grad_norm": 0.6657848954200745, "learning_rate": 0.0003, "loss": 2.0829, "step": 300 }, { "epoch": 1.6, "grad_norm": 0.7272578477859497, "learning_rate": 0.0003, "loss": 1.9262, "step": 400 }, { "epoch": 2.0, "grad_norm": 0.6784635186195374, "learning_rate": 0.0003, "loss": 1.9103, "step": 500 }, { "epoch": 2.0, "eval_accuracy": 0.5157142857142857, "eval_loss": 2.373993396759033, "eval_runtime": 6.0749, "eval_samples_per_second": 82.306, "eval_steps_per_second": 10.371, "step": 500 }, { "epoch": 2.4, "grad_norm": 0.8736333250999451, "learning_rate": 0.0003, "loss": 1.4576, "step": 600 }, { "epoch": 2.8, "grad_norm": 0.8340785503387451, "learning_rate": 0.0003, "loss": 1.4896, "step": 700 }, { "epoch": 3.0, "eval_accuracy": 0.5112063492063492, "eval_loss": 2.526599645614624, "eval_runtime": 6.302, "eval_samples_per_second": 79.34, "eval_steps_per_second": 9.997, "step": 750 }, { "epoch": 3.2, "grad_norm": 1.0462632179260254, "learning_rate": 0.0003, "loss": 1.2816, "step": 800 }, { "epoch": 3.6, "grad_norm": 1.0726990699768066, "learning_rate": 0.0003, "loss": 1.069, "step": 900 }, { "epoch": 4.0, "grad_norm": 1.2521560192108154, "learning_rate": 0.0003, "loss": 1.109, "step": 1000 }, { "epoch": 4.0, "eval_accuracy": 0.5036507936507937, "eval_loss": 2.7829766273498535, "eval_runtime": 5.9324, "eval_samples_per_second": 84.283, "eval_steps_per_second": 10.62, "step": 1000 }, { "epoch": 4.4, "grad_norm": 1.1222484111785889, "learning_rate": 0.0003, "loss": 0.7303, "step": 1100 }, { "epoch": 4.8, "grad_norm": 1.209559679031372, "learning_rate": 0.0003, "loss": 0.7757, "step": 1200 }, { "epoch": 5.0, "eval_accuracy": 0.49873015873015875, "eval_loss": 3.0311408042907715, "eval_runtime": 6.2342, "eval_samples_per_second": 80.203, "eval_steps_per_second": 10.106, "step": 1250 }, { "epoch": 5.2, "grad_norm": 0.887942373752594, "learning_rate": 0.0003, "loss": 0.6704, "step": 1300 }, { "epoch": 5.6, "grad_norm": 1.0530667304992676, "learning_rate": 0.0003, "loss": 0.5699, "step": 1400 }, { "epoch": 6.0, "grad_norm": 1.0920815467834473, "learning_rate": 0.0003, "loss": 0.5994, "step": 1500 }, { "epoch": 6.0, "eval_accuracy": 0.49787301587301586, "eval_loss": 3.2255795001983643, "eval_runtime": 6.1133, "eval_samples_per_second": 81.789, "eval_steps_per_second": 10.305, "step": 1500 }, { "epoch": 6.4, "grad_norm": 1.010949969291687, "learning_rate": 0.0003, "loss": 0.4637, "step": 1600 }, { "epoch": 6.8, "grad_norm": 0.9567492604255676, "learning_rate": 0.0003, "loss": 0.4921, "step": 1700 }, { "epoch": 7.0, "eval_accuracy": 0.49577777777777776, "eval_loss": 3.351687431335449, "eval_runtime": 5.7924, "eval_samples_per_second": 86.32, "eval_steps_per_second": 10.876, "step": 1750 }, { "epoch": 7.2, "grad_norm": 1.059217929840088, "learning_rate": 0.0003, "loss": 0.4599, "step": 1800 }, { "epoch": 7.6, "grad_norm": 0.8017446994781494, "learning_rate": 0.0003, "loss": 0.4409, "step": 1900 }, { "epoch": 8.0, "grad_norm": 0.9800388813018799, "learning_rate": 0.0003, "loss": 0.4575, "step": 2000 }, { "epoch": 8.0, "eval_accuracy": 0.4946031746031746, "eval_loss": 3.4320859909057617, "eval_runtime": 6.1689, "eval_samples_per_second": 81.051, "eval_steps_per_second": 10.212, "step": 2000 }, { "epoch": 8.4, "grad_norm": 0.6948591470718384, "learning_rate": 0.0003, "loss": 0.4023, "step": 2100 }, { "epoch": 8.8, "grad_norm": 0.6963217258453369, "learning_rate": 0.0003, "loss": 0.4233, "step": 2200 }, { "epoch": 9.0, "eval_accuracy": 0.49612698412698414, "eval_loss": 3.5150551795959473, "eval_runtime": 5.8039, "eval_samples_per_second": 86.148, "eval_steps_per_second": 10.855, "step": 2250 }, { "epoch": 9.2, "grad_norm": 0.7925310730934143, "learning_rate": 0.0003, "loss": 0.4133, "step": 2300 }, { "epoch": 9.6, "grad_norm": 0.680012047290802, "learning_rate": 0.0003, "loss": 0.3997, "step": 2400 }, { "epoch": 10.0, "grad_norm": 0.6718311905860901, "learning_rate": 0.0003, "loss": 0.4178, "step": 2500 }, { "epoch": 10.0, "eval_accuracy": 0.49498412698412697, "eval_loss": 3.5280325412750244, "eval_runtime": 6.0021, "eval_samples_per_second": 83.304, "eval_steps_per_second": 10.496, "step": 2500 }, { "epoch": 10.4, "grad_norm": 0.5879548192024231, "learning_rate": 0.0003, "loss": 0.3814, "step": 2600 }, { "epoch": 10.8, "grad_norm": 0.8837707042694092, "learning_rate": 0.0003, "loss": 0.3987, "step": 2700 }, { "epoch": 11.0, "eval_accuracy": 0.49507936507936506, "eval_loss": 3.5547120571136475, "eval_runtime": 5.9642, "eval_samples_per_second": 83.833, "eval_steps_per_second": 10.563, "step": 2750 }, { "epoch": 11.2, "grad_norm": 0.7036486268043518, "learning_rate": 0.0003, "loss": 0.3947, "step": 2800 }, { "epoch": 11.6, "grad_norm": 0.7022255063056946, "learning_rate": 0.0003, "loss": 0.3886, "step": 2900 }, { "epoch": 12.0, "grad_norm": 0.6789013743400574, "learning_rate": 0.0003, "loss": 0.4033, "step": 3000 }, { "epoch": 12.0, "eval_accuracy": 0.49542857142857144, "eval_loss": 3.560084581375122, "eval_runtime": 6.1898, "eval_samples_per_second": 80.779, "eval_steps_per_second": 10.178, "step": 3000 }, { "epoch": 12.4, "grad_norm": 0.7532069683074951, "learning_rate": 0.0003, "loss": 0.3719, "step": 3100 }, { "epoch": 12.8, "grad_norm": 0.5012597441673279, "learning_rate": 0.0003, "loss": 0.3932, "step": 3200 }, { "epoch": 13.0, "eval_accuracy": 0.4932063492063492, "eval_loss": 3.585875988006592, "eval_runtime": 6.3897, "eval_samples_per_second": 78.251, "eval_steps_per_second": 9.86, "step": 3250 }, { "epoch": 13.2, "grad_norm": 0.6146650910377502, "learning_rate": 0.0003, "loss": 0.3876, "step": 3300 }, { "epoch": 13.6, "grad_norm": 0.6754132509231567, "learning_rate": 0.0003, "loss": 0.382, "step": 3400 }, { "epoch": 14.0, "grad_norm": 0.974744439125061, "learning_rate": 0.0003, "loss": 0.4012, "step": 3500 }, { "epoch": 14.0, "eval_accuracy": 0.49266666666666664, "eval_loss": 3.594428777694702, "eval_runtime": 6.1257, "eval_samples_per_second": 81.623, "eval_steps_per_second": 10.285, "step": 3500 }, { "epoch": 14.4, "grad_norm": 0.6026502847671509, "learning_rate": 0.0003, "loss": 0.3754, "step": 3600 }, { "epoch": 14.8, "grad_norm": 0.8706900477409363, "learning_rate": 0.0003, "loss": 0.3895, "step": 3700 }, { "epoch": 15.0, "eval_accuracy": 0.4939047619047619, "eval_loss": 3.6037862300872803, "eval_runtime": 6.314, "eval_samples_per_second": 79.189, "eval_steps_per_second": 9.978, "step": 3750 }, { "epoch": 15.2, "grad_norm": 0.7156828045845032, "learning_rate": 0.0003, "loss": 0.385, "step": 3800 }, { "epoch": 15.6, "grad_norm": 0.8976225256919861, "learning_rate": 0.0003, "loss": 0.3792, "step": 3900 }, { "epoch": 16.0, "grad_norm": 0.765393853187561, "learning_rate": 0.0003, "loss": 0.396, "step": 4000 }, { "epoch": 16.0, "eval_accuracy": 0.49323809523809525, "eval_loss": 3.650381326675415, "eval_runtime": 5.7803, "eval_samples_per_second": 86.501, "eval_steps_per_second": 10.899, "step": 4000 }, { "epoch": 16.4, "grad_norm": 0.5230117440223694, "learning_rate": 0.0003, "loss": 0.3683, "step": 4100 }, { "epoch": 16.8, "grad_norm": 0.676298201084137, "learning_rate": 0.0003, "loss": 0.3847, "step": 4200 }, { "epoch": 17.0, "eval_accuracy": 0.49117460317460315, "eval_loss": 3.66017746925354, "eval_runtime": 6.0862, "eval_samples_per_second": 82.154, "eval_steps_per_second": 10.351, "step": 4250 }, { "epoch": 17.2, "grad_norm": 0.8566504120826721, "learning_rate": 0.0003, "loss": 0.3836, "step": 4300 }, { "epoch": 17.6, "grad_norm": 0.8317088484764099, "learning_rate": 0.0003, "loss": 0.376, "step": 4400 }, { "epoch": 18.0, "grad_norm": 0.5270309448242188, "learning_rate": 0.0003, "loss": 0.3942, "step": 4500 }, { "epoch": 18.0, "eval_accuracy": 0.49142857142857144, "eval_loss": 3.651546001434326, "eval_runtime": 6.2927, "eval_samples_per_second": 79.458, "eval_steps_per_second": 10.012, "step": 4500 }, { "epoch": 18.4, "grad_norm": 0.587833821773529, "learning_rate": 0.0003, "loss": 0.3648, "step": 4600 }, { "epoch": 18.8, "grad_norm": 0.7488328814506531, "learning_rate": 0.0003, "loss": 0.3809, "step": 4700 }, { "epoch": 19.0, "eval_accuracy": 0.49228571428571427, "eval_loss": 3.7304036617279053, "eval_runtime": 6.0944, "eval_samples_per_second": 82.043, "eval_steps_per_second": 10.337, "step": 4750 }, { "epoch": 19.2, "grad_norm": 0.4905204176902771, "learning_rate": 0.0003, "loss": 0.3715, "step": 4800 }, { "epoch": 19.6, "grad_norm": 0.6574224829673767, "learning_rate": 0.0003, "loss": 0.3701, "step": 4900 }, { "epoch": 20.0, "grad_norm": 0.7665922045707703, "learning_rate": 0.0003, "loss": 0.3805, "step": 5000 }, { "epoch": 20.0, "eval_accuracy": 0.49165079365079367, "eval_loss": 3.6823432445526123, "eval_runtime": 6.2644, "eval_samples_per_second": 79.816, "eval_steps_per_second": 10.057, "step": 5000 }, { "epoch": 20.0, "step": 5000, "total_flos": 1.293538587312128e+17, "train_loss": 0.6841725708007812, "train_runtime": 11846.7386, "train_samples_per_second": 13.506, "train_steps_per_second": 0.422 } ], "logging_steps": 100, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "total_flos": 1.293538587312128e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }