{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.4, "grad_norm": 0.1675417125225067, "learning_rate": 3e-05, "loss": 1.5895, "step": 100 }, { "epoch": 0.8, "grad_norm": 0.1831749528646469, "learning_rate": 3e-05, "loss": 1.4742, "step": 200 }, { "epoch": 1.0, "eval_accuracy": 0.6748151382823872, "eval_loss": 1.531341314315796, "eval_runtime": 9.1549, "eval_samples_per_second": 54.616, "eval_steps_per_second": 6.882, "step": 250 }, { "epoch": 1.2, "grad_norm": 0.22951605916023254, "learning_rate": 3e-05, "loss": 1.4652, "step": 300 }, { "epoch": 1.6, "grad_norm": 0.23647047579288483, "learning_rate": 3e-05, "loss": 1.4601, "step": 400 }, { "epoch": 2.0, "grad_norm": 0.2248772829771042, "learning_rate": 3e-05, "loss": 1.45, "step": 500 }, { "epoch": 2.0, "eval_accuracy": 0.6756593886462882, "eval_loss": 1.5196012258529663, "eval_runtime": 9.0923, "eval_samples_per_second": 54.992, "eval_steps_per_second": 6.929, "step": 500 }, { "epoch": 2.4, "grad_norm": 0.26751402020454407, "learning_rate": 3e-05, "loss": 1.4361, "step": 600 }, { "epoch": 2.8, "grad_norm": 0.29390770196914673, "learning_rate": 3e-05, "loss": 1.4269, "step": 700 }, { "epoch": 3.0, "eval_accuracy": 0.6760756914119359, "eval_loss": 1.5133788585662842, "eval_runtime": 9.1109, "eval_samples_per_second": 54.879, "eval_steps_per_second": 6.915, "step": 750 }, { "epoch": 3.2, "grad_norm": 0.3322733938694, "learning_rate": 3e-05, "loss": 1.4184, "step": 800 }, { "epoch": 3.6, "grad_norm": 0.3646068871021271, "learning_rate": 3e-05, "loss": 1.3973, "step": 900 }, { "epoch": 4.0, "grad_norm": 0.4150474965572357, "learning_rate": 3e-05, "loss": 1.3999, "step": 1000 }, { "epoch": 4.0, "eval_accuracy": 0.6762474526928676, "eval_loss": 1.511995792388916, "eval_runtime": 9.0922, "eval_samples_per_second": 54.992, "eval_steps_per_second": 6.929, "step": 1000 }, { "epoch": 4.4, "grad_norm": 0.44478657841682434, "learning_rate": 3e-05, "loss": 1.3624, "step": 1100 }, { "epoch": 4.8, "grad_norm": 0.48866602778434753, "learning_rate": 3e-05, "loss": 1.3614, "step": 1200 }, { "epoch": 5.0, "eval_accuracy": 0.6759650655021834, "eval_loss": 1.5192290544509888, "eval_runtime": 9.1143, "eval_samples_per_second": 54.859, "eval_steps_per_second": 6.912, "step": 1250 }, { "epoch": 5.2, "grad_norm": 0.5195505023002625, "learning_rate": 3e-05, "loss": 1.3431, "step": 1300 }, { "epoch": 5.6, "grad_norm": 0.5769343972206116, "learning_rate": 3e-05, "loss": 1.3264, "step": 1400 }, { "epoch": 6.0, "grad_norm": 0.5181849598884583, "learning_rate": 3e-05, "loss": 1.3303, "step": 1500 }, { "epoch": 6.0, "eval_accuracy": 0.6755080058224163, "eval_loss": 1.5265752077102661, "eval_runtime": 9.1234, "eval_samples_per_second": 54.804, "eval_steps_per_second": 6.905, "step": 1500 }, { "epoch": 6.4, "grad_norm": 0.6372528076171875, "learning_rate": 3e-05, "loss": 1.2883, "step": 1600 }, { "epoch": 6.8, "grad_norm": 0.6501044034957886, "learning_rate": 3e-05, "loss": 1.2946, "step": 1700 }, { "epoch": 7.0, "eval_accuracy": 0.6747045123726346, "eval_loss": 1.5446096658706665, "eval_runtime": 9.0797, "eval_samples_per_second": 55.068, "eval_steps_per_second": 6.939, "step": 1750 }, { "epoch": 7.2, "grad_norm": 0.7209816575050354, "learning_rate": 3e-05, "loss": 1.2705, "step": 1800 }, { "epoch": 7.6, "grad_norm": 0.7495877742767334, "learning_rate": 3e-05, "loss": 1.2498, "step": 1900 }, { "epoch": 8.0, "grad_norm": 0.681526780128479, "learning_rate": 3e-05, "loss": 1.2518, "step": 2000 }, { "epoch": 8.0, "eval_accuracy": 0.6745036390101893, "eval_loss": 1.5590205192565918, "eval_runtime": 8.0597, "eval_samples_per_second": 62.037, "eval_steps_per_second": 7.817, "step": 2000 }, { "epoch": 8.4, "grad_norm": 0.7470565438270569, "learning_rate": 3e-05, "loss": 1.2196, "step": 2100 }, { "epoch": 8.8, "grad_norm": 0.7745229005813599, "learning_rate": 3e-05, "loss": 1.2082, "step": 2200 }, { "epoch": 9.0, "eval_accuracy": 0.6740349344978166, "eval_loss": 1.571682333946228, "eval_runtime": 9.0961, "eval_samples_per_second": 54.969, "eval_steps_per_second": 6.926, "step": 2250 }, { "epoch": 9.2, "grad_norm": 0.8478706479072571, "learning_rate": 3e-05, "loss": 1.2017, "step": 2300 }, { "epoch": 9.6, "grad_norm": 0.9340612292289734, "learning_rate": 3e-05, "loss": 1.1742, "step": 2400 }, { "epoch": 10.0, "grad_norm": 0.9207039475440979, "learning_rate": 3e-05, "loss": 1.19, "step": 2500 }, { "epoch": 10.0, "eval_accuracy": 0.6727074235807861, "eval_loss": 1.6021865606307983, "eval_runtime": 9.0984, "eval_samples_per_second": 54.955, "eval_steps_per_second": 6.924, "step": 2500 }, { "epoch": 10.4, "grad_norm": 0.9244349598884583, "learning_rate": 3e-05, "loss": 1.1299, "step": 2600 }, { "epoch": 10.8, "grad_norm": 0.9110805988311768, "learning_rate": 3e-05, "loss": 1.1523, "step": 2700 }, { "epoch": 11.0, "eval_accuracy": 0.672608442503639, "eval_loss": 1.6098225116729736, "eval_runtime": 9.117, "eval_samples_per_second": 54.843, "eval_steps_per_second": 6.91, "step": 2750 }, { "epoch": 11.2, "grad_norm": 0.916287899017334, "learning_rate": 3e-05, "loss": 1.1278, "step": 2800 }, { "epoch": 11.6, "grad_norm": 1.0008416175842285, "learning_rate": 3e-05, "loss": 1.0981, "step": 2900 }, { "epoch": 12.0, "grad_norm": 0.9763438701629639, "learning_rate": 3e-05, "loss": 1.1193, "step": 3000 }, { "epoch": 12.0, "eval_accuracy": 0.671589519650655, "eval_loss": 1.6344681978225708, "eval_runtime": 9.1836, "eval_samples_per_second": 54.445, "eval_steps_per_second": 6.86, "step": 3000 }, { "epoch": 12.4, "grad_norm": 1.0682412385940552, "learning_rate": 3e-05, "loss": 1.0604, "step": 3100 }, { "epoch": 12.8, "grad_norm": 1.1306017637252808, "learning_rate": 3e-05, "loss": 1.0736, "step": 3200 }, { "epoch": 13.0, "eval_accuracy": 0.6707016011644833, "eval_loss": 1.674833059310913, "eval_runtime": 8.098, "eval_samples_per_second": 61.744, "eval_steps_per_second": 7.78, "step": 3250 }, { "epoch": 13.2, "grad_norm": 1.154135823249817, "learning_rate": 3e-05, "loss": 1.054, "step": 3300 }, { "epoch": 13.6, "grad_norm": 1.1169854402542114, "learning_rate": 3e-05, "loss": 1.0253, "step": 3400 }, { "epoch": 14.0, "grad_norm": 1.0877137184143066, "learning_rate": 3e-05, "loss": 1.0414, "step": 3500 }, { "epoch": 14.0, "eval_accuracy": 0.6701280931586608, "eval_loss": 1.688016653060913, "eval_runtime": 9.1712, "eval_samples_per_second": 54.519, "eval_steps_per_second": 6.869, "step": 3500 }, { "epoch": 14.4, "grad_norm": 1.2245118618011475, "learning_rate": 3e-05, "loss": 0.9823, "step": 3600 }, { "epoch": 14.8, "grad_norm": 1.2784464359283447, "learning_rate": 3e-05, "loss": 1.0069, "step": 3700 }, { "epoch": 15.0, "eval_accuracy": 0.6693682678311499, "eval_loss": 1.7182435989379883, "eval_runtime": 9.1433, "eval_samples_per_second": 54.685, "eval_steps_per_second": 6.89, "step": 3750 }, { "epoch": 15.2, "grad_norm": 1.182626724243164, "learning_rate": 3e-05, "loss": 0.9834, "step": 3800 }, { "epoch": 15.6, "grad_norm": 1.3419315814971924, "learning_rate": 3e-05, "loss": 0.9608, "step": 3900 }, { "epoch": 16.0, "grad_norm": 1.2352997064590454, "learning_rate": 3e-05, "loss": 0.9654, "step": 4000 }, { "epoch": 16.0, "eval_accuracy": 0.6685036390101893, "eval_loss": 1.7521902322769165, "eval_runtime": 9.1045, "eval_samples_per_second": 54.918, "eval_steps_per_second": 6.92, "step": 4000 }, { "epoch": 16.4, "grad_norm": 1.439382791519165, "learning_rate": 3e-05, "loss": 0.9236, "step": 4100 }, { "epoch": 16.8, "grad_norm": 1.3681950569152832, "learning_rate": 3e-05, "loss": 0.9337, "step": 4200 }, { "epoch": 17.0, "eval_accuracy": 0.6677409024745269, "eval_loss": 1.7825894355773926, "eval_runtime": 9.1188, "eval_samples_per_second": 54.832, "eval_steps_per_second": 6.909, "step": 4250 }, { "epoch": 17.2, "grad_norm": 1.417385458946228, "learning_rate": 3e-05, "loss": 0.9135, "step": 4300 }, { "epoch": 17.6, "grad_norm": 1.5673705339431763, "learning_rate": 3e-05, "loss": 0.8941, "step": 4400 }, { "epoch": 18.0, "grad_norm": 1.5578211545944214, "learning_rate": 3e-05, "loss": 0.9, "step": 4500 }, { "epoch": 18.0, "eval_accuracy": 0.6671615720524018, "eval_loss": 1.8080195188522339, "eval_runtime": 8.0689, "eval_samples_per_second": 61.967, "eval_steps_per_second": 7.808, "step": 4500 }, { "epoch": 18.4, "grad_norm": 1.671644687652588, "learning_rate": 3e-05, "loss": 0.8522, "step": 4600 }, { "epoch": 18.8, "grad_norm": 1.5266335010528564, "learning_rate": 3e-05, "loss": 0.8704, "step": 4700 }, { "epoch": 19.0, "eval_accuracy": 0.6663289665211063, "eval_loss": 1.8349848985671997, "eval_runtime": 9.1076, "eval_samples_per_second": 54.899, "eval_steps_per_second": 6.917, "step": 4750 }, { "epoch": 19.2, "grad_norm": 1.5969411134719849, "learning_rate": 3e-05, "loss": 0.8469, "step": 4800 }, { "epoch": 19.6, "grad_norm": 1.5390815734863281, "learning_rate": 3e-05, "loss": 0.8398, "step": 4900 }, { "epoch": 20.0, "grad_norm": 1.5651904344558716, "learning_rate": 3e-05, "loss": 0.8407, "step": 5000 }, { "epoch": 20.0, "eval_accuracy": 0.6657583697234353, "eval_loss": 1.8696147203445435, "eval_runtime": 9.1023, "eval_samples_per_second": 54.931, "eval_steps_per_second": 6.921, "step": 5000 }, { "epoch": 20.0, "step": 5000, "total_flos": 9.221411586147615e+17, "train_loss": 1.163707649230957, "train_runtime": 11749.7477, "train_samples_per_second": 13.617, "train_steps_per_second": 0.426 } ], "logging_steps": 100, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "total_flos": 9.221411586147615e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }