{ "best_metric": 0.7887323943661972, "best_model_checkpoint": "./model_out/checkpoint-480", "epoch": 15.737704918032787, "eval_steps": 60, "global_step": 480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.32786885245901637, "grad_norm": 3.083059549331665, "learning_rate": 9.833333333333333e-05, "loss": 1.3429, "step": 10 }, { "epoch": 0.6557377049180327, "grad_norm": 5.27333927154541, "learning_rate": 9.666666666666667e-05, "loss": 0.9792, "step": 20 }, { "epoch": 0.9836065573770492, "grad_norm": 6.412991046905518, "learning_rate": 9.5e-05, "loss": 0.8173, "step": 30 }, { "epoch": 1.3114754098360657, "grad_norm": 4.924794673919678, "learning_rate": 9.333333333333334e-05, "loss": 0.5787, "step": 40 }, { "epoch": 1.639344262295082, "grad_norm": 7.327770233154297, "learning_rate": 9.166666666666667e-05, "loss": 0.3744, "step": 50 }, { "epoch": 1.9672131147540983, "grad_norm": 8.928845405578613, "learning_rate": 9e-05, "loss": 0.4815, "step": 60 }, { "epoch": 1.9672131147540983, "eval_accuracy": 0.7897310513447433, "eval_f1": 0.6728971962616822, "eval_loss": 0.5841665267944336, "eval_precision": 0.6666666666666666, "eval_recall": 0.6792452830188679, "eval_runtime": 0.9794, "eval_samples_per_second": 61.26, "eval_steps_per_second": 8.168, "step": 60 }, { "epoch": 2.2950819672131146, "grad_norm": 5.491725444793701, "learning_rate": 8.833333333333333e-05, "loss": 0.2658, "step": 70 }, { "epoch": 2.6229508196721314, "grad_norm": 5.113138675689697, "learning_rate": 8.666666666666667e-05, "loss": 0.2402, "step": 80 }, { "epoch": 2.9508196721311473, "grad_norm": 6.449514865875244, "learning_rate": 8.5e-05, "loss": 0.1607, "step": 90 }, { "epoch": 3.278688524590164, "grad_norm": 5.699795246124268, "learning_rate": 8.333333333333334e-05, "loss": 0.0852, "step": 100 }, { "epoch": 3.6065573770491803, "grad_norm": 3.382863998413086, "learning_rate": 8.166666666666667e-05, "loss": 0.1513, "step": 110 }, { "epoch": 3.9344262295081966, "grad_norm": 2.5515336990356445, "learning_rate": 8e-05, "loss": 0.2612, "step": 120 }, { "epoch": 3.9344262295081966, "eval_accuracy": 0.8166259168704156, "eval_f1": 0.7009345794392523, "eval_loss": 0.7145981192588806, "eval_precision": 0.6944444444444444, "eval_recall": 0.7075471698113207, "eval_runtime": 0.9765, "eval_samples_per_second": 61.442, "eval_steps_per_second": 8.192, "step": 120 }, { "epoch": 4.262295081967213, "grad_norm": 3.4330668449401855, "learning_rate": 7.833333333333333e-05, "loss": 0.0857, "step": 130 }, { "epoch": 4.590163934426229, "grad_norm": 0.7570230960845947, "learning_rate": 7.666666666666667e-05, "loss": 0.0339, "step": 140 }, { "epoch": 4.918032786885246, "grad_norm": 13.973723411560059, "learning_rate": 7.500000000000001e-05, "loss": 0.0939, "step": 150 }, { "epoch": 5.245901639344262, "grad_norm": 6.072541236877441, "learning_rate": 7.333333333333333e-05, "loss": 0.0753, "step": 160 }, { "epoch": 5.573770491803279, "grad_norm": 6.689024925231934, "learning_rate": 7.166666666666667e-05, "loss": 0.0497, "step": 170 }, { "epoch": 5.901639344262295, "grad_norm": 2.9444985389709473, "learning_rate": 7e-05, "loss": 0.0535, "step": 180 }, { "epoch": 5.901639344262295, "eval_accuracy": 0.8166259168704156, "eval_f1": 0.6697247706422018, "eval_loss": 0.7874237298965454, "eval_precision": 0.6517857142857143, "eval_recall": 0.6886792452830188, "eval_runtime": 0.9781, "eval_samples_per_second": 61.346, "eval_steps_per_second": 8.179, "step": 180 }, { "epoch": 6.229508196721311, "grad_norm": 1.6225782632827759, "learning_rate": 6.833333333333333e-05, "loss": 0.0258, "step": 190 }, { "epoch": 6.557377049180328, "grad_norm": 0.07454058527946472, "learning_rate": 6.666666666666667e-05, "loss": 0.0273, "step": 200 }, { "epoch": 6.885245901639344, "grad_norm": 5.249931812286377, "learning_rate": 6.500000000000001e-05, "loss": 0.023, "step": 210 }, { "epoch": 7.213114754098361, "grad_norm": 0.14014603197574615, "learning_rate": 6.333333333333333e-05, "loss": 0.0162, "step": 220 }, { "epoch": 7.540983606557377, "grad_norm": 0.024733418598771095, "learning_rate": 6.166666666666667e-05, "loss": 0.0099, "step": 230 }, { "epoch": 7.868852459016393, "grad_norm": 1.4897031784057617, "learning_rate": 6e-05, "loss": 0.0141, "step": 240 }, { "epoch": 7.868852459016393, "eval_accuracy": 0.8557457212713936, "eval_f1": 0.780269058295964, "eval_loss": 0.8452157378196716, "eval_precision": 0.7435897435897436, "eval_recall": 0.8207547169811321, "eval_runtime": 0.9777, "eval_samples_per_second": 61.371, "eval_steps_per_second": 8.183, "step": 240 }, { "epoch": 8.19672131147541, "grad_norm": 6.675982475280762, "learning_rate": 5.833333333333334e-05, "loss": 0.0201, "step": 250 }, { "epoch": 8.524590163934427, "grad_norm": 0.21795310080051422, "learning_rate": 5.666666666666667e-05, "loss": 0.0098, "step": 260 }, { "epoch": 8.852459016393443, "grad_norm": 1.0106713771820068, "learning_rate": 5.500000000000001e-05, "loss": 0.0162, "step": 270 }, { "epoch": 9.180327868852459, "grad_norm": 1.9501103162765503, "learning_rate": 5.333333333333333e-05, "loss": 0.0166, "step": 280 }, { "epoch": 9.508196721311476, "grad_norm": 0.09368854761123657, "learning_rate": 5.166666666666667e-05, "loss": 0.0178, "step": 290 }, { "epoch": 9.836065573770492, "grad_norm": 5.705907821655273, "learning_rate": 5e-05, "loss": 0.0102, "step": 300 }, { "epoch": 9.836065573770492, "eval_accuracy": 0.8068459657701712, "eval_f1": 0.7296137339055794, "eval_loss": 1.098240613937378, "eval_precision": 0.6692913385826772, "eval_recall": 0.8018867924528302, "eval_runtime": 0.9802, "eval_samples_per_second": 61.211, "eval_steps_per_second": 8.161, "step": 300 }, { "epoch": 10.163934426229508, "grad_norm": 0.016423122957348824, "learning_rate": 4.8333333333333334e-05, "loss": 0.0048, "step": 310 }, { "epoch": 10.491803278688524, "grad_norm": 0.013950828462839127, "learning_rate": 4.666666666666667e-05, "loss": 0.0023, "step": 320 }, { "epoch": 10.819672131147541, "grad_norm": 0.0043854909017682076, "learning_rate": 4.5e-05, "loss": 0.0049, "step": 330 }, { "epoch": 11.147540983606557, "grad_norm": 0.5211315751075745, "learning_rate": 4.3333333333333334e-05, "loss": 0.0218, "step": 340 }, { "epoch": 11.475409836065573, "grad_norm": 0.0140004837885499, "learning_rate": 4.166666666666667e-05, "loss": 0.0085, "step": 350 }, { "epoch": 11.80327868852459, "grad_norm": 1.8254053592681885, "learning_rate": 4e-05, "loss": 0.0099, "step": 360 }, { "epoch": 11.80327868852459, "eval_accuracy": 0.80440097799511, "eval_f1": 0.7203791469194313, "eval_loss": 1.3635536432266235, "eval_precision": 0.7238095238095238, "eval_recall": 0.7169811320754716, "eval_runtime": 0.9823, "eval_samples_per_second": 61.081, "eval_steps_per_second": 8.144, "step": 360 }, { "epoch": 12.131147540983607, "grad_norm": 0.1549140065908432, "learning_rate": 3.8333333333333334e-05, "loss": 0.0004, "step": 370 }, { "epoch": 12.459016393442623, "grad_norm": 4.475136756896973, "learning_rate": 3.6666666666666666e-05, "loss": 0.0026, "step": 380 }, { "epoch": 12.78688524590164, "grad_norm": 11.31729507446289, "learning_rate": 3.5e-05, "loss": 0.0024, "step": 390 }, { "epoch": 13.114754098360656, "grad_norm": 0.003214745782315731, "learning_rate": 3.3333333333333335e-05, "loss": 0.0012, "step": 400 }, { "epoch": 13.442622950819672, "grad_norm": 0.9436277747154236, "learning_rate": 3.1666666666666666e-05, "loss": 0.0145, "step": 410 }, { "epoch": 13.770491803278688, "grad_norm": 5.284872055053711, "learning_rate": 3e-05, "loss": 0.0087, "step": 420 }, { "epoch": 13.770491803278688, "eval_accuracy": 0.8166259168704156, "eval_f1": 0.7751196172248804, "eval_loss": 1.346622347831726, "eval_precision": 0.7864077669902912, "eval_recall": 0.7641509433962265, "eval_runtime": 0.9819, "eval_samples_per_second": 61.108, "eval_steps_per_second": 8.148, "step": 420 }, { "epoch": 14.098360655737705, "grad_norm": 0.003362592775374651, "learning_rate": 2.8333333333333335e-05, "loss": 0.0011, "step": 430 }, { "epoch": 14.426229508196721, "grad_norm": 0.011366785503923893, "learning_rate": 2.6666666666666667e-05, "loss": 0.0003, "step": 440 }, { "epoch": 14.754098360655737, "grad_norm": 0.003698177170008421, "learning_rate": 2.5e-05, "loss": 0.0002, "step": 450 }, { "epoch": 15.081967213114755, "grad_norm": 0.0021139541640877724, "learning_rate": 2.3333333333333336e-05, "loss": 0.0047, "step": 460 }, { "epoch": 15.40983606557377, "grad_norm": 0.0036122933961451054, "learning_rate": 2.1666666666666667e-05, "loss": 0.0002, "step": 470 }, { "epoch": 15.737704918032787, "grad_norm": 0.010879104025661945, "learning_rate": 2e-05, "loss": 0.0002, "step": 480 }, { "epoch": 15.737704918032787, "eval_accuracy": 0.8288508557457213, "eval_f1": 0.7887323943661972, "eval_loss": 1.271989107131958, "eval_precision": 0.7850467289719626, "eval_recall": 0.7924528301886793, "eval_runtime": 0.9809, "eval_samples_per_second": 61.168, "eval_steps_per_second": 8.156, "step": 480 } ], "logging_steps": 10, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 60, "total_flos": 995567645030400.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }