{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.0, "global_step": 11448, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07, "learning_rate": 2.0000000000000003e-06, "loss": 87.2627, "step": 100 }, { "epoch": 0.14, "learning_rate": 4.000000000000001e-06, "loss": 52.4123, "step": 200 }, { "epoch": 0.21, "learning_rate": 6e-06, "loss": 41.3932, "step": 300 }, { "epoch": 0.28, "learning_rate": 8.000000000000001e-06, "loss": 36.7185, "step": 400 }, { "epoch": 0.35, "learning_rate": 1e-05, "loss": 33.4597, "step": 500 }, { "epoch": 0.42, "learning_rate": 1.2e-05, "loss": 31.1323, "step": 600 }, { "epoch": 0.49, "learning_rate": 1.4000000000000001e-05, "loss": 28.9204, "step": 700 }, { "epoch": 0.56, "learning_rate": 1.6000000000000003e-05, "loss": 27.0128, "step": 800 }, { "epoch": 0.63, "learning_rate": 1.8e-05, "loss": 25.1703, "step": 900 }, { "epoch": 0.7, "learning_rate": 2e-05, "loss": 23.5486, "step": 1000 }, { "epoch": 0.77, "learning_rate": 2.2000000000000003e-05, "loss": 21.8419, "step": 1100 }, { "epoch": 0.84, "learning_rate": 2.4e-05, "loss": 20.3387, "step": 1200 }, { "epoch": 0.91, "learning_rate": 2.6000000000000002e-05, "loss": 18.7216, "step": 1300 }, { "epoch": 0.98, "learning_rate": 2.8000000000000003e-05, "loss": 16.7862, "step": 1400 }, { "epoch": 1.05, "learning_rate": 3e-05, "loss": 15.034, "step": 1500 }, { "epoch": 1.12, "learning_rate": 3.2000000000000005e-05, "loss": 13.3747, "step": 1600 }, { "epoch": 1.19, "learning_rate": 3.4000000000000007e-05, "loss": 11.7581, "step": 1700 }, { "epoch": 1.26, "learning_rate": 3.6e-05, "loss": 9.9702, "step": 1800 }, { "epoch": 1.33, "learning_rate": 3.8e-05, "loss": 8.3363, "step": 1900 }, { "epoch": 1.4, "learning_rate": 4e-05, "loss": 6.9199, "step": 2000 }, { "epoch": 1.4, "eval_bleu": 3.4801, "eval_em": 0.0, "eval_gen_len": 220.966, "eval_loss": 6.3751444816589355, "eval_runtime": 1604.7901, "eval_samples_per_second": 0.751, "eval_steps_per_second": 0.094, "step": 2000 }, { "epoch": 1.47, "learning_rate": 4.2e-05, "loss": 6.2524, "step": 2100 }, { "epoch": 1.54, "learning_rate": 4.4000000000000006e-05, "loss": 5.8029, "step": 2200 }, { "epoch": 1.61, "learning_rate": 4.600000000000001e-05, "loss": 5.606, "step": 2300 }, { "epoch": 1.68, "learning_rate": 4.8e-05, "loss": 5.3757, "step": 2400 }, { "epoch": 1.75, "learning_rate": 5e-05, "loss": 5.1043, "step": 2500 }, { "epoch": 1.82, "learning_rate": 4.944121591417077e-05, "loss": 4.9414, "step": 2600 }, { "epoch": 1.89, "learning_rate": 4.888243182834153e-05, "loss": 4.7381, "step": 2700 }, { "epoch": 1.96, "learning_rate": 4.8323647742512295e-05, "loss": 4.6214, "step": 2800 }, { "epoch": 2.03, "learning_rate": 4.776486365668306e-05, "loss": 4.2971, "step": 2900 }, { "epoch": 2.1, "learning_rate": 4.720607957085382e-05, "loss": 4.1602, "step": 3000 }, { "epoch": 2.17, "learning_rate": 4.664729548502459e-05, "loss": 4.0391, "step": 3100 }, { "epoch": 2.24, "learning_rate": 4.6088511399195353e-05, "loss": 3.9211, "step": 3200 }, { "epoch": 2.31, "learning_rate": 4.552972731336611e-05, "loss": 3.7642, "step": 3300 }, { "epoch": 2.38, "learning_rate": 4.497094322753688e-05, "loss": 3.6698, "step": 3400 }, { "epoch": 2.45, "learning_rate": 4.4412159141707646e-05, "loss": 3.5409, "step": 3500 }, { "epoch": 2.52, "learning_rate": 4.385337505587841e-05, "loss": 3.4016, "step": 3600 }, { "epoch": 2.59, "learning_rate": 4.329459097004918e-05, "loss": 3.2761, "step": 3700 }, { "epoch": 2.66, "learning_rate": 4.2735806884219945e-05, "loss": 3.1708, "step": 3800 }, { "epoch": 2.73, "learning_rate": 4.2177022798390704e-05, "loss": 3.0849, "step": 3900 }, { "epoch": 2.8, "learning_rate": 4.161823871256147e-05, "loss": 3.0222, "step": 4000 }, { "epoch": 2.8, "eval_bleu": 27.8543, "eval_em": 0.0, "eval_gen_len": 36.8, "eval_loss": 2.8796441555023193, "eval_runtime": 501.7382, "eval_samples_per_second": 2.402, "eval_steps_per_second": 0.301, "step": 4000 }, { "epoch": 2.87, "learning_rate": 4.105945462673223e-05, "loss": 2.9079, "step": 4100 }, { "epoch": 2.94, "learning_rate": 4.0500670540903e-05, "loss": 2.8688, "step": 4200 }, { "epoch": 3.0, "learning_rate": 3.994188645507376e-05, "loss": 2.7398, "step": 4300 }, { "epoch": 3.07, "learning_rate": 3.938310236924452e-05, "loss": 2.5503, "step": 4400 }, { "epoch": 3.14, "learning_rate": 3.882431828341529e-05, "loss": 2.5121, "step": 4500 }, { "epoch": 3.21, "learning_rate": 3.8265534197586055e-05, "loss": 2.4625, "step": 4600 }, { "epoch": 3.28, "learning_rate": 3.7706750111756815e-05, "loss": 2.3833, "step": 4700 }, { "epoch": 3.35, "learning_rate": 3.714796602592758e-05, "loss": 2.3133, "step": 4800 }, { "epoch": 3.42, "learning_rate": 3.658918194009835e-05, "loss": 2.2152, "step": 4900 }, { "epoch": 3.49, "learning_rate": 3.603039785426911e-05, "loss": 2.2304, "step": 5000 }, { "epoch": 3.56, "learning_rate": 3.5471613768439874e-05, "loss": 2.1563, "step": 5100 }, { "epoch": 3.63, "learning_rate": 3.491282968261064e-05, "loss": 2.0991, "step": 5200 }, { "epoch": 3.7, "learning_rate": 3.4354045596781406e-05, "loss": 2.0702, "step": 5300 }, { "epoch": 3.77, "learning_rate": 3.379526151095217e-05, "loss": 1.9914, "step": 5400 }, { "epoch": 3.84, "learning_rate": 3.323647742512294e-05, "loss": 1.9235, "step": 5500 }, { "epoch": 3.91, "learning_rate": 3.26776933392937e-05, "loss": 1.8922, "step": 5600 }, { "epoch": 3.98, "learning_rate": 3.2118909253464465e-05, "loss": 1.8512, "step": 5700 }, { "epoch": 4.05, "learning_rate": 3.156012516763523e-05, "loss": 1.6966, "step": 5800 }, { "epoch": 4.12, "learning_rate": 3.100134108180599e-05, "loss": 1.6399, "step": 5900 }, { "epoch": 4.19, "learning_rate": 3.0442556995976757e-05, "loss": 1.5982, "step": 6000 }, { "epoch": 4.19, "eval_bleu": 56.0747, "eval_em": 0.0017, "eval_gen_len": 43.9021, "eval_loss": 1.7495189905166626, "eval_runtime": 423.2589, "eval_samples_per_second": 2.847, "eval_steps_per_second": 0.357, "step": 6000 }, { "epoch": 4.26, "learning_rate": 2.9883772910147524e-05, "loss": 1.5968, "step": 6100 }, { "epoch": 4.33, "learning_rate": 2.9324988824318283e-05, "loss": 1.5813, "step": 6200 }, { "epoch": 4.4, "learning_rate": 2.876620473848905e-05, "loss": 1.5421, "step": 6300 }, { "epoch": 4.47, "learning_rate": 2.8207420652659816e-05, "loss": 1.4852, "step": 6400 }, { "epoch": 4.54, "learning_rate": 2.7648636566830576e-05, "loss": 1.5027, "step": 6500 }, { "epoch": 4.61, "learning_rate": 2.7089852481001342e-05, "loss": 1.4951, "step": 6600 }, { "epoch": 4.68, "learning_rate": 2.653106839517211e-05, "loss": 1.4803, "step": 6700 }, { "epoch": 4.75, "learning_rate": 2.597228430934287e-05, "loss": 1.4127, "step": 6800 }, { "epoch": 4.82, "learning_rate": 2.5413500223513638e-05, "loss": 1.3896, "step": 6900 }, { "epoch": 4.89, "learning_rate": 2.48547161376844e-05, "loss": 1.3656, "step": 7000 }, { "epoch": 4.96, "learning_rate": 2.4295932051855164e-05, "loss": 1.3432, "step": 7100 }, { "epoch": 5.03, "learning_rate": 2.373714796602593e-05, "loss": 1.2224, "step": 7200 }, { "epoch": 5.1, "learning_rate": 2.3178363880196693e-05, "loss": 1.1396, "step": 7300 }, { "epoch": 5.17, "learning_rate": 2.2619579794367456e-05, "loss": 1.1475, "step": 7400 }, { "epoch": 5.24, "learning_rate": 2.206079570853822e-05, "loss": 1.0669, "step": 7500 }, { "epoch": 5.31, "learning_rate": 2.1502011622708985e-05, "loss": 1.1356, "step": 7600 }, { "epoch": 5.38, "learning_rate": 2.0943227536879752e-05, "loss": 1.0965, "step": 7700 }, { "epoch": 5.45, "learning_rate": 2.0384443451050515e-05, "loss": 1.1086, "step": 7800 }, { "epoch": 5.52, "learning_rate": 1.982565936522128e-05, "loss": 1.0642, "step": 7900 }, { "epoch": 5.59, "learning_rate": 1.9266875279392044e-05, "loss": 1.0717, "step": 8000 }, { "epoch": 5.59, "eval_bleu": 69.9606, "eval_em": 0.0199, "eval_gen_len": 46.0722, "eval_loss": 1.26251220703125, "eval_runtime": 355.6397, "eval_samples_per_second": 3.388, "eval_steps_per_second": 0.425, "step": 8000 }, { "epoch": 5.66, "learning_rate": 1.8708091193562807e-05, "loss": 1.0546, "step": 8100 }, { "epoch": 5.73, "learning_rate": 1.8149307107733573e-05, "loss": 1.0579, "step": 8200 }, { "epoch": 5.8, "learning_rate": 1.7590523021904336e-05, "loss": 1.0204, "step": 8300 }, { "epoch": 5.87, "learning_rate": 1.70317389360751e-05, "loss": 1.0398, "step": 8400 }, { "epoch": 5.94, "learning_rate": 1.6472954850245866e-05, "loss": 0.9992, "step": 8500 }, { "epoch": 6.01, "learning_rate": 1.5914170764416632e-05, "loss": 0.9756, "step": 8600 }, { "epoch": 6.08, "learning_rate": 1.5355386678587395e-05, "loss": 0.8385, "step": 8700 }, { "epoch": 6.15, "learning_rate": 1.479660259275816e-05, "loss": 0.8815, "step": 8800 }, { "epoch": 6.22, "learning_rate": 1.4237818506928924e-05, "loss": 0.8447, "step": 8900 }, { "epoch": 6.29, "learning_rate": 1.3679034421099687e-05, "loss": 0.8553, "step": 9000 }, { "epoch": 6.36, "learning_rate": 1.312025033527045e-05, "loss": 0.8188, "step": 9100 }, { "epoch": 6.43, "learning_rate": 1.2561466249441217e-05, "loss": 0.8241, "step": 9200 }, { "epoch": 6.5, "learning_rate": 1.2002682163611981e-05, "loss": 0.8118, "step": 9300 }, { "epoch": 6.57, "learning_rate": 1.1443898077782746e-05, "loss": 0.8357, "step": 9400 }, { "epoch": 6.64, "learning_rate": 1.0885113991953509e-05, "loss": 0.8063, "step": 9500 }, { "epoch": 6.71, "learning_rate": 1.0326329906124274e-05, "loss": 0.8263, "step": 9600 }, { "epoch": 6.78, "learning_rate": 9.767545820295038e-06, "loss": 0.8064, "step": 9700 }, { "epoch": 6.85, "learning_rate": 9.208761734465803e-06, "loss": 0.7858, "step": 9800 }, { "epoch": 6.92, "learning_rate": 8.649977648636568e-06, "loss": 0.7854, "step": 9900 }, { "epoch": 6.99, "learning_rate": 8.09119356280733e-06, "loss": 0.7765, "step": 10000 }, { "epoch": 6.99, "eval_bleu": 74.7723, "eval_em": 0.0349, "eval_gen_len": 46.1685, "eval_loss": 1.0809996128082275, "eval_runtime": 352.8566, "eval_samples_per_second": 3.415, "eval_steps_per_second": 0.428, "step": 10000 }, { "epoch": 7.06, "learning_rate": 7.532409476978096e-06, "loss": 0.696, "step": 10100 }, { "epoch": 7.13, "learning_rate": 6.973625391148861e-06, "loss": 0.6991, "step": 10200 }, { "epoch": 7.2, "learning_rate": 6.414841305319625e-06, "loss": 0.6795, "step": 10300 }, { "epoch": 7.27, "learning_rate": 5.856057219490389e-06, "loss": 0.6953, "step": 10400 }, { "epoch": 7.34, "learning_rate": 5.297273133661153e-06, "loss": 0.6854, "step": 10500 }, { "epoch": 7.41, "learning_rate": 4.738489047831918e-06, "loss": 0.6798, "step": 10600 }, { "epoch": 7.48, "learning_rate": 4.1797049620026825e-06, "loss": 0.6701, "step": 10700 }, { "epoch": 7.55, "learning_rate": 3.6209208761734468e-06, "loss": 0.684, "step": 10800 }, { "epoch": 7.62, "learning_rate": 3.062136790344211e-06, "loss": 0.686, "step": 10900 }, { "epoch": 7.69, "learning_rate": 2.5033527045149757e-06, "loss": 0.6764, "step": 11000 }, { "epoch": 7.76, "learning_rate": 1.94456861868574e-06, "loss": 0.6461, "step": 11100 }, { "epoch": 7.83, "learning_rate": 1.3857845328565042e-06, "loss": 0.6723, "step": 11200 }, { "epoch": 7.9, "learning_rate": 8.270004470272687e-07, "loss": 0.6453, "step": 11300 }, { "epoch": 7.97, "learning_rate": 2.682163611980331e-07, "loss": 0.6544, "step": 11400 }, { "epoch": 8.0, "step": 11448, "total_flos": 7528078235838336.0, "train_loss": 6.2339570505647375, "train_runtime": 11756.9791, "train_samples_per_second": 15.575, "train_steps_per_second": 0.974 } ], "max_steps": 11448, "num_train_epochs": 8, "total_flos": 7528078235838336.0, "trial_name": null, "trial_params": null }