{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9813318473112287, "eval_steps": 500, "global_step": 53500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 1.1059476137161255, "learning_rate": 1.981424723692765e-05, "loss": 5.2962, "step": 500 }, { "epoch": 0.06, "grad_norm": 1.074090838432312, "learning_rate": 1.96284944738553e-05, "loss": 5.2074, "step": 1000 }, { "epoch": 0.08, "grad_norm": 2.9232094287872314, "learning_rate": 1.944274171078295e-05, "loss": 5.0464, "step": 1500 }, { "epoch": 0.11, "grad_norm": 1.1528608798980713, "learning_rate": 1.9256988947710598e-05, "loss": 4.9315, "step": 2000 }, { "epoch": 0.14, "grad_norm": 2.6015784740448, "learning_rate": 1.9071236184638247e-05, "loss": 4.9604, "step": 2500 }, { "epoch": 0.17, "grad_norm": 1.2907854318618774, "learning_rate": 1.8885483421565897e-05, "loss": 4.825, "step": 3000 }, { "epoch": 0.2, "grad_norm": 2.6447720527648926, "learning_rate": 1.8699730658493546e-05, "loss": 4.6561, "step": 3500 }, { "epoch": 0.22, "grad_norm": 0.8539524674415588, "learning_rate": 1.8513977895421195e-05, "loss": 5.0517, "step": 4000 }, { "epoch": 0.25, "grad_norm": 0.9389849901199341, "learning_rate": 1.8328225132348844e-05, "loss": 4.7113, "step": 4500 }, { "epoch": 0.28, "grad_norm": 1.793870210647583, "learning_rate": 1.8142472369276493e-05, "loss": 4.8306, "step": 5000 }, { "epoch": 0.31, "grad_norm": 2.8396799564361572, "learning_rate": 1.7956719606204142e-05, "loss": 4.6542, "step": 5500 }, { "epoch": 0.33, "grad_norm": 1.3629398345947266, "learning_rate": 1.7770966843131795e-05, "loss": 4.8343, "step": 6000 }, { "epoch": 0.36, "grad_norm": 1.432026743888855, "learning_rate": 1.7585214080059444e-05, "loss": 4.756, "step": 6500 }, { "epoch": 0.39, "grad_norm": 1.616821527481079, "learning_rate": 1.7399461316987093e-05, "loss": 4.5546, "step": 7000 }, { "epoch": 0.42, "grad_norm": 1.8740485906600952, "learning_rate": 1.7213708553914742e-05, "loss": 4.869, "step": 7500 }, { "epoch": 0.45, "grad_norm": 3.150221586227417, "learning_rate": 1.702795579084239e-05, "loss": 4.8451, "step": 8000 }, { "epoch": 0.47, "grad_norm": 1.4727258682250977, "learning_rate": 1.6842203027770037e-05, "loss": 4.8123, "step": 8500 }, { "epoch": 0.5, "grad_norm": 1.3298438787460327, "learning_rate": 1.6656450264697686e-05, "loss": 4.833, "step": 9000 }, { "epoch": 0.53, "grad_norm": 1.3734725713729858, "learning_rate": 1.647069750162534e-05, "loss": 4.2936, "step": 9500 }, { "epoch": 0.56, "grad_norm": 1.323432207107544, "learning_rate": 1.6284944738552988e-05, "loss": 4.0395, "step": 10000 }, { "epoch": 0.59, "grad_norm": 1.5172241926193237, "learning_rate": 1.6099191975480637e-05, "loss": 4.243, "step": 10500 }, { "epoch": 0.61, "grad_norm": 1.2672388553619385, "learning_rate": 1.5913439212408286e-05, "loss": 4.1569, "step": 11000 }, { "epoch": 0.64, "grad_norm": 1.1766120195388794, "learning_rate": 1.5727686449335935e-05, "loss": 4.4089, "step": 11500 }, { "epoch": 0.67, "grad_norm": 1.3315163850784302, "learning_rate": 1.5541933686263585e-05, "loss": 4.8991, "step": 12000 }, { "epoch": 0.7, "grad_norm": 1.3195267915725708, "learning_rate": 1.5356180923191234e-05, "loss": 4.6714, "step": 12500 }, { "epoch": 0.72, "grad_norm": 1.1769589185714722, "learning_rate": 1.5170428160118885e-05, "loss": 4.6833, "step": 13000 }, { "epoch": 0.75, "grad_norm": 2.954596996307373, "learning_rate": 1.4984675397046534e-05, "loss": 4.6754, "step": 13500 }, { "epoch": 0.78, "grad_norm": 1.6191766262054443, "learning_rate": 1.4798922633974183e-05, "loss": 4.5673, "step": 14000 }, { "epoch": 0.81, "grad_norm": 1.709163784980774, "learning_rate": 1.461316987090183e-05, "loss": 4.5477, "step": 14500 }, { "epoch": 0.84, "grad_norm": 1.679591417312622, "learning_rate": 1.442741710782948e-05, "loss": 4.2485, "step": 15000 }, { "epoch": 0.86, "grad_norm": 1.4663037061691284, "learning_rate": 1.4241664344757129e-05, "loss": 4.2211, "step": 15500 }, { "epoch": 0.89, "grad_norm": 1.1792720556259155, "learning_rate": 1.4055911581684778e-05, "loss": 4.1774, "step": 16000 }, { "epoch": 0.92, "grad_norm": 2.4920449256896973, "learning_rate": 1.3870158818612427e-05, "loss": 3.9501, "step": 16500 }, { "epoch": 0.95, "grad_norm": 1.2305946350097656, "learning_rate": 1.3684406055540078e-05, "loss": 4.0126, "step": 17000 }, { "epoch": 0.98, "grad_norm": 1.3168740272521973, "learning_rate": 1.3498653292467727e-05, "loss": 4.4881, "step": 17500 }, { "epoch": 1.0, "eval_generated_length": 19.0, "eval_loss": 1.565665364265442, "eval_rouge1": 0.219, "eval_rouge2": 0.1045, "eval_rougeL": 0.1858, "eval_rougeLsum": 0.1858, "eval_runtime": 789.2418, "eval_samples_per_second": 14.558, "eval_steps_per_second": 0.911, "step": 17945 }, { "epoch": 1.0, "grad_norm": 1.1441638469696045, "learning_rate": 1.3312900529395376e-05, "loss": 4.3781, "step": 18000 }, { "epoch": 1.03, "grad_norm": 1.1992547512054443, "learning_rate": 1.3127147766323025e-05, "loss": 2.8783, "step": 18500 }, { "epoch": 1.06, "grad_norm": 1.225577473640442, "learning_rate": 1.2941395003250674e-05, "loss": 3.0795, "step": 19000 }, { "epoch": 1.09, "grad_norm": 1.3939582109451294, "learning_rate": 1.2755642240178324e-05, "loss": 3.1704, "step": 19500 }, { "epoch": 1.11, "grad_norm": 1.366357684135437, "learning_rate": 1.2569889477105973e-05, "loss": 2.9538, "step": 20000 }, { "epoch": 1.14, "grad_norm": 1.182193398475647, "learning_rate": 1.2384136714033624e-05, "loss": 2.9475, "step": 20500 }, { "epoch": 1.17, "grad_norm": 1.2305201292037964, "learning_rate": 1.2198383950961273e-05, "loss": 2.6189, "step": 21000 }, { "epoch": 1.2, "grad_norm": 1.0837234258651733, "learning_rate": 1.2012631187888922e-05, "loss": 2.6662, "step": 21500 }, { "epoch": 1.23, "grad_norm": 1.2700753211975098, "learning_rate": 1.1826878424816571e-05, "loss": 2.8915, "step": 22000 }, { "epoch": 1.25, "grad_norm": 1.0048373937606812, "learning_rate": 1.1641125661744218e-05, "loss": 2.9367, "step": 22500 }, { "epoch": 1.28, "grad_norm": 1.1926288604736328, "learning_rate": 1.1455372898671868e-05, "loss": 2.9525, "step": 23000 }, { "epoch": 1.31, "grad_norm": 2.236363410949707, "learning_rate": 1.1269620135599517e-05, "loss": 3.1114, "step": 23500 }, { "epoch": 1.34, "grad_norm": 1.1560380458831787, "learning_rate": 1.1083867372527166e-05, "loss": 2.8436, "step": 24000 }, { "epoch": 1.37, "grad_norm": 1.6830860376358032, "learning_rate": 1.0898114609454817e-05, "loss": 2.8528, "step": 24500 }, { "epoch": 1.39, "grad_norm": 1.0211920738220215, "learning_rate": 1.0712361846382466e-05, "loss": 2.8839, "step": 25000 }, { "epoch": 1.42, "grad_norm": 1.1330536603927612, "learning_rate": 1.0526609083310115e-05, "loss": 2.9946, "step": 25500 }, { "epoch": 1.45, "grad_norm": 1.6771241426467896, "learning_rate": 1.0340856320237764e-05, "loss": 3.0686, "step": 26000 }, { "epoch": 1.48, "grad_norm": 1.409925103187561, "learning_rate": 1.0155103557165413e-05, "loss": 3.118, "step": 26500 }, { "epoch": 1.5, "grad_norm": 1.1481200456619263, "learning_rate": 9.969350794093064e-06, "loss": 3.1728, "step": 27000 }, { "epoch": 1.53, "grad_norm": 1.0455875396728516, "learning_rate": 9.783598031020712e-06, "loss": 3.144, "step": 27500 }, { "epoch": 1.56, "grad_norm": 1.1866121292114258, "learning_rate": 9.59784526794836e-06, "loss": 3.0224, "step": 28000 }, { "epoch": 1.59, "grad_norm": 1.1943082809448242, "learning_rate": 9.41209250487601e-06, "loss": 4.6478, "step": 28500 }, { "epoch": 1.62, "grad_norm": 1.3221793174743652, "learning_rate": 9.22633974180366e-06, "loss": 5.313, "step": 29000 }, { "epoch": 1.64, "grad_norm": 1.3978910446166992, "learning_rate": 9.04058697873131e-06, "loss": 5.366, "step": 29500 }, { "epoch": 1.67, "grad_norm": 1.257312536239624, "learning_rate": 8.854834215658959e-06, "loss": 5.4676, "step": 30000 }, { "epoch": 1.7, "grad_norm": 1.4381968975067139, "learning_rate": 8.669081452586608e-06, "loss": 5.3774, "step": 30500 }, { "epoch": 1.73, "grad_norm": 2.0183818340301514, "learning_rate": 8.483328689514257e-06, "loss": 5.2591, "step": 31000 }, { "epoch": 1.76, "grad_norm": 1.2171275615692139, "learning_rate": 8.297575926441907e-06, "loss": 5.4122, "step": 31500 }, { "epoch": 1.78, "grad_norm": 1.3053938150405884, "learning_rate": 8.111823163369556e-06, "loss": 5.3049, "step": 32000 }, { "epoch": 1.81, "grad_norm": 1.499539852142334, "learning_rate": 7.926070400297205e-06, "loss": 5.3328, "step": 32500 }, { "epoch": 1.84, "grad_norm": 1.347572684288025, "learning_rate": 7.740317637224854e-06, "loss": 5.3964, "step": 33000 }, { "epoch": 1.87, "grad_norm": 1.2193210124969482, "learning_rate": 7.554564874152503e-06, "loss": 5.3962, "step": 33500 }, { "epoch": 1.89, "grad_norm": 2.145545482635498, "learning_rate": 7.368812111080152e-06, "loss": 5.2661, "step": 34000 }, { "epoch": 1.92, "grad_norm": 1.8138571977615356, "learning_rate": 7.183059348007802e-06, "loss": 5.3785, "step": 34500 }, { "epoch": 1.95, "grad_norm": 1.9416179656982422, "learning_rate": 6.997306584935451e-06, "loss": 5.3526, "step": 35000 }, { "epoch": 1.98, "grad_norm": 1.0953854322433472, "learning_rate": 6.8115538218631005e-06, "loss": 5.345, "step": 35500 }, { "epoch": 2.0, "eval_generated_length": 19.0, "eval_loss": 1.5637397766113281, "eval_rouge1": 0.2186, "eval_rouge2": 0.1038, "eval_rougeL": 0.185, "eval_rougeLsum": 0.185, "eval_runtime": 790.9995, "eval_samples_per_second": 14.526, "eval_steps_per_second": 0.909, "step": 35890 }, { "epoch": 2.01, "grad_norm": 1.6087472438812256, "learning_rate": 6.62580105879075e-06, "loss": 4.877, "step": 36000 }, { "epoch": 2.03, "grad_norm": 1.424822449684143, "learning_rate": 6.4400482957184e-06, "loss": 3.7964, "step": 36500 }, { "epoch": 2.06, "grad_norm": 1.7608376741409302, "learning_rate": 6.254295532646049e-06, "loss": 3.6988, "step": 37000 }, { "epoch": 2.09, "grad_norm": 1.1424431800842285, "learning_rate": 6.068542769573698e-06, "loss": 3.5234, "step": 37500 }, { "epoch": 2.12, "grad_norm": 1.307145595550537, "learning_rate": 5.882790006501348e-06, "loss": 3.5082, "step": 38000 }, { "epoch": 2.15, "grad_norm": 1.7545298337936401, "learning_rate": 5.697037243428996e-06, "loss": 3.5398, "step": 38500 }, { "epoch": 2.17, "grad_norm": 1.009940505027771, "learning_rate": 5.5112844803566454e-06, "loss": 3.6597, "step": 39000 }, { "epoch": 2.2, "grad_norm": 1.2161884307861328, "learning_rate": 5.325531717284295e-06, "loss": 4.1624, "step": 39500 }, { "epoch": 2.23, "grad_norm": 1.7260109186172485, "learning_rate": 5.139778954211945e-06, "loss": 4.005, "step": 40000 }, { "epoch": 2.26, "grad_norm": 1.5710057020187378, "learning_rate": 4.954026191139594e-06, "loss": 4.0661, "step": 40500 }, { "epoch": 2.28, "grad_norm": 1.6778111457824707, "learning_rate": 4.768273428067243e-06, "loss": 3.9081, "step": 41000 }, { "epoch": 2.31, "grad_norm": 1.3847213983535767, "learning_rate": 4.582520664994892e-06, "loss": 3.8309, "step": 41500 }, { "epoch": 2.34, "grad_norm": 1.128096342086792, "learning_rate": 4.396767901922541e-06, "loss": 3.6819, "step": 42000 }, { "epoch": 2.37, "grad_norm": 1.5037422180175781, "learning_rate": 4.21101513885019e-06, "loss": 3.566, "step": 42500 }, { "epoch": 2.4, "grad_norm": 1.3365986347198486, "learning_rate": 4.02526237577784e-06, "loss": 3.2959, "step": 43000 }, { "epoch": 2.42, "grad_norm": 1.167784333229065, "learning_rate": 3.8395096127054895e-06, "loss": 3.4172, "step": 43500 }, { "epoch": 2.45, "grad_norm": 1.3119831085205078, "learning_rate": 3.6537568496331386e-06, "loss": 3.5034, "step": 44000 }, { "epoch": 2.48, "grad_norm": 1.2421696186065674, "learning_rate": 3.4680040865607878e-06, "loss": 4.1109, "step": 44500 }, { "epoch": 2.51, "grad_norm": 1.1798334121704102, "learning_rate": 3.2822513234884373e-06, "loss": 4.0948, "step": 45000 }, { "epoch": 2.54, "grad_norm": 1.1406497955322266, "learning_rate": 3.0964985604160865e-06, "loss": 3.9285, "step": 45500 }, { "epoch": 2.56, "grad_norm": 1.6897916793823242, "learning_rate": 2.9107457973437356e-06, "loss": 3.705, "step": 46000 }, { "epoch": 2.59, "grad_norm": 1.162559151649475, "learning_rate": 2.724993034271385e-06, "loss": 3.4343, "step": 46500 }, { "epoch": 2.62, "grad_norm": 1.2140285968780518, "learning_rate": 2.5392402711990344e-06, "loss": 3.2604, "step": 47000 }, { "epoch": 2.65, "grad_norm": 1.20240318775177, "learning_rate": 2.3534875081266835e-06, "loss": 3.5073, "step": 47500 }, { "epoch": 2.67, "grad_norm": 1.1845455169677734, "learning_rate": 2.167734745054333e-06, "loss": 3.1102, "step": 48000 }, { "epoch": 2.7, "grad_norm": 1.2270458936691284, "learning_rate": 1.9819819819819822e-06, "loss": 3.1802, "step": 48500 }, { "epoch": 2.73, "grad_norm": 1.1969292163848877, "learning_rate": 1.7962292189096314e-06, "loss": 3.1325, "step": 49000 }, { "epoch": 2.76, "grad_norm": 1.2453925609588623, "learning_rate": 1.6104764558372807e-06, "loss": 2.9085, "step": 49500 }, { "epoch": 2.79, "grad_norm": 1.3069303035736084, "learning_rate": 1.42472369276493e-06, "loss": 2.9761, "step": 50000 }, { "epoch": 2.81, "grad_norm": 1.4485862255096436, "learning_rate": 1.2389709296925793e-06, "loss": 3.5526, "step": 50500 }, { "epoch": 2.84, "grad_norm": 1.4915004968643188, "learning_rate": 1.0532181666202286e-06, "loss": 3.7851, "step": 51000 }, { "epoch": 2.87, "grad_norm": 1.271291971206665, "learning_rate": 8.674654035478779e-07, "loss": 3.9262, "step": 51500 }, { "epoch": 2.9, "grad_norm": 1.3475127220153809, "learning_rate": 6.817126404755271e-07, "loss": 4.0264, "step": 52000 }, { "epoch": 2.93, "grad_norm": 1.2061536312103271, "learning_rate": 4.959598774031764e-07, "loss": 3.9977, "step": 52500 }, { "epoch": 2.95, "grad_norm": 1.8415242433547974, "learning_rate": 3.102071143308257e-07, "loss": 4.0098, "step": 53000 }, { "epoch": 2.98, "grad_norm": 1.0884873867034912, "learning_rate": 1.2445435125847498e-07, "loss": 4.0167, "step": 53500 } ], "logging_steps": 500, "max_steps": 53835, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 2.317005244339323e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }