|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9813318473112287, |
|
"eval_steps": 500, |
|
"global_step": 53500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.1059476137161255, |
|
"learning_rate": 1.981424723692765e-05, |
|
"loss": 5.2962, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.074090838432312, |
|
"learning_rate": 1.96284944738553e-05, |
|
"loss": 5.2074, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.9232094287872314, |
|
"learning_rate": 1.944274171078295e-05, |
|
"loss": 5.0464, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.1528608798980713, |
|
"learning_rate": 1.9256988947710598e-05, |
|
"loss": 4.9315, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 2.6015784740448, |
|
"learning_rate": 1.9071236184638247e-05, |
|
"loss": 4.9604, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.2907854318618774, |
|
"learning_rate": 1.8885483421565897e-05, |
|
"loss": 4.825, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 2.6447720527648926, |
|
"learning_rate": 1.8699730658493546e-05, |
|
"loss": 4.6561, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.8539524674415588, |
|
"learning_rate": 1.8513977895421195e-05, |
|
"loss": 5.0517, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.9389849901199341, |
|
"learning_rate": 1.8328225132348844e-05, |
|
"loss": 4.7113, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.793870210647583, |
|
"learning_rate": 1.8142472369276493e-05, |
|
"loss": 4.8306, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 2.8396799564361572, |
|
"learning_rate": 1.7956719606204142e-05, |
|
"loss": 4.6542, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.3629398345947266, |
|
"learning_rate": 1.7770966843131795e-05, |
|
"loss": 4.8343, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.432026743888855, |
|
"learning_rate": 1.7585214080059444e-05, |
|
"loss": 4.756, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.616821527481079, |
|
"learning_rate": 1.7399461316987093e-05, |
|
"loss": 4.5546, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.8740485906600952, |
|
"learning_rate": 1.7213708553914742e-05, |
|
"loss": 4.869, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 3.150221586227417, |
|
"learning_rate": 1.702795579084239e-05, |
|
"loss": 4.8451, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.4727258682250977, |
|
"learning_rate": 1.6842203027770037e-05, |
|
"loss": 4.8123, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.3298438787460327, |
|
"learning_rate": 1.6656450264697686e-05, |
|
"loss": 4.833, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.3734725713729858, |
|
"learning_rate": 1.647069750162534e-05, |
|
"loss": 4.2936, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.323432207107544, |
|
"learning_rate": 1.6284944738552988e-05, |
|
"loss": 4.0395, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.5172241926193237, |
|
"learning_rate": 1.6099191975480637e-05, |
|
"loss": 4.243, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.2672388553619385, |
|
"learning_rate": 1.5913439212408286e-05, |
|
"loss": 4.1569, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.1766120195388794, |
|
"learning_rate": 1.5727686449335935e-05, |
|
"loss": 4.4089, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.3315163850784302, |
|
"learning_rate": 1.5541933686263585e-05, |
|
"loss": 4.8991, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.3195267915725708, |
|
"learning_rate": 1.5356180923191234e-05, |
|
"loss": 4.6714, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.1769589185714722, |
|
"learning_rate": 1.5170428160118885e-05, |
|
"loss": 4.6833, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 2.954596996307373, |
|
"learning_rate": 1.4984675397046534e-05, |
|
"loss": 4.6754, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.6191766262054443, |
|
"learning_rate": 1.4798922633974183e-05, |
|
"loss": 4.5673, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.709163784980774, |
|
"learning_rate": 1.461316987090183e-05, |
|
"loss": 4.5477, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.679591417312622, |
|
"learning_rate": 1.442741710782948e-05, |
|
"loss": 4.2485, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.4663037061691284, |
|
"learning_rate": 1.4241664344757129e-05, |
|
"loss": 4.2211, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.1792720556259155, |
|
"learning_rate": 1.4055911581684778e-05, |
|
"loss": 4.1774, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 2.4920449256896973, |
|
"learning_rate": 1.3870158818612427e-05, |
|
"loss": 3.9501, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.2305946350097656, |
|
"learning_rate": 1.3684406055540078e-05, |
|
"loss": 4.0126, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.3168740272521973, |
|
"learning_rate": 1.3498653292467727e-05, |
|
"loss": 4.4881, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_generated_length": 19.0, |
|
"eval_loss": 1.565665364265442, |
|
"eval_rouge1": 0.219, |
|
"eval_rouge2": 0.1045, |
|
"eval_rougeL": 0.1858, |
|
"eval_rougeLsum": 0.1858, |
|
"eval_runtime": 789.2418, |
|
"eval_samples_per_second": 14.558, |
|
"eval_steps_per_second": 0.911, |
|
"step": 17945 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.1441638469696045, |
|
"learning_rate": 1.3312900529395376e-05, |
|
"loss": 4.3781, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 1.1992547512054443, |
|
"learning_rate": 1.3127147766323025e-05, |
|
"loss": 2.8783, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 1.225577473640442, |
|
"learning_rate": 1.2941395003250674e-05, |
|
"loss": 3.0795, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 1.3939582109451294, |
|
"learning_rate": 1.2755642240178324e-05, |
|
"loss": 3.1704, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 1.366357684135437, |
|
"learning_rate": 1.2569889477105973e-05, |
|
"loss": 2.9538, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 1.182193398475647, |
|
"learning_rate": 1.2384136714033624e-05, |
|
"loss": 2.9475, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 1.2305201292037964, |
|
"learning_rate": 1.2198383950961273e-05, |
|
"loss": 2.6189, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 1.0837234258651733, |
|
"learning_rate": 1.2012631187888922e-05, |
|
"loss": 2.6662, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 1.2700753211975098, |
|
"learning_rate": 1.1826878424816571e-05, |
|
"loss": 2.8915, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 1.0048373937606812, |
|
"learning_rate": 1.1641125661744218e-05, |
|
"loss": 2.9367, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 1.1926288604736328, |
|
"learning_rate": 1.1455372898671868e-05, |
|
"loss": 2.9525, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 2.236363410949707, |
|
"learning_rate": 1.1269620135599517e-05, |
|
"loss": 3.1114, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 1.1560380458831787, |
|
"learning_rate": 1.1083867372527166e-05, |
|
"loss": 2.8436, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 1.6830860376358032, |
|
"learning_rate": 1.0898114609454817e-05, |
|
"loss": 2.8528, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 1.0211920738220215, |
|
"learning_rate": 1.0712361846382466e-05, |
|
"loss": 2.8839, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 1.1330536603927612, |
|
"learning_rate": 1.0526609083310115e-05, |
|
"loss": 2.9946, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 1.6771241426467896, |
|
"learning_rate": 1.0340856320237764e-05, |
|
"loss": 3.0686, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 1.409925103187561, |
|
"learning_rate": 1.0155103557165413e-05, |
|
"loss": 3.118, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 1.1481200456619263, |
|
"learning_rate": 9.969350794093064e-06, |
|
"loss": 3.1728, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 1.0455875396728516, |
|
"learning_rate": 9.783598031020712e-06, |
|
"loss": 3.144, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 1.1866121292114258, |
|
"learning_rate": 9.59784526794836e-06, |
|
"loss": 3.0224, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 1.1943082809448242, |
|
"learning_rate": 9.41209250487601e-06, |
|
"loss": 4.6478, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 1.3221793174743652, |
|
"learning_rate": 9.22633974180366e-06, |
|
"loss": 5.313, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 1.3978910446166992, |
|
"learning_rate": 9.04058697873131e-06, |
|
"loss": 5.366, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 1.257312536239624, |
|
"learning_rate": 8.854834215658959e-06, |
|
"loss": 5.4676, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 1.4381968975067139, |
|
"learning_rate": 8.669081452586608e-06, |
|
"loss": 5.3774, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 2.0183818340301514, |
|
"learning_rate": 8.483328689514257e-06, |
|
"loss": 5.2591, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 1.2171275615692139, |
|
"learning_rate": 8.297575926441907e-06, |
|
"loss": 5.4122, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 1.3053938150405884, |
|
"learning_rate": 8.111823163369556e-06, |
|
"loss": 5.3049, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 1.499539852142334, |
|
"learning_rate": 7.926070400297205e-06, |
|
"loss": 5.3328, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 1.347572684288025, |
|
"learning_rate": 7.740317637224854e-06, |
|
"loss": 5.3964, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 1.2193210124969482, |
|
"learning_rate": 7.554564874152503e-06, |
|
"loss": 5.3962, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 2.145545482635498, |
|
"learning_rate": 7.368812111080152e-06, |
|
"loss": 5.2661, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 1.8138571977615356, |
|
"learning_rate": 7.183059348007802e-06, |
|
"loss": 5.3785, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 1.9416179656982422, |
|
"learning_rate": 6.997306584935451e-06, |
|
"loss": 5.3526, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 1.0953854322433472, |
|
"learning_rate": 6.8115538218631005e-06, |
|
"loss": 5.345, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_generated_length": 19.0, |
|
"eval_loss": 1.5637397766113281, |
|
"eval_rouge1": 0.2186, |
|
"eval_rouge2": 0.1038, |
|
"eval_rougeL": 0.185, |
|
"eval_rougeLsum": 0.185, |
|
"eval_runtime": 790.9995, |
|
"eval_samples_per_second": 14.526, |
|
"eval_steps_per_second": 0.909, |
|
"step": 35890 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 1.6087472438812256, |
|
"learning_rate": 6.62580105879075e-06, |
|
"loss": 4.877, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 1.424822449684143, |
|
"learning_rate": 6.4400482957184e-06, |
|
"loss": 3.7964, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 1.7608376741409302, |
|
"learning_rate": 6.254295532646049e-06, |
|
"loss": 3.6988, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 1.1424431800842285, |
|
"learning_rate": 6.068542769573698e-06, |
|
"loss": 3.5234, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 1.307145595550537, |
|
"learning_rate": 5.882790006501348e-06, |
|
"loss": 3.5082, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 1.7545298337936401, |
|
"learning_rate": 5.697037243428996e-06, |
|
"loss": 3.5398, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 1.009940505027771, |
|
"learning_rate": 5.5112844803566454e-06, |
|
"loss": 3.6597, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 1.2161884307861328, |
|
"learning_rate": 5.325531717284295e-06, |
|
"loss": 4.1624, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 1.7260109186172485, |
|
"learning_rate": 5.139778954211945e-06, |
|
"loss": 4.005, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 1.5710057020187378, |
|
"learning_rate": 4.954026191139594e-06, |
|
"loss": 4.0661, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 1.6778111457824707, |
|
"learning_rate": 4.768273428067243e-06, |
|
"loss": 3.9081, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 1.3847213983535767, |
|
"learning_rate": 4.582520664994892e-06, |
|
"loss": 3.8309, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 1.128096342086792, |
|
"learning_rate": 4.396767901922541e-06, |
|
"loss": 3.6819, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 1.5037422180175781, |
|
"learning_rate": 4.21101513885019e-06, |
|
"loss": 3.566, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 1.3365986347198486, |
|
"learning_rate": 4.02526237577784e-06, |
|
"loss": 3.2959, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 1.167784333229065, |
|
"learning_rate": 3.8395096127054895e-06, |
|
"loss": 3.4172, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 1.3119831085205078, |
|
"learning_rate": 3.6537568496331386e-06, |
|
"loss": 3.5034, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 1.2421696186065674, |
|
"learning_rate": 3.4680040865607878e-06, |
|
"loss": 4.1109, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 1.1798334121704102, |
|
"learning_rate": 3.2822513234884373e-06, |
|
"loss": 4.0948, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 1.1406497955322266, |
|
"learning_rate": 3.0964985604160865e-06, |
|
"loss": 3.9285, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 1.6897916793823242, |
|
"learning_rate": 2.9107457973437356e-06, |
|
"loss": 3.705, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 1.162559151649475, |
|
"learning_rate": 2.724993034271385e-06, |
|
"loss": 3.4343, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 1.2140285968780518, |
|
"learning_rate": 2.5392402711990344e-06, |
|
"loss": 3.2604, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 1.20240318775177, |
|
"learning_rate": 2.3534875081266835e-06, |
|
"loss": 3.5073, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 1.1845455169677734, |
|
"learning_rate": 2.167734745054333e-06, |
|
"loss": 3.1102, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 1.2270458936691284, |
|
"learning_rate": 1.9819819819819822e-06, |
|
"loss": 3.1802, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 1.1969292163848877, |
|
"learning_rate": 1.7962292189096314e-06, |
|
"loss": 3.1325, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 1.2453925609588623, |
|
"learning_rate": 1.6104764558372807e-06, |
|
"loss": 2.9085, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 1.3069303035736084, |
|
"learning_rate": 1.42472369276493e-06, |
|
"loss": 2.9761, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 1.4485862255096436, |
|
"learning_rate": 1.2389709296925793e-06, |
|
"loss": 3.5526, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 1.4915004968643188, |
|
"learning_rate": 1.0532181666202286e-06, |
|
"loss": 3.7851, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 1.271291971206665, |
|
"learning_rate": 8.674654035478779e-07, |
|
"loss": 3.9262, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 1.3475127220153809, |
|
"learning_rate": 6.817126404755271e-07, |
|
"loss": 4.0264, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 1.2061536312103271, |
|
"learning_rate": 4.959598774031764e-07, |
|
"loss": 3.9977, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 1.8415242433547974, |
|
"learning_rate": 3.102071143308257e-07, |
|
"loss": 4.0098, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 1.0884873867034912, |
|
"learning_rate": 1.2445435125847498e-07, |
|
"loss": 4.0167, |
|
"step": 53500 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 53835, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"total_flos": 2.317005244339323e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|