|
{ |
|
"best_metric": 0.5004217028617859, |
|
"best_model_checkpoint": "../../experiments_checkpoints/MAdAiLab/google_t5/t5_base_ledgar/checkpoint-2800", |
|
"epoch": 3.0, |
|
"eval_steps": 100, |
|
"global_step": 2814, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.0540711879730225, |
|
"learning_rate": 0.0004955579246624023, |
|
"loss": 4.1527, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.241663694381714, |
|
"learning_rate": 0.0004911158493248046, |
|
"loss": 2.8808, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.093815803527832, |
|
"learning_rate": 0.00048667377398720687, |
|
"loss": 1.9075, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.396036148071289, |
|
"learning_rate": 0.0004822316986496091, |
|
"loss": 1.443, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"eval_accuracy": 0.7291, |
|
"eval_f1_macro": 0.5312263291596806, |
|
"eval_f1_micro": 0.7291, |
|
"eval_loss": 1.113275408744812, |
|
"eval_runtime": 24.8696, |
|
"eval_samples_per_second": 402.097, |
|
"eval_steps_per_second": 6.313, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 2.6226117610931396, |
|
"learning_rate": 0.0004777896233120114, |
|
"loss": 1.1778, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 2.527249574661255, |
|
"learning_rate": 0.00047334754797441367, |
|
"loss": 1.0394, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 2.435964584350586, |
|
"learning_rate": 0.00046890547263681595, |
|
"loss": 0.9535, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 2.0345797538757324, |
|
"learning_rate": 0.00046446339729921824, |
|
"loss": 0.8813, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"eval_accuracy": 0.7712, |
|
"eval_f1_macro": 0.6296223926135491, |
|
"eval_f1_micro": 0.7712, |
|
"eval_loss": 0.8404093980789185, |
|
"eval_runtime": 24.3258, |
|
"eval_samples_per_second": 411.086, |
|
"eval_steps_per_second": 6.454, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.247000217437744, |
|
"learning_rate": 0.0004600213219616205, |
|
"loss": 0.8062, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.9799127578735352, |
|
"learning_rate": 0.00045557924662402275, |
|
"loss": 0.8654, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 2.0902786254882812, |
|
"learning_rate": 0.000451137171286425, |
|
"loss": 0.8073, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.7158573865890503, |
|
"learning_rate": 0.00044669509594882727, |
|
"loss": 0.761, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_accuracy": 0.8021, |
|
"eval_f1_macro": 0.6788544961571827, |
|
"eval_f1_micro": 0.8021, |
|
"eval_loss": 0.738567590713501, |
|
"eval_runtime": 24.2955, |
|
"eval_samples_per_second": 411.599, |
|
"eval_steps_per_second": 6.462, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 2.2180001735687256, |
|
"learning_rate": 0.00044225302061122956, |
|
"loss": 0.7375, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 2.029712677001953, |
|
"learning_rate": 0.00043781094527363184, |
|
"loss": 0.7601, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 2.0835516452789307, |
|
"learning_rate": 0.0004333688699360341, |
|
"loss": 0.6685, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 2.2809343338012695, |
|
"learning_rate": 0.0004289267945984364, |
|
"loss": 0.7358, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"eval_accuracy": 0.805, |
|
"eval_f1_macro": 0.6786916719278343, |
|
"eval_f1_micro": 0.805, |
|
"eval_loss": 0.731271505355835, |
|
"eval_runtime": 24.2168, |
|
"eval_samples_per_second": 412.937, |
|
"eval_steps_per_second": 6.483, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 2.0364644527435303, |
|
"learning_rate": 0.00042448471926083864, |
|
"loss": 0.7414, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 2.7770190238952637, |
|
"learning_rate": 0.00042004264392324093, |
|
"loss": 0.8142, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 9.388680458068848, |
|
"learning_rate": 0.0004156005685856432, |
|
"loss": 0.7282, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 2.113584041595459, |
|
"learning_rate": 0.0004111584932480455, |
|
"loss": 0.7624, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"eval_accuracy": 0.8164, |
|
"eval_f1_macro": 0.7134072796381032, |
|
"eval_f1_micro": 0.8164, |
|
"eval_loss": 0.6560911536216736, |
|
"eval_runtime": 24.2135, |
|
"eval_samples_per_second": 412.993, |
|
"eval_steps_per_second": 6.484, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.490932822227478, |
|
"learning_rate": 0.0004067164179104478, |
|
"loss": 0.7123, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.1674153804779053, |
|
"learning_rate": 0.00040227434257285007, |
|
"loss": 0.692, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 2.5101075172424316, |
|
"learning_rate": 0.00039783226723525235, |
|
"loss": 0.6392, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.797776460647583, |
|
"learning_rate": 0.0003933901918976546, |
|
"loss": 0.7067, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_accuracy": 0.821, |
|
"eval_f1_macro": 0.7273350797174626, |
|
"eval_f1_micro": 0.821, |
|
"eval_loss": 0.6418954133987427, |
|
"eval_runtime": 24.1496, |
|
"eval_samples_per_second": 414.086, |
|
"eval_steps_per_second": 6.501, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.9408563375473022, |
|
"learning_rate": 0.00038894811656005687, |
|
"loss": 0.6762, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.9674413204193115, |
|
"learning_rate": 0.00038450604122245916, |
|
"loss": 0.6784, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 2.5168087482452393, |
|
"learning_rate": 0.00038006396588486144, |
|
"loss": 0.7106, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.7439873218536377, |
|
"learning_rate": 0.0003756218905472637, |
|
"loss": 0.6298, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_accuracy": 0.8254, |
|
"eval_f1_macro": 0.7229632924236448, |
|
"eval_f1_micro": 0.8254, |
|
"eval_loss": 0.6412187218666077, |
|
"eval_runtime": 24.0401, |
|
"eval_samples_per_second": 415.971, |
|
"eval_steps_per_second": 6.531, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.6693323850631714, |
|
"learning_rate": 0.00037117981520966596, |
|
"loss": 0.6267, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 2.042109251022339, |
|
"learning_rate": 0.00036673773987206824, |
|
"loss": 0.7153, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.5407096147537231, |
|
"learning_rate": 0.0003622956645344705, |
|
"loss": 0.6641, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.5067718029022217, |
|
"learning_rate": 0.00035785358919687276, |
|
"loss": 0.6544, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"eval_accuracy": 0.8217, |
|
"eval_f1_macro": 0.7223377361999187, |
|
"eval_f1_micro": 0.8217, |
|
"eval_loss": 0.6277242302894592, |
|
"eval_runtime": 24.1366, |
|
"eval_samples_per_second": 414.309, |
|
"eval_steps_per_second": 6.505, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 2.1698966026306152, |
|
"learning_rate": 0.00035341151385927504, |
|
"loss": 0.7108, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.2698220014572144, |
|
"learning_rate": 0.00034896943852167733, |
|
"loss": 0.5897, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 2.0006988048553467, |
|
"learning_rate": 0.0003445273631840796, |
|
"loss": 0.6386, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.7718091011047363, |
|
"learning_rate": 0.0003400852878464819, |
|
"loss": 0.5781, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"eval_accuracy": 0.8305, |
|
"eval_f1_macro": 0.7420217283556048, |
|
"eval_f1_micro": 0.8305, |
|
"eval_loss": 0.6054204702377319, |
|
"eval_runtime": 24.1901, |
|
"eval_samples_per_second": 413.393, |
|
"eval_steps_per_second": 6.49, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 1.9609644412994385, |
|
"learning_rate": 0.00033564321250888413, |
|
"loss": 0.6415, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 1.4652588367462158, |
|
"learning_rate": 0.0003312011371712864, |
|
"loss": 0.5699, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 1.6652113199234009, |
|
"learning_rate": 0.0003267590618336887, |
|
"loss": 0.486, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 1.3951687812805176, |
|
"learning_rate": 0.000322316986496091, |
|
"loss": 0.4674, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"eval_accuracy": 0.8346, |
|
"eval_f1_macro": 0.737133370653897, |
|
"eval_f1_micro": 0.8346, |
|
"eval_loss": 0.6210275292396545, |
|
"eval_runtime": 24.1347, |
|
"eval_samples_per_second": 414.342, |
|
"eval_steps_per_second": 6.505, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 1.52338445186615, |
|
"learning_rate": 0.00031787491115849327, |
|
"loss": 0.4979, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 1.9127336740493774, |
|
"learning_rate": 0.00031343283582089556, |
|
"loss": 0.4658, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 1.5914901494979858, |
|
"learning_rate": 0.00030899076048329784, |
|
"loss": 0.5625, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 2.0196564197540283, |
|
"learning_rate": 0.0003045486851457001, |
|
"loss": 0.4929, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"eval_accuracy": 0.8387, |
|
"eval_f1_macro": 0.7423130077227065, |
|
"eval_f1_micro": 0.8387, |
|
"eval_loss": 0.5875550508499146, |
|
"eval_runtime": 24.1428, |
|
"eval_samples_per_second": 414.202, |
|
"eval_steps_per_second": 6.503, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 1.3181229829788208, |
|
"learning_rate": 0.00030010660980810236, |
|
"loss": 0.4999, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 1.321296215057373, |
|
"learning_rate": 0.00029566453447050464, |
|
"loss": 0.4456, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 1.4967468976974487, |
|
"learning_rate": 0.0002912224591329069, |
|
"loss": 0.443, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 1.622883915901184, |
|
"learning_rate": 0.00028678038379530916, |
|
"loss": 0.566, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"eval_accuracy": 0.8475, |
|
"eval_f1_macro": 0.7633448680546241, |
|
"eval_f1_micro": 0.8475, |
|
"eval_loss": 0.5779463648796082, |
|
"eval_runtime": 24.1848, |
|
"eval_samples_per_second": 413.483, |
|
"eval_steps_per_second": 6.492, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 2.2820820808410645, |
|
"learning_rate": 0.00028233830845771145, |
|
"loss": 0.5382, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 1.5108492374420166, |
|
"learning_rate": 0.00027789623312011373, |
|
"loss": 0.5314, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 1.2868916988372803, |
|
"learning_rate": 0.00027345415778251596, |
|
"loss": 0.4951, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 2.0262043476104736, |
|
"learning_rate": 0.00026901208244491825, |
|
"loss": 0.4577, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"eval_accuracy": 0.8435, |
|
"eval_f1_macro": 0.7507843808294652, |
|
"eval_f1_micro": 0.8435, |
|
"eval_loss": 0.5771787762641907, |
|
"eval_runtime": 24.1138, |
|
"eval_samples_per_second": 414.7, |
|
"eval_steps_per_second": 6.511, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 1.7890022993087769, |
|
"learning_rate": 0.00026457000710732053, |
|
"loss": 0.5097, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 1.5777374505996704, |
|
"learning_rate": 0.0002601279317697228, |
|
"loss": 0.4894, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 1.6835675239562988, |
|
"learning_rate": 0.0002556858564321251, |
|
"loss": 0.4487, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 1.7419664859771729, |
|
"learning_rate": 0.0002512437810945274, |
|
"loss": 0.4233, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"eval_accuracy": 0.8476, |
|
"eval_f1_macro": 0.7624728239220356, |
|
"eval_f1_micro": 0.8476, |
|
"eval_loss": 0.5580869913101196, |
|
"eval_runtime": 24.1623, |
|
"eval_samples_per_second": 413.868, |
|
"eval_steps_per_second": 6.498, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 1.6330546140670776, |
|
"learning_rate": 0.0002468017057569296, |
|
"loss": 0.4985, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 1.5560388565063477, |
|
"learning_rate": 0.00024235963041933193, |
|
"loss": 0.4902, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 1.1412893533706665, |
|
"learning_rate": 0.0002379175550817342, |
|
"loss": 0.4811, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 1.4423662424087524, |
|
"learning_rate": 0.00023347547974413648, |
|
"loss": 0.4567, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"eval_accuracy": 0.8462, |
|
"eval_f1_macro": 0.7575976290013776, |
|
"eval_f1_micro": 0.8462, |
|
"eval_loss": 0.5687991976737976, |
|
"eval_runtime": 24.159, |
|
"eval_samples_per_second": 413.925, |
|
"eval_steps_per_second": 6.499, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 1.9873683452606201, |
|
"learning_rate": 0.00022903340440653876, |
|
"loss": 0.4455, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 1.4635204076766968, |
|
"learning_rate": 0.000224591329068941, |
|
"loss": 0.4457, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 1.56647789478302, |
|
"learning_rate": 0.00022014925373134328, |
|
"loss": 0.4223, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 1.8017354011535645, |
|
"learning_rate": 0.00021570717839374556, |
|
"loss": 0.483, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"eval_accuracy": 0.8478, |
|
"eval_f1_macro": 0.7608717953670078, |
|
"eval_f1_micro": 0.8478, |
|
"eval_loss": 0.5547010898590088, |
|
"eval_runtime": 24.1023, |
|
"eval_samples_per_second": 414.897, |
|
"eval_steps_per_second": 6.514, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 1.768385887145996, |
|
"learning_rate": 0.00021126510305614785, |
|
"loss": 0.4593, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 1.8453024625778198, |
|
"learning_rate": 0.0002068230277185501, |
|
"loss": 0.4378, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 1.5103825330734253, |
|
"learning_rate": 0.0002023809523809524, |
|
"loss": 0.455, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 1.8187497854232788, |
|
"learning_rate": 0.00019793887704335468, |
|
"loss": 0.4649, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"eval_accuracy": 0.851, |
|
"eval_f1_macro": 0.7680217546192462, |
|
"eval_f1_micro": 0.851, |
|
"eval_loss": 0.5395861864089966, |
|
"eval_runtime": 24.1531, |
|
"eval_samples_per_second": 414.025, |
|
"eval_steps_per_second": 6.5, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 2.1151626110076904, |
|
"learning_rate": 0.00019349680170575694, |
|
"loss": 0.5057, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 1.4344931840896606, |
|
"learning_rate": 0.00018905472636815922, |
|
"loss": 0.4275, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 3.410400152206421, |
|
"learning_rate": 0.00018461265103056148, |
|
"loss": 0.4973, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 2.3107552528381348, |
|
"learning_rate": 0.00018017057569296374, |
|
"loss": 0.4288, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"eval_accuracy": 0.8577, |
|
"eval_f1_macro": 0.7759139835888773, |
|
"eval_f1_micro": 0.8577, |
|
"eval_loss": 0.5235319137573242, |
|
"eval_runtime": 24.1535, |
|
"eval_samples_per_second": 414.018, |
|
"eval_steps_per_second": 6.5, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 1.2763726711273193, |
|
"learning_rate": 0.00017572850035536602, |
|
"loss": 0.5008, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 1.5869358777999878, |
|
"learning_rate": 0.0001712864250177683, |
|
"loss": 0.4581, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.450799822807312, |
|
"learning_rate": 0.0001668443496801706, |
|
"loss": 0.4103, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 1.3206963539123535, |
|
"learning_rate": 0.00016240227434257285, |
|
"loss": 0.3445, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"eval_accuracy": 0.8603, |
|
"eval_f1_macro": 0.7790782413257752, |
|
"eval_f1_micro": 0.8603, |
|
"eval_loss": 0.520423948764801, |
|
"eval_runtime": 24.0544, |
|
"eval_samples_per_second": 415.724, |
|
"eval_steps_per_second": 6.527, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 1.3110324144363403, |
|
"learning_rate": 0.00015796019900497514, |
|
"loss": 0.3278, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 1.4223313331604004, |
|
"learning_rate": 0.00015351812366737742, |
|
"loss": 0.3376, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 1.6518670320510864, |
|
"learning_rate": 0.00014907604832977968, |
|
"loss": 0.3519, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 1.5389341115951538, |
|
"learning_rate": 0.00014463397299218194, |
|
"loss": 0.3014, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"eval_accuracy": 0.8607, |
|
"eval_f1_macro": 0.786161944348828, |
|
"eval_f1_micro": 0.8607, |
|
"eval_loss": 0.5268532037734985, |
|
"eval_runtime": 24.1182, |
|
"eval_samples_per_second": 414.624, |
|
"eval_steps_per_second": 6.51, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 1.810062289237976, |
|
"learning_rate": 0.00014019189765458422, |
|
"loss": 0.3184, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 1.4731919765472412, |
|
"learning_rate": 0.00013574982231698648, |
|
"loss": 0.3457, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 1.8230247497558594, |
|
"learning_rate": 0.00013130774697938877, |
|
"loss": 0.3107, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 1.9620997905731201, |
|
"learning_rate": 0.00012686567164179105, |
|
"loss": 0.3301, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"eval_accuracy": 0.8591, |
|
"eval_f1_macro": 0.7826263587368261, |
|
"eval_f1_micro": 0.8591, |
|
"eval_loss": 0.5233541131019592, |
|
"eval_runtime": 24.1529, |
|
"eval_samples_per_second": 414.03, |
|
"eval_steps_per_second": 6.5, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 1.9270436763763428, |
|
"learning_rate": 0.0001224235963041933, |
|
"loss": 0.3273, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 1.2540065050125122, |
|
"learning_rate": 0.0001179815209665956, |
|
"loss": 0.2682, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 0.6836864948272705, |
|
"learning_rate": 0.00011353944562899787, |
|
"loss": 0.2787, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 1.7758402824401855, |
|
"learning_rate": 0.00010909737029140014, |
|
"loss": 0.3069, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"eval_accuracy": 0.8624, |
|
"eval_f1_macro": 0.785058711126236, |
|
"eval_f1_micro": 0.8624, |
|
"eval_loss": 0.5265922546386719, |
|
"eval_runtime": 24.117, |
|
"eval_samples_per_second": 414.644, |
|
"eval_steps_per_second": 6.51, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 1.3381538391113281, |
|
"learning_rate": 0.00010465529495380242, |
|
"loss": 0.2866, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 1.759792685508728, |
|
"learning_rate": 0.0001002132196162047, |
|
"loss": 0.3031, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 1.2064933776855469, |
|
"learning_rate": 9.577114427860697e-05, |
|
"loss": 0.3182, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 1.0604907274246216, |
|
"learning_rate": 9.132906894100924e-05, |
|
"loss": 0.3095, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"eval_accuracy": 0.8629, |
|
"eval_f1_macro": 0.7846414495209991, |
|
"eval_f1_micro": 0.8629, |
|
"eval_loss": 0.5154865980148315, |
|
"eval_runtime": 24.1319, |
|
"eval_samples_per_second": 414.389, |
|
"eval_steps_per_second": 6.506, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 1.9324097633361816, |
|
"learning_rate": 8.688699360341151e-05, |
|
"loss": 0.3321, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 1.9070733785629272, |
|
"learning_rate": 8.24449182658138e-05, |
|
"loss": 0.3165, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 1.3031100034713745, |
|
"learning_rate": 7.800284292821607e-05, |
|
"loss": 0.2939, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 1.479374647140503, |
|
"learning_rate": 7.356076759061834e-05, |
|
"loss": 0.3164, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"eval_accuracy": 0.8646, |
|
"eval_f1_macro": 0.7909189563960074, |
|
"eval_f1_micro": 0.8646, |
|
"eval_loss": 0.5106394290924072, |
|
"eval_runtime": 24.1589, |
|
"eval_samples_per_second": 413.927, |
|
"eval_steps_per_second": 6.499, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 1.6215256452560425, |
|
"learning_rate": 6.911869225302061e-05, |
|
"loss": 0.3524, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 1.5538333654403687, |
|
"learning_rate": 6.467661691542288e-05, |
|
"loss": 0.3354, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 1.515882134437561, |
|
"learning_rate": 6.023454157782516e-05, |
|
"loss": 0.2992, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 2.216226100921631, |
|
"learning_rate": 5.579246624022743e-05, |
|
"loss": 0.2914, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"eval_accuracy": 0.8647, |
|
"eval_f1_macro": 0.7934095832580423, |
|
"eval_f1_micro": 0.8647, |
|
"eval_loss": 0.5055064558982849, |
|
"eval_runtime": 24.1523, |
|
"eval_samples_per_second": 414.039, |
|
"eval_steps_per_second": 6.5, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 2.8203630447387695, |
|
"learning_rate": 5.135039090262971e-05, |
|
"loss": 0.321, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 2.173407793045044, |
|
"learning_rate": 4.690831556503199e-05, |
|
"loss": 0.3, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 1.960857629776001, |
|
"learning_rate": 4.2466240227434255e-05, |
|
"loss": 0.3181, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 1.7531359195709229, |
|
"learning_rate": 3.802416488983653e-05, |
|
"loss": 0.2946, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"eval_accuracy": 0.8643, |
|
"eval_f1_macro": 0.7917093314024903, |
|
"eval_f1_micro": 0.8643, |
|
"eval_loss": 0.502694308757782, |
|
"eval_runtime": 24.1319, |
|
"eval_samples_per_second": 414.389, |
|
"eval_steps_per_second": 6.506, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 1.2817922830581665, |
|
"learning_rate": 3.3582089552238805e-05, |
|
"loss": 0.2963, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 2.3058297634124756, |
|
"learning_rate": 2.9140014214641083e-05, |
|
"loss": 0.33, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 1.1777921915054321, |
|
"learning_rate": 2.4697938877043355e-05, |
|
"loss": 0.3122, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 1.6172949075698853, |
|
"learning_rate": 2.025586353944563e-05, |
|
"loss": 0.3012, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"eval_accuracy": 0.8671, |
|
"eval_f1_macro": 0.7953004112768677, |
|
"eval_f1_micro": 0.8671, |
|
"eval_loss": 0.500918984413147, |
|
"eval_runtime": 24.1653, |
|
"eval_samples_per_second": 413.817, |
|
"eval_steps_per_second": 6.497, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 1.0941059589385986, |
|
"learning_rate": 1.5813788201847902e-05, |
|
"loss": 0.2869, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 1.700844168663025, |
|
"learning_rate": 1.1371712864250177e-05, |
|
"loss": 0.2968, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 1.8403632640838623, |
|
"learning_rate": 6.929637526652452e-06, |
|
"loss": 0.272, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 1.8172844648361206, |
|
"learning_rate": 2.4875621890547264e-06, |
|
"loss": 0.3181, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"eval_accuracy": 0.8664, |
|
"eval_f1_macro": 0.7947908970820687, |
|
"eval_f1_micro": 0.8664, |
|
"eval_loss": 0.5004217028617859, |
|
"eval_runtime": 24.1769, |
|
"eval_samples_per_second": 413.617, |
|
"eval_steps_per_second": 6.494, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 2814, |
|
"total_flos": 2.751004323033907e+16, |
|
"train_loss": 0.5752294131348806, |
|
"train_runtime": 2000.5798, |
|
"train_samples_per_second": 89.974, |
|
"train_steps_per_second": 1.407 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 2814, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"total_flos": 2.751004323033907e+16, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|