SLM_vs_LLM_experiments/max_seq_length_128_experiments/google_t5/t5_base_ledgar/checkpoint-2800/trainer_state.json
{
  "best_metric": 0.5004217028617859,
  "best_model_checkpoint": "../../experiments_checkpoints/MAdAiLab/google_t5/t5_base_ledgar/checkpoint-2800",
  "epoch": 2.9850746268656714,
  "eval_steps": 100,
  "global_step": 2800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03,
      "grad_norm": 2.0540711879730225,
      "learning_rate": 0.0004955579246624023,
      "loss": 4.1527,
      "step": 25
    },
    {
      "epoch": 0.05,
      "grad_norm": 2.241663694381714,
      "learning_rate": 0.0004911158493248046,
      "loss": 2.8808,
      "step": 50
    },
    {
      "epoch": 0.08,
      "grad_norm": 2.093815803527832,
      "learning_rate": 0.00048667377398720687,
      "loss": 1.9075,
      "step": 75
    },
    {
      "epoch": 0.11,
      "grad_norm": 2.396036148071289,
      "learning_rate": 0.0004822316986496091,
      "loss": 1.443,
      "step": 100
    },
    {
      "epoch": 0.11,
      "eval_accuracy": 0.7291,
      "eval_f1_macro": 0.5312263291596806,
      "eval_f1_micro": 0.7291,
      "eval_loss": 1.113275408744812,
      "eval_runtime": 24.8696,
      "eval_samples_per_second": 402.097,
      "eval_steps_per_second": 6.313,
      "step": 100
    },
    {
      "epoch": 0.13,
      "grad_norm": 2.6226117610931396,
      "learning_rate": 0.0004777896233120114,
      "loss": 1.1778,
      "step": 125
    },
    {
      "epoch": 0.16,
      "grad_norm": 2.527249574661255,
      "learning_rate": 0.00047334754797441367,
      "loss": 1.0394,
      "step": 150
    },
    {
      "epoch": 0.19,
      "grad_norm": 2.435964584350586,
      "learning_rate": 0.00046890547263681595,
      "loss": 0.9535,
      "step": 175
    },
    {
      "epoch": 0.21,
      "grad_norm": 2.0345797538757324,
      "learning_rate": 0.00046446339729921824,
      "loss": 0.8813,
      "step": 200
    },
    {
      "epoch": 0.21,
      "eval_accuracy": 0.7712,
      "eval_f1_macro": 0.6296223926135491,
      "eval_f1_micro": 0.7712,
      "eval_loss": 0.8404093980789185,
      "eval_runtime": 24.3258,
      "eval_samples_per_second": 411.086,
      "eval_steps_per_second": 6.454,
      "step": 200
    },
    {
      "epoch": 0.24,
      "grad_norm": 4.247000217437744,
      "learning_rate": 0.0004600213219616205,
      "loss": 0.8062,
      "step": 225
    },
    {
      "epoch": 0.27,
      "grad_norm": 1.9799127578735352,
      "learning_rate": 0.00045557924662402275,
      "loss": 0.8654,
      "step": 250
    },
    {
      "epoch": 0.29,
      "grad_norm": 2.0902786254882812,
      "learning_rate": 0.000451137171286425,
      "loss": 0.8073,
      "step": 275
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.7158573865890503,
      "learning_rate": 0.00044669509594882727,
      "loss": 0.761,
      "step": 300
    },
    {
      "epoch": 0.32,
      "eval_accuracy": 0.8021,
      "eval_f1_macro": 0.6788544961571827,
      "eval_f1_micro": 0.8021,
      "eval_loss": 0.738567590713501,
      "eval_runtime": 24.2955,
      "eval_samples_per_second": 411.599,
      "eval_steps_per_second": 6.462,
      "step": 300
    },
    {
      "epoch": 0.35,
      "grad_norm": 2.2180001735687256,
      "learning_rate": 0.00044225302061122956,
      "loss": 0.7375,
      "step": 325
    },
    {
      "epoch": 0.37,
      "grad_norm": 2.029712677001953,
      "learning_rate": 0.00043781094527363184,
      "loss": 0.7601,
      "step": 350
    },
    {
      "epoch": 0.4,
      "grad_norm": 2.0835516452789307,
      "learning_rate": 0.0004333688699360341,
      "loss": 0.6685,
      "step": 375
    },
    {
      "epoch": 0.43,
      "grad_norm": 2.2809343338012695,
      "learning_rate": 0.0004289267945984364,
      "loss": 0.7358,
      "step": 400
    },
    {
      "epoch": 0.43,
      "eval_accuracy": 0.805,
      "eval_f1_macro": 0.6786916719278343,
      "eval_f1_micro": 0.805,
      "eval_loss": 0.731271505355835,
      "eval_runtime": 24.2168,
      "eval_samples_per_second": 412.937,
      "eval_steps_per_second": 6.483,
      "step": 400
    },
    {
      "epoch": 0.45,
      "grad_norm": 2.0364644527435303,
      "learning_rate": 0.00042448471926083864,
      "loss": 0.7414,
      "step": 425
    },
    {
      "epoch": 0.48,
      "grad_norm": 2.7770190238952637,
      "learning_rate": 0.00042004264392324093,
      "loss": 0.8142,
      "step": 450
    },
    {
      "epoch": 0.51,
      "grad_norm": 9.388680458068848,
      "learning_rate": 0.0004156005685856432,
      "loss": 0.7282,
      "step": 475
    },
    {
      "epoch": 0.53,
      "grad_norm": 2.113584041595459,
      "learning_rate": 0.0004111584932480455,
      "loss": 0.7624,
      "step": 500
    },
    {
      "epoch": 0.53,
      "eval_accuracy": 0.8164,
      "eval_f1_macro": 0.7134072796381032,
      "eval_f1_micro": 0.8164,
      "eval_loss": 0.6560911536216736,
      "eval_runtime": 24.2135,
      "eval_samples_per_second": 412.993,
      "eval_steps_per_second": 6.484,
      "step": 500
    },
    {
      "epoch": 0.56,
      "grad_norm": 1.490932822227478,
      "learning_rate": 0.0004067164179104478,
      "loss": 0.7123,
      "step": 525
    },
    {
      "epoch": 0.59,
      "grad_norm": 1.1674153804779053,
      "learning_rate": 0.00040227434257285007,
      "loss": 0.692,
      "step": 550
    },
    {
      "epoch": 0.61,
      "grad_norm": 2.5101075172424316,
      "learning_rate": 0.00039783226723525235,
      "loss": 0.6392,
      "step": 575
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.797776460647583,
      "learning_rate": 0.0003933901918976546,
      "loss": 0.7067,
      "step": 600
    },
    {
      "epoch": 0.64,
      "eval_accuracy": 0.821,
      "eval_f1_macro": 0.7273350797174626,
      "eval_f1_micro": 0.821,
      "eval_loss": 0.6418954133987427,
      "eval_runtime": 24.1496,
      "eval_samples_per_second": 414.086,
      "eval_steps_per_second": 6.501,
      "step": 600
    },
    {
      "epoch": 0.67,
      "grad_norm": 1.9408563375473022,
      "learning_rate": 0.00038894811656005687,
      "loss": 0.6762,
      "step": 625
    },
    {
      "epoch": 0.69,
      "grad_norm": 1.9674413204193115,
      "learning_rate": 0.00038450604122245916,
      "loss": 0.6784,
      "step": 650
    },
    {
      "epoch": 0.72,
      "grad_norm": 2.5168087482452393,
      "learning_rate": 0.00038006396588486144,
      "loss": 0.7106,
      "step": 675
    },
    {
      "epoch": 0.75,
      "grad_norm": 1.7439873218536377,
      "learning_rate": 0.0003756218905472637,
      "loss": 0.6298,
      "step": 700
    },
    {
      "epoch": 0.75,
      "eval_accuracy": 0.8254,
      "eval_f1_macro": 0.7229632924236448,
      "eval_f1_micro": 0.8254,
      "eval_loss": 0.6412187218666077,
      "eval_runtime": 24.0401,
      "eval_samples_per_second": 415.971,
      "eval_steps_per_second": 6.531,
      "step": 700
    },
    {
      "epoch": 0.77,
      "grad_norm": 1.6693323850631714,
      "learning_rate": 0.00037117981520966596,
      "loss": 0.6267,
      "step": 725
    },
    {
      "epoch": 0.8,
      "grad_norm": 2.042109251022339,
      "learning_rate": 0.00036673773987206824,
      "loss": 0.7153,
      "step": 750
    },
    {
      "epoch": 0.83,
      "grad_norm": 1.5407096147537231,
      "learning_rate": 0.0003622956645344705,
      "loss": 0.6641,
      "step": 775
    },
    {
      "epoch": 0.85,
      "grad_norm": 1.5067718029022217,
      "learning_rate": 0.00035785358919687276,
      "loss": 0.6544,
      "step": 800
    },
    {
      "epoch": 0.85,
      "eval_accuracy": 0.8217,
      "eval_f1_macro": 0.7223377361999187,
      "eval_f1_micro": 0.8217,
      "eval_loss": 0.6277242302894592,
      "eval_runtime": 24.1366,
      "eval_samples_per_second": 414.309,
      "eval_steps_per_second": 6.505,
      "step": 800
    },
    {
      "epoch": 0.88,
      "grad_norm": 2.1698966026306152,
      "learning_rate": 0.00035341151385927504,
      "loss": 0.7108,
      "step": 825
    },
    {
      "epoch": 0.91,
      "grad_norm": 1.2698220014572144,
      "learning_rate": 0.00034896943852167733,
      "loss": 0.5897,
      "step": 850
    },
    {
      "epoch": 0.93,
      "grad_norm": 2.0006988048553467,
      "learning_rate": 0.0003445273631840796,
      "loss": 0.6386,
      "step": 875
    },
    {
      "epoch": 0.96,
      "grad_norm": 1.7718091011047363,
      "learning_rate": 0.0003400852878464819,
      "loss": 0.5781,
      "step": 900
    },
    {
      "epoch": 0.96,
      "eval_accuracy": 0.8305,
      "eval_f1_macro": 0.7420217283556048,
      "eval_f1_micro": 0.8305,
      "eval_loss": 0.6054204702377319,
      "eval_runtime": 24.1901,
      "eval_samples_per_second": 413.393,
      "eval_steps_per_second": 6.49,
      "step": 900
    },
    {
      "epoch": 0.99,
      "grad_norm": 1.9609644412994385,
      "learning_rate": 0.00033564321250888413,
      "loss": 0.6415,
      "step": 925
    },
    {
      "epoch": 1.01,
      "grad_norm": 1.4652588367462158,
      "learning_rate": 0.0003312011371712864,
      "loss": 0.5699,
      "step": 950
    },
    {
      "epoch": 1.04,
      "grad_norm": 1.6652113199234009,
      "learning_rate": 0.0003267590618336887,
      "loss": 0.486,
      "step": 975
    },
    {
      "epoch": 1.07,
      "grad_norm": 1.3951687812805176,
      "learning_rate": 0.000322316986496091,
      "loss": 0.4674,
      "step": 1000
    },
    {
      "epoch": 1.07,
      "eval_accuracy": 0.8346,
      "eval_f1_macro": 0.737133370653897,
      "eval_f1_micro": 0.8346,
      "eval_loss": 0.6210275292396545,
      "eval_runtime": 24.1347,
      "eval_samples_per_second": 414.342,
      "eval_steps_per_second": 6.505,
      "step": 1000
    },
    {
      "epoch": 1.09,
      "grad_norm": 1.52338445186615,
      "learning_rate": 0.00031787491115849327,
      "loss": 0.4979,
      "step": 1025
    },
    {
      "epoch": 1.12,
      "grad_norm": 1.9127336740493774,
      "learning_rate": 0.00031343283582089556,
      "loss": 0.4658,
      "step": 1050
    },
    {
      "epoch": 1.15,
      "grad_norm": 1.5914901494979858,
      "learning_rate": 0.00030899076048329784,
      "loss": 0.5625,
      "step": 1075
    },
    {
      "epoch": 1.17,
      "grad_norm": 2.0196564197540283,
      "learning_rate": 0.0003045486851457001,
      "loss": 0.4929,
      "step": 1100
    },
    {
      "epoch": 1.17,
      "eval_accuracy": 0.8387,
      "eval_f1_macro": 0.7423130077227065,
      "eval_f1_micro": 0.8387,
      "eval_loss": 0.5875550508499146,
      "eval_runtime": 24.1428,
      "eval_samples_per_second": 414.202,
      "eval_steps_per_second": 6.503,
      "step": 1100
    },
    {
      "epoch": 1.2,
      "grad_norm": 1.3181229829788208,
      "learning_rate": 0.00030010660980810236,
      "loss": 0.4999,
      "step": 1125
    },
    {
      "epoch": 1.23,
      "grad_norm": 1.321296215057373,
      "learning_rate": 0.00029566453447050464,
      "loss": 0.4456,
      "step": 1150
    },
    {
      "epoch": 1.25,
      "grad_norm": 1.4967468976974487,
      "learning_rate": 0.0002912224591329069,
      "loss": 0.443,
      "step": 1175
    },
    {
      "epoch": 1.28,
      "grad_norm": 1.622883915901184,
      "learning_rate": 0.00028678038379530916,
      "loss": 0.566,
      "step": 1200
    },
    {
      "epoch": 1.28,
      "eval_accuracy": 0.8475,
      "eval_f1_macro": 0.7633448680546241,
      "eval_f1_micro": 0.8475,
      "eval_loss": 0.5779463648796082,
      "eval_runtime": 24.1848,
      "eval_samples_per_second": 413.483,
      "eval_steps_per_second": 6.492,
      "step": 1200
    },
    {
      "epoch": 1.31,
      "grad_norm": 2.2820820808410645,
      "learning_rate": 0.00028233830845771145,
      "loss": 0.5382,
      "step": 1225
    },
    {
      "epoch": 1.33,
      "grad_norm": 1.5108492374420166,
      "learning_rate": 0.00027789623312011373,
      "loss": 0.5314,
      "step": 1250
    },
    {
      "epoch": 1.36,
      "grad_norm": 1.2868916988372803,
      "learning_rate": 0.00027345415778251596,
      "loss": 0.4951,
      "step": 1275
    },
    {
      "epoch": 1.39,
      "grad_norm": 2.0262043476104736,
      "learning_rate": 0.00026901208244491825,
      "loss": 0.4577,
      "step": 1300
    },
    {
      "epoch": 1.39,
      "eval_accuracy": 0.8435,
      "eval_f1_macro": 0.7507843808294652,
      "eval_f1_micro": 0.8435,
      "eval_loss": 0.5771787762641907,
      "eval_runtime": 24.1138,
      "eval_samples_per_second": 414.7,
      "eval_steps_per_second": 6.511,
      "step": 1300
    },
    {
      "epoch": 1.41,
      "grad_norm": 1.7890022993087769,
      "learning_rate": 0.00026457000710732053,
      "loss": 0.5097,
      "step": 1325
    },
    {
      "epoch": 1.44,
      "grad_norm": 1.5777374505996704,
      "learning_rate": 0.0002601279317697228,
      "loss": 0.4894,
      "step": 1350
    },
    {
      "epoch": 1.47,
      "grad_norm": 1.6835675239562988,
      "learning_rate": 0.0002556858564321251,
      "loss": 0.4487,
      "step": 1375
    },
    {
      "epoch": 1.49,
      "grad_norm": 1.7419664859771729,
      "learning_rate": 0.0002512437810945274,
      "loss": 0.4233,
      "step": 1400
    },
    {
      "epoch": 1.49,
      "eval_accuracy": 0.8476,
      "eval_f1_macro": 0.7624728239220356,
      "eval_f1_micro": 0.8476,
      "eval_loss": 0.5580869913101196,
      "eval_runtime": 24.1623,
      "eval_samples_per_second": 413.868,
      "eval_steps_per_second": 6.498,
      "step": 1400
    },
    {
      "epoch": 1.52,
      "grad_norm": 1.6330546140670776,
      "learning_rate": 0.0002468017057569296,
      "loss": 0.4985,
      "step": 1425
    },
    {
      "epoch": 1.55,
      "grad_norm": 1.5560388565063477,
      "learning_rate": 0.00024235963041933193,
      "loss": 0.4902,
      "step": 1450
    },
    {
      "epoch": 1.57,
      "grad_norm": 1.1412893533706665,
      "learning_rate": 0.0002379175550817342,
      "loss": 0.4811,
      "step": 1475
    },
    {
      "epoch": 1.6,
      "grad_norm": 1.4423662424087524,
      "learning_rate": 0.00023347547974413648,
      "loss": 0.4567,
      "step": 1500
    },
    {
      "epoch": 1.6,
      "eval_accuracy": 0.8462,
      "eval_f1_macro": 0.7575976290013776,
      "eval_f1_micro": 0.8462,
      "eval_loss": 0.5687991976737976,
      "eval_runtime": 24.159,
      "eval_samples_per_second": 413.925,
      "eval_steps_per_second": 6.499,
      "step": 1500
    },
    {
      "epoch": 1.63,
      "grad_norm": 1.9873683452606201,
      "learning_rate": 0.00022903340440653876,
      "loss": 0.4455,
      "step": 1525
    },
    {
      "epoch": 1.65,
      "grad_norm": 1.4635204076766968,
      "learning_rate": 0.000224591329068941,
      "loss": 0.4457,
      "step": 1550
    },
    {
      "epoch": 1.68,
      "grad_norm": 1.56647789478302,
      "learning_rate": 0.00022014925373134328,
      "loss": 0.4223,
      "step": 1575
    },
    {
      "epoch": 1.71,
      "grad_norm": 1.8017354011535645,
      "learning_rate": 0.00021570717839374556,
      "loss": 0.483,
      "step": 1600
    },
    {
      "epoch": 1.71,
      "eval_accuracy": 0.8478,
      "eval_f1_macro": 0.7608717953670078,
      "eval_f1_micro": 0.8478,
      "eval_loss": 0.5547010898590088,
      "eval_runtime": 24.1023,
      "eval_samples_per_second": 414.897,
      "eval_steps_per_second": 6.514,
      "step": 1600
    },
    {
      "epoch": 1.73,
      "grad_norm": 1.768385887145996,
      "learning_rate": 0.00021126510305614785,
      "loss": 0.4593,
      "step": 1625
    },
    {
      "epoch": 1.76,
      "grad_norm": 1.8453024625778198,
      "learning_rate": 0.0002068230277185501,
      "loss": 0.4378,
      "step": 1650
    },
    {
      "epoch": 1.79,
      "grad_norm": 1.5103825330734253,
      "learning_rate": 0.0002023809523809524,
      "loss": 0.455,
      "step": 1675
    },
    {
      "epoch": 1.81,
      "grad_norm": 1.8187497854232788,
      "learning_rate": 0.00019793887704335468,
      "loss": 0.4649,
      "step": 1700
    },
    {
      "epoch": 1.81,
      "eval_accuracy": 0.851,
      "eval_f1_macro": 0.7680217546192462,
      "eval_f1_micro": 0.851,
      "eval_loss": 0.5395861864089966,
      "eval_runtime": 24.1531,
      "eval_samples_per_second": 414.025,
      "eval_steps_per_second": 6.5,
      "step": 1700
    },
    {
      "epoch": 1.84,
      "grad_norm": 2.1151626110076904,
      "learning_rate": 0.00019349680170575694,
      "loss": 0.5057,
      "step": 1725
    },
    {
      "epoch": 1.87,
      "grad_norm": 1.4344931840896606,
      "learning_rate": 0.00018905472636815922,
      "loss": 0.4275,
      "step": 1750
    },
    {
      "epoch": 1.89,
      "grad_norm": 3.410400152206421,
      "learning_rate": 0.00018461265103056148,
      "loss": 0.4973,
      "step": 1775
    },
    {
      "epoch": 1.92,
      "grad_norm": 2.3107552528381348,
      "learning_rate": 0.00018017057569296374,
      "loss": 0.4288,
      "step": 1800
    },
    {
      "epoch": 1.92,
      "eval_accuracy": 0.8577,
      "eval_f1_macro": 0.7759139835888773,
      "eval_f1_micro": 0.8577,
      "eval_loss": 0.5235319137573242,
      "eval_runtime": 24.1535,
      "eval_samples_per_second": 414.018,
      "eval_steps_per_second": 6.5,
      "step": 1800
    },
    {
      "epoch": 1.95,
      "grad_norm": 1.2763726711273193,
      "learning_rate": 0.00017572850035536602,
      "loss": 0.5008,
      "step": 1825
    },
    {
      "epoch": 1.97,
      "grad_norm": 1.5869358777999878,
      "learning_rate": 0.0001712864250177683,
      "loss": 0.4581,
      "step": 1850
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.450799822807312,
      "learning_rate": 0.0001668443496801706,
      "loss": 0.4103,
      "step": 1875
    },
    {
      "epoch": 2.03,
      "grad_norm": 1.3206963539123535,
      "learning_rate": 0.00016240227434257285,
      "loss": 0.3445,
      "step": 1900
    },
    {
      "epoch": 2.03,
      "eval_accuracy": 0.8603,
      "eval_f1_macro": 0.7790782413257752,
      "eval_f1_micro": 0.8603,
      "eval_loss": 0.520423948764801,
      "eval_runtime": 24.0544,
      "eval_samples_per_second": 415.724,
      "eval_steps_per_second": 6.527,
      "step": 1900
    },
    {
      "epoch": 2.05,
      "grad_norm": 1.3110324144363403,
      "learning_rate": 0.00015796019900497514,
      "loss": 0.3278,
      "step": 1925
    },
    {
      "epoch": 2.08,
      "grad_norm": 1.4223313331604004,
      "learning_rate": 0.00015351812366737742,
      "loss": 0.3376,
      "step": 1950
    },
    {
      "epoch": 2.11,
      "grad_norm": 1.6518670320510864,
      "learning_rate": 0.00014907604832977968,
      "loss": 0.3519,
      "step": 1975
    },
    {
      "epoch": 2.13,
      "grad_norm": 1.5389341115951538,
      "learning_rate": 0.00014463397299218194,
      "loss": 0.3014,
      "step": 2000
    },
    {
      "epoch": 2.13,
      "eval_accuracy": 0.8607,
      "eval_f1_macro": 0.786161944348828,
      "eval_f1_micro": 0.8607,
      "eval_loss": 0.5268532037734985,
      "eval_runtime": 24.1182,
      "eval_samples_per_second": 414.624,
      "eval_steps_per_second": 6.51,
      "step": 2000
    },
    {
      "epoch": 2.16,
      "grad_norm": 1.810062289237976,
      "learning_rate": 0.00014019189765458422,
      "loss": 0.3184,
      "step": 2025
    },
    {
      "epoch": 2.19,
      "grad_norm": 1.4731919765472412,
      "learning_rate": 0.00013574982231698648,
      "loss": 0.3457,
      "step": 2050
    },
    {
      "epoch": 2.21,
      "grad_norm": 1.8230247497558594,
      "learning_rate": 0.00013130774697938877,
      "loss": 0.3107,
      "step": 2075
    },
    {
      "epoch": 2.24,
      "grad_norm": 1.9620997905731201,
      "learning_rate": 0.00012686567164179105,
      "loss": 0.3301,
      "step": 2100
    },
    {
      "epoch": 2.24,
      "eval_accuracy": 0.8591,
      "eval_f1_macro": 0.7826263587368261,
      "eval_f1_micro": 0.8591,
      "eval_loss": 0.5233541131019592,
      "eval_runtime": 24.1529,
      "eval_samples_per_second": 414.03,
      "eval_steps_per_second": 6.5,
      "step": 2100
    },
    {
      "epoch": 2.27,
      "grad_norm": 1.9270436763763428,
      "learning_rate": 0.0001224235963041933,
      "loss": 0.3273,
      "step": 2125
    },
    {
      "epoch": 2.29,
      "grad_norm": 1.2540065050125122,
      "learning_rate": 0.0001179815209665956,
      "loss": 0.2682,
      "step": 2150
    },
    {
      "epoch": 2.32,
      "grad_norm": 0.6836864948272705,
      "learning_rate": 0.00011353944562899787,
      "loss": 0.2787,
      "step": 2175
    },
    {
      "epoch": 2.35,
      "grad_norm": 1.7758402824401855,
      "learning_rate": 0.00010909737029140014,
      "loss": 0.3069,
      "step": 2200
    },
    {
      "epoch": 2.35,
      "eval_accuracy": 0.8624,
      "eval_f1_macro": 0.785058711126236,
      "eval_f1_micro": 0.8624,
      "eval_loss": 0.5265922546386719,
      "eval_runtime": 24.117,
      "eval_samples_per_second": 414.644,
      "eval_steps_per_second": 6.51,
      "step": 2200
    },
    {
      "epoch": 2.37,
      "grad_norm": 1.3381538391113281,
      "learning_rate": 0.00010465529495380242,
      "loss": 0.2866,
      "step": 2225
    },
    {
      "epoch": 2.4,
      "grad_norm": 1.759792685508728,
      "learning_rate": 0.0001002132196162047,
      "loss": 0.3031,
      "step": 2250
    },
    {
      "epoch": 2.43,
      "grad_norm": 1.2064933776855469,
      "learning_rate": 9.577114427860697e-05,
      "loss": 0.3182,
      "step": 2275
    },
    {
      "epoch": 2.45,
      "grad_norm": 1.0604907274246216,
      "learning_rate": 9.132906894100924e-05,
      "loss": 0.3095,
      "step": 2300
    },
    {
      "epoch": 2.45,
      "eval_accuracy": 0.8629,
      "eval_f1_macro": 0.7846414495209991,
      "eval_f1_micro": 0.8629,
      "eval_loss": 0.5154865980148315,
      "eval_runtime": 24.1319,
      "eval_samples_per_second": 414.389,
      "eval_steps_per_second": 6.506,
      "step": 2300
    },
    {
      "epoch": 2.48,
      "grad_norm": 1.9324097633361816,
      "learning_rate": 8.688699360341151e-05,
      "loss": 0.3321,
      "step": 2325
    },
    {
      "epoch": 2.51,
      "grad_norm": 1.9070733785629272,
      "learning_rate": 8.24449182658138e-05,
      "loss": 0.3165,
      "step": 2350
    },
    {
      "epoch": 2.53,
      "grad_norm": 1.3031100034713745,
      "learning_rate": 7.800284292821607e-05,
      "loss": 0.2939,
      "step": 2375
    },
    {
      "epoch": 2.56,
      "grad_norm": 1.479374647140503,
      "learning_rate": 7.356076759061834e-05,
      "loss": 0.3164,
      "step": 2400
    },
    {
      "epoch": 2.56,
      "eval_accuracy": 0.8646,
      "eval_f1_macro": 0.7909189563960074,
      "eval_f1_micro": 0.8646,
      "eval_loss": 0.5106394290924072,
      "eval_runtime": 24.1589,
      "eval_samples_per_second": 413.927,
      "eval_steps_per_second": 6.499,
      "step": 2400
    },
    {
      "epoch": 2.59,
      "grad_norm": 1.6215256452560425,
      "learning_rate": 6.911869225302061e-05,
      "loss": 0.3524,
      "step": 2425
    },
    {
      "epoch": 2.61,
      "grad_norm": 1.5538333654403687,
      "learning_rate": 6.467661691542288e-05,
      "loss": 0.3354,
      "step": 2450
    },
    {
      "epoch": 2.64,
      "grad_norm": 1.515882134437561,
      "learning_rate": 6.023454157782516e-05,
      "loss": 0.2992,
      "step": 2475
    },
    {
      "epoch": 2.67,
      "grad_norm": 2.216226100921631,
      "learning_rate": 5.579246624022743e-05,
      "loss": 0.2914,
      "step": 2500
    },
    {
      "epoch": 2.67,
      "eval_accuracy": 0.8647,
      "eval_f1_macro": 0.7934095832580423,
      "eval_f1_micro": 0.8647,
      "eval_loss": 0.5055064558982849,
      "eval_runtime": 24.1523,
      "eval_samples_per_second": 414.039,
      "eval_steps_per_second": 6.5,
      "step": 2500
    },
    {
      "epoch": 2.69,
      "grad_norm": 2.8203630447387695,
      "learning_rate": 5.135039090262971e-05,
      "loss": 0.321,
      "step": 2525
    },
    {
      "epoch": 2.72,
      "grad_norm": 2.173407793045044,
      "learning_rate": 4.690831556503199e-05,
      "loss": 0.3,
      "step": 2550
    },
    {
      "epoch": 2.75,
      "grad_norm": 1.960857629776001,
      "learning_rate": 4.2466240227434255e-05,
      "loss": 0.3181,
      "step": 2575
    },
    {
      "epoch": 2.77,
      "grad_norm": 1.7531359195709229,
      "learning_rate": 3.802416488983653e-05,
      "loss": 0.2946,
      "step": 2600
    },
    {
      "epoch": 2.77,
      "eval_accuracy": 0.8643,
      "eval_f1_macro": 0.7917093314024903,
      "eval_f1_micro": 0.8643,
      "eval_loss": 0.502694308757782,
      "eval_runtime": 24.1319,
      "eval_samples_per_second": 414.389,
      "eval_steps_per_second": 6.506,
      "step": 2600
    },
    {
      "epoch": 2.8,
      "grad_norm": 1.2817922830581665,
      "learning_rate": 3.3582089552238805e-05,
      "loss": 0.2963,
      "step": 2625
    },
    {
      "epoch": 2.83,
      "grad_norm": 2.3058297634124756,
      "learning_rate": 2.9140014214641083e-05,
      "loss": 0.33,
      "step": 2650
    },
    {
      "epoch": 2.85,
      "grad_norm": 1.1777921915054321,
      "learning_rate": 2.4697938877043355e-05,
      "loss": 0.3122,
      "step": 2675
    },
    {
      "epoch": 2.88,
      "grad_norm": 1.6172949075698853,
      "learning_rate": 2.025586353944563e-05,
      "loss": 0.3012,
      "step": 2700
    },
    {
      "epoch": 2.88,
      "eval_accuracy": 0.8671,
      "eval_f1_macro": 0.7953004112768677,
      "eval_f1_micro": 0.8671,
      "eval_loss": 0.500918984413147,
      "eval_runtime": 24.1653,
      "eval_samples_per_second": 413.817,
      "eval_steps_per_second": 6.497,
      "step": 2700
    },
    {
      "epoch": 2.91,
      "grad_norm": 1.0941059589385986,
      "learning_rate": 1.5813788201847902e-05,
      "loss": 0.2869,
      "step": 2725
    },
    {
      "epoch": 2.93,
      "grad_norm": 1.700844168663025,
      "learning_rate": 1.1371712864250177e-05,
      "loss": 0.2968,
      "step": 2750
    },
    {
      "epoch": 2.96,
      "grad_norm": 1.8403632640838623,
      "learning_rate": 6.929637526652452e-06,
      "loss": 0.272,
      "step": 2775
    },
    {
      "epoch": 2.99,
      "grad_norm": 1.8172844648361206,
      "learning_rate": 2.4875621890547264e-06,
      "loss": 0.3181,
      "step": 2800
    },
    {
      "epoch": 2.99,
      "eval_accuracy": 0.8664,
      "eval_f1_macro": 0.7947908970820687,
      "eval_f1_micro": 0.8664,
      "eval_loss": 0.5004217028617859,
      "eval_runtime": 24.1769,
      "eval_samples_per_second": 413.617,
      "eval_steps_per_second": 6.494,
      "step": 2800
    }
  ],
  "logging_steps": 25,
  "max_steps": 2814,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "total_flos": 2.737317734462259e+16,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}