SLM_vs_LLM_experiments/max_seq_length_128_experiments/google_t5/t5_small_scotus/trainer_state.json
{
  "best_metric": 1.6132301092147827,
  "best_model_checkpoint": "../../experiments_checkpoints/MAdAiLab/google_t5/t5_small_scotus/checkpoint-450",
  "epoch": 3.0,
  "eval_steps": 50,
  "global_step": 471,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.06,
      "grad_norm": 2.1600992679595947,
      "learning_rate": 0.0004893842887473461,
      "loss": 2.2233,
      "step": 10
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.9564061164855957,
      "learning_rate": 0.00047876857749469217,
      "loss": 2.1218,
      "step": 20
    },
    {
      "epoch": 0.19,
      "grad_norm": 2.9564528465270996,
      "learning_rate": 0.0004681528662420382,
      "loss": 2.1816,
      "step": 30
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.909942626953125,
      "learning_rate": 0.00045753715498938433,
      "loss": 2.1508,
      "step": 40
    },
    {
      "epoch": 0.32,
      "grad_norm": 3.1194334030151367,
      "learning_rate": 0.0004469214437367304,
      "loss": 2.1902,
      "step": 50
    },
    {
      "epoch": 0.32,
      "eval_accuracy": 0.16857142857142857,
      "eval_f1_macro": 0.04145091549671702,
      "eval_f1_micro": 0.16857142857142857,
      "eval_loss": 2.1837778091430664,
      "eval_runtime": 1.1322,
      "eval_samples_per_second": 1236.51,
      "eval_steps_per_second": 38.862,
      "step": 50
    },
    {
      "epoch": 0.38,
      "grad_norm": 1.9599010944366455,
      "learning_rate": 0.00043630573248407644,
      "loss": 2.1107,
      "step": 60
    },
    {
      "epoch": 0.45,
      "grad_norm": 2.101072072982788,
      "learning_rate": 0.00042569002123142254,
      "loss": 2.0523,
      "step": 70
    },
    {
      "epoch": 0.51,
      "grad_norm": 2.113231658935547,
      "learning_rate": 0.0004150743099787686,
      "loss": 1.9695,
      "step": 80
    },
    {
      "epoch": 0.57,
      "grad_norm": 2.977862596511841,
      "learning_rate": 0.0004044585987261147,
      "loss": 1.832,
      "step": 90
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.943853735923767,
      "learning_rate": 0.00039384288747346076,
      "loss": 1.7893,
      "step": 100
    },
    {
      "epoch": 0.64,
      "eval_accuracy": 0.44357142857142856,
      "eval_f1_macro": 0.17736406726073506,
      "eval_f1_micro": 0.44357142857142856,
      "eval_loss": 1.8674652576446533,
      "eval_runtime": 1.1327,
      "eval_samples_per_second": 1235.935,
      "eval_steps_per_second": 38.844,
      "step": 100
    },
    {
      "epoch": 0.7,
      "grad_norm": 3.692411184310913,
      "learning_rate": 0.0003832271762208068,
      "loss": 1.7928,
      "step": 110
    },
    {
      "epoch": 0.76,
      "grad_norm": 2.646707534790039,
      "learning_rate": 0.00037261146496815286,
      "loss": 1.8464,
      "step": 120
    },
    {
      "epoch": 0.83,
      "grad_norm": 2.7998580932617188,
      "learning_rate": 0.0003619957537154989,
      "loss": 1.6814,
      "step": 130
    },
    {
      "epoch": 0.89,
      "grad_norm": 2.44093656539917,
      "learning_rate": 0.000351380042462845,
      "loss": 1.6501,
      "step": 140
    },
    {
      "epoch": 0.96,
      "grad_norm": 2.486459493637085,
      "learning_rate": 0.0003407643312101911,
      "loss": 1.7871,
      "step": 150
    },
    {
      "epoch": 0.96,
      "eval_accuracy": 0.45285714285714285,
      "eval_f1_macro": 0.2042937393977534,
      "eval_f1_micro": 0.45285714285714285,
      "eval_loss": 1.7416280508041382,
      "eval_runtime": 1.1304,
      "eval_samples_per_second": 1238.525,
      "eval_steps_per_second": 38.925,
      "step": 150
    },
    {
      "epoch": 1.02,
      "grad_norm": 2.7709107398986816,
      "learning_rate": 0.00033014861995753713,
      "loss": 1.6403,
      "step": 160
    },
    {
      "epoch": 1.08,
      "grad_norm": 2.4056599140167236,
      "learning_rate": 0.00031953290870488323,
      "loss": 1.5545,
      "step": 170
    },
    {
      "epoch": 1.15,
      "grad_norm": 3.063161611557007,
      "learning_rate": 0.0003089171974522293,
      "loss": 1.6633,
      "step": 180
    },
    {
      "epoch": 1.21,
      "grad_norm": 2.0999534130096436,
      "learning_rate": 0.00029830148619957534,
      "loss": 1.6847,
      "step": 190
    },
    {
      "epoch": 1.27,
      "grad_norm": 2.199518918991089,
      "learning_rate": 0.00028768577494692145,
      "loss": 1.5347,
      "step": 200
    },
    {
      "epoch": 1.27,
      "eval_accuracy": 0.485,
      "eval_f1_macro": 0.23486394838089752,
      "eval_f1_micro": 0.485,
      "eval_loss": 1.6757386922836304,
      "eval_runtime": 1.1337,
      "eval_samples_per_second": 1234.865,
      "eval_steps_per_second": 38.81,
      "step": 200
    },
    {
      "epoch": 1.34,
      "grad_norm": 3.293074607849121,
      "learning_rate": 0.0002770700636942675,
      "loss": 1.5766,
      "step": 210
    },
    {
      "epoch": 1.4,
      "grad_norm": 3.4181644916534424,
      "learning_rate": 0.0002664543524416136,
      "loss": 1.5088,
      "step": 220
    },
    {
      "epoch": 1.46,
      "grad_norm": 2.5669760704040527,
      "learning_rate": 0.00025583864118895966,
      "loss": 1.5471,
      "step": 230
    },
    {
      "epoch": 1.53,
      "grad_norm": 2.050304889678955,
      "learning_rate": 0.0002452229299363057,
      "loss": 1.4792,
      "step": 240
    },
    {
      "epoch": 1.59,
      "grad_norm": 2.3529515266418457,
      "learning_rate": 0.00023460721868365182,
      "loss": 1.4821,
      "step": 250
    },
    {
      "epoch": 1.59,
      "eval_accuracy": 0.5078571428571429,
      "eval_f1_macro": 0.26062104260053665,
      "eval_f1_micro": 0.5078571428571429,
      "eval_loss": 1.66255784034729,
      "eval_runtime": 1.1363,
      "eval_samples_per_second": 1232.077,
      "eval_steps_per_second": 38.722,
      "step": 250
    },
    {
      "epoch": 1.66,
      "grad_norm": 2.6943764686584473,
      "learning_rate": 0.0002239915074309979,
      "loss": 1.5321,
      "step": 260
    },
    {
      "epoch": 1.72,
      "grad_norm": 2.6862621307373047,
      "learning_rate": 0.00021337579617834395,
      "loss": 1.5375,
      "step": 270
    },
    {
      "epoch": 1.78,
      "grad_norm": 2.69085431098938,
      "learning_rate": 0.00020276008492569003,
      "loss": 1.4125,
      "step": 280
    },
    {
      "epoch": 1.85,
      "grad_norm": 2.574335813522339,
      "learning_rate": 0.0001921443736730361,
      "loss": 1.4882,
      "step": 290
    },
    {
      "epoch": 1.91,
      "grad_norm": 2.3319945335388184,
      "learning_rate": 0.00018152866242038217,
      "loss": 1.3521,
      "step": 300
    },
    {
      "epoch": 1.91,
      "eval_accuracy": 0.5064285714285715,
      "eval_f1_macro": 0.26800393899192637,
      "eval_f1_micro": 0.5064285714285715,
      "eval_loss": 1.6865381002426147,
      "eval_runtime": 1.1387,
      "eval_samples_per_second": 1229.429,
      "eval_steps_per_second": 38.639,
      "step": 300
    },
    {
      "epoch": 1.97,
      "grad_norm": 2.8269996643066406,
      "learning_rate": 0.00017091295116772822,
      "loss": 1.446,
      "step": 310
    },
    {
      "epoch": 2.04,
      "grad_norm": 2.464341163635254,
      "learning_rate": 0.0001602972399150743,
      "loss": 1.3987,
      "step": 320
    },
    {
      "epoch": 2.1,
      "grad_norm": 3.2286934852600098,
      "learning_rate": 0.00014968152866242038,
      "loss": 1.3692,
      "step": 330
    },
    {
      "epoch": 2.17,
      "grad_norm": 3.1505677700042725,
      "learning_rate": 0.00013906581740976646,
      "loss": 1.4589,
      "step": 340
    },
    {
      "epoch": 2.23,
      "grad_norm": 3.5956990718841553,
      "learning_rate": 0.00012845010615711254,
      "loss": 1.3616,
      "step": 350
    },
    {
      "epoch": 2.23,
      "eval_accuracy": 0.5092857142857142,
      "eval_f1_macro": 0.29308687082984397,
      "eval_f1_micro": 0.5092857142857142,
      "eval_loss": 1.6213804483413696,
      "eval_runtime": 1.143,
      "eval_samples_per_second": 1224.85,
      "eval_steps_per_second": 38.495,
      "step": 350
    },
    {
      "epoch": 2.29,
      "grad_norm": 1.9094513654708862,
      "learning_rate": 0.0001178343949044586,
      "loss": 1.3435,
      "step": 360
    },
    {
      "epoch": 2.36,
      "grad_norm": 3.655705451965332,
      "learning_rate": 0.00010721868365180467,
      "loss": 1.303,
      "step": 370
    },
    {
      "epoch": 2.42,
      "grad_norm": 2.700932741165161,
      "learning_rate": 9.660297239915075e-05,
      "loss": 1.4455,
      "step": 380
    },
    {
      "epoch": 2.48,
      "grad_norm": 2.7470288276672363,
      "learning_rate": 8.598726114649682e-05,
      "loss": 1.3698,
      "step": 390
    },
    {
      "epoch": 2.55,
      "grad_norm": 3.352484941482544,
      "learning_rate": 7.537154989384288e-05,
      "loss": 1.2932,
      "step": 400
    },
    {
      "epoch": 2.55,
      "eval_accuracy": 0.5171428571428571,
      "eval_f1_macro": 0.2861431037809299,
      "eval_f1_micro": 0.5171428571428571,
      "eval_loss": 1.6142219305038452,
      "eval_runtime": 1.1495,
      "eval_samples_per_second": 1217.909,
      "eval_steps_per_second": 38.277,
      "step": 400
    },
    {
      "epoch": 2.61,
      "grad_norm": 3.296964168548584,
      "learning_rate": 6.475583864118896e-05,
      "loss": 1.4115,
      "step": 410
    },
    {
      "epoch": 2.68,
      "grad_norm": 3.244579553604126,
      "learning_rate": 5.414012738853504e-05,
      "loss": 1.4115,
      "step": 420
    },
    {
      "epoch": 2.74,
      "grad_norm": 3.014683961868286,
      "learning_rate": 4.35244161358811e-05,
      "loss": 1.2962,
      "step": 430
    },
    {
      "epoch": 2.8,
      "grad_norm": 2.6964550018310547,
      "learning_rate": 3.2908704883227177e-05,
      "loss": 1.249,
      "step": 440
    },
    {
      "epoch": 2.87,
      "grad_norm": 2.9605798721313477,
      "learning_rate": 2.2292993630573246e-05,
      "loss": 1.3028,
      "step": 450
    },
    {
      "epoch": 2.87,
      "eval_accuracy": 0.515,
      "eval_f1_macro": 0.2879233006589316,
      "eval_f1_micro": 0.515,
      "eval_loss": 1.6132301092147827,
      "eval_runtime": 1.2104,
      "eval_samples_per_second": 1156.656,
      "eval_steps_per_second": 36.352,
      "step": 450
    },
    {
      "epoch": 2.93,
      "grad_norm": 3.9791924953460693,
      "learning_rate": 1.167728237791932e-05,
      "loss": 1.2799,
      "step": 460
    },
    {
      "epoch": 2.99,
      "grad_norm": 3.5757694244384766,
      "learning_rate": 1.0615711252653927e-06,
      "loss": 1.3862,
      "step": 470
    },
    {
      "epoch": 3.0,
      "step": 471,
      "total_flos": 513085422239744.0,
      "train_loss": 1.6059220459810488,
      "train_runtime": 55.8898,
      "train_samples_per_second": 268.386,
      "train_steps_per_second": 8.427
    }
  ],
  "logging_steps": 10,
  "max_steps": 471,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 50,
  "total_flos": 513085422239744.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}