SLM_vs_LLM_experiments/max_seq_length_128_experiments/LoRA/Qwen/Qwen1.5_7B_LoRA_coastalcph/lex_glue/trainer_state.json
{
  "best_metric": 1.6124553680419922,
  "best_model_checkpoint": "../experiments_checkpoints/LoRA/Qwen/Qwen1.5_7B_LoRA_coastalcph/lex_glue/checkpoint-400",
  "epoch": 3.0,
  "eval_steps": 50,
  "global_step": 471,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.06,
      "grad_norm": 83.9561996459961,
      "learning_rate": 4.893842887473461e-05,
      "loss": 8.475,
      "step": 10
    },
    {
      "epoch": 0.13,
      "grad_norm": 91.12702941894531,
      "learning_rate": 4.787685774946922e-05,
      "loss": 5.743,
      "step": 20
    },
    {
      "epoch": 0.19,
      "grad_norm": 89.63079071044922,
      "learning_rate": 4.681528662420383e-05,
      "loss": 4.143,
      "step": 30
    },
    {
      "epoch": 0.25,
      "grad_norm": 232.11468505859375,
      "learning_rate": 4.575371549893843e-05,
      "loss": 2.7887,
      "step": 40
    },
    {
      "epoch": 0.32,
      "grad_norm": 57.7495231628418,
      "learning_rate": 4.469214437367304e-05,
      "loss": 2.3973,
      "step": 50
    },
    {
      "epoch": 0.32,
      "eval_accuracy": 0.38,
      "eval_f1_macro": 0.16768233889185707,
      "eval_f1_micro": 0.38,
      "eval_loss": 2.194821357727051,
      "eval_runtime": 17.0388,
      "eval_samples_per_second": 82.165,
      "eval_steps_per_second": 2.582,
      "step": 50
    },
    {
      "epoch": 0.38,
      "grad_norm": 60.894527435302734,
      "learning_rate": 4.3630573248407646e-05,
      "loss": 1.8434,
      "step": 60
    },
    {
      "epoch": 0.45,
      "grad_norm": 33.58325958251953,
      "learning_rate": 4.256900212314226e-05,
      "loss": 1.732,
      "step": 70
    },
    {
      "epoch": 0.51,
      "grad_norm": 63.34943771362305,
      "learning_rate": 4.150743099787686e-05,
      "loss": 1.8363,
      "step": 80
    },
    {
      "epoch": 0.57,
      "grad_norm": 46.44338607788086,
      "learning_rate": 4.044585987261147e-05,
      "loss": 1.7555,
      "step": 90
    },
    {
      "epoch": 0.64,
      "grad_norm": 43.241111755371094,
      "learning_rate": 3.9384288747346076e-05,
      "loss": 1.6438,
      "step": 100
    },
    {
      "epoch": 0.64,
      "eval_accuracy": 0.42714285714285716,
      "eval_f1_macro": 0.24662591434994435,
      "eval_f1_micro": 0.42714285714285716,
      "eval_loss": 1.8118304014205933,
      "eval_runtime": 17.1543,
      "eval_samples_per_second": 81.612,
      "eval_steps_per_second": 2.565,
      "step": 100
    },
    {
      "epoch": 0.7,
      "grad_norm": 112.62821197509766,
      "learning_rate": 3.8322717622080686e-05,
      "loss": 1.7039,
      "step": 110
    },
    {
      "epoch": 0.76,
      "grad_norm": 59.15960693359375,
      "learning_rate": 3.7261146496815283e-05,
      "loss": 1.7463,
      "step": 120
    },
    {
      "epoch": 0.83,
      "grad_norm": 64.70002746582031,
      "learning_rate": 3.6199575371549894e-05,
      "loss": 1.6117,
      "step": 130
    },
    {
      "epoch": 0.89,
      "grad_norm": 66.38226318359375,
      "learning_rate": 3.51380042462845e-05,
      "loss": 1.6342,
      "step": 140
    },
    {
      "epoch": 0.96,
      "grad_norm": 82.49270629882812,
      "learning_rate": 3.407643312101911e-05,
      "loss": 1.7379,
      "step": 150
    },
    {
      "epoch": 0.96,
      "eval_accuracy": 0.47714285714285715,
      "eval_f1_macro": 0.27038446232193303,
      "eval_f1_micro": 0.47714285714285715,
      "eval_loss": 1.7119196653366089,
      "eval_runtime": 17.2159,
      "eval_samples_per_second": 81.32,
      "eval_steps_per_second": 2.556,
      "step": 150
    },
    {
      "epoch": 1.02,
      "grad_norm": 72.54532623291016,
      "learning_rate": 3.301486199575371e-05,
      "loss": 1.5684,
      "step": 160
    },
    {
      "epoch": 1.08,
      "grad_norm": 43.69814682006836,
      "learning_rate": 3.1953290870488323e-05,
      "loss": 1.4182,
      "step": 170
    },
    {
      "epoch": 1.15,
      "grad_norm": 101.71517181396484,
      "learning_rate": 3.089171974522293e-05,
      "loss": 1.4734,
      "step": 180
    },
    {
      "epoch": 1.21,
      "grad_norm": 75.00550842285156,
      "learning_rate": 2.9830148619957538e-05,
      "loss": 1.4867,
      "step": 190
    },
    {
      "epoch": 1.27,
      "grad_norm": 72.43107604980469,
      "learning_rate": 2.8768577494692145e-05,
      "loss": 1.409,
      "step": 200
    },
    {
      "epoch": 1.27,
      "eval_accuracy": 0.48714285714285716,
      "eval_f1_macro": 0.2973212519943923,
      "eval_f1_micro": 0.48714285714285716,
      "eval_loss": 1.748794674873352,
      "eval_runtime": 17.186,
      "eval_samples_per_second": 81.461,
      "eval_steps_per_second": 2.56,
      "step": 200
    },
    {
      "epoch": 1.34,
      "grad_norm": 53.7850456237793,
      "learning_rate": 2.7707006369426753e-05,
      "loss": 1.5008,
      "step": 210
    },
    {
      "epoch": 1.4,
      "grad_norm": 83.63137817382812,
      "learning_rate": 2.664543524416136e-05,
      "loss": 1.3074,
      "step": 220
    },
    {
      "epoch": 1.46,
      "grad_norm": 81.88774108886719,
      "learning_rate": 2.5583864118895967e-05,
      "loss": 1.3816,
      "step": 230
    },
    {
      "epoch": 1.53,
      "grad_norm": 35.77708053588867,
      "learning_rate": 2.4522292993630575e-05,
      "loss": 1.2949,
      "step": 240
    },
    {
      "epoch": 1.59,
      "grad_norm": 66.99419403076172,
      "learning_rate": 2.3460721868365182e-05,
      "loss": 1.2443,
      "step": 250
    },
    {
      "epoch": 1.59,
      "eval_accuracy": 0.5364285714285715,
      "eval_f1_macro": 0.33337140604249654,
      "eval_f1_micro": 0.5364285714285715,
      "eval_loss": 1.6798213720321655,
      "eval_runtime": 17.2342,
      "eval_samples_per_second": 81.234,
      "eval_steps_per_second": 2.553,
      "step": 250
    },
    {
      "epoch": 1.66,
      "grad_norm": 42.91401290893555,
      "learning_rate": 2.239915074309979e-05,
      "loss": 1.3525,
      "step": 260
    },
    {
      "epoch": 1.72,
      "grad_norm": 43.42304611206055,
      "learning_rate": 2.1337579617834397e-05,
      "loss": 1.324,
      "step": 270
    },
    {
      "epoch": 1.78,
      "grad_norm": 53.237369537353516,
      "learning_rate": 2.0276008492569004e-05,
      "loss": 1.2523,
      "step": 280
    },
    {
      "epoch": 1.85,
      "grad_norm": 71.94126892089844,
      "learning_rate": 1.921443736730361e-05,
      "loss": 1.3336,
      "step": 290
    },
    {
      "epoch": 1.91,
      "grad_norm": 39.801326751708984,
      "learning_rate": 1.8152866242038215e-05,
      "loss": 1.1602,
      "step": 300
    },
    {
      "epoch": 1.91,
      "eval_accuracy": 0.5242857142857142,
      "eval_f1_macro": 0.35727383145505925,
      "eval_f1_micro": 0.5242857142857142,
      "eval_loss": 1.6131696701049805,
      "eval_runtime": 17.2122,
      "eval_samples_per_second": 81.337,
      "eval_steps_per_second": 2.556,
      "step": 300
    },
    {
      "epoch": 1.97,
      "grad_norm": 40.524967193603516,
      "learning_rate": 1.7091295116772823e-05,
      "loss": 1.3412,
      "step": 310
    },
    {
      "epoch": 2.04,
      "grad_norm": 53.3381462097168,
      "learning_rate": 1.602972399150743e-05,
      "loss": 1.0643,
      "step": 320
    },
    {
      "epoch": 2.1,
      "grad_norm": 37.6945686340332,
      "learning_rate": 1.4968152866242039e-05,
      "loss": 1.0189,
      "step": 330
    },
    {
      "epoch": 2.17,
      "grad_norm": 36.366859436035156,
      "learning_rate": 1.3906581740976646e-05,
      "loss": 1.1482,
      "step": 340
    },
    {
      "epoch": 2.23,
      "grad_norm": 66.20160675048828,
      "learning_rate": 1.2845010615711253e-05,
      "loss": 1.1191,
      "step": 350
    },
    {
      "epoch": 2.23,
      "eval_accuracy": 0.5385714285714286,
      "eval_f1_macro": 0.3914097926983185,
      "eval_f1_micro": 0.5385714285714286,
      "eval_loss": 1.6507365703582764,
      "eval_runtime": 17.2629,
      "eval_samples_per_second": 81.099,
      "eval_steps_per_second": 2.549,
      "step": 350
    },
    {
      "epoch": 2.29,
      "grad_norm": 22.359830856323242,
      "learning_rate": 1.178343949044586e-05,
      "loss": 0.8946,
      "step": 360
    },
    {
      "epoch": 2.36,
      "grad_norm": 64.09331512451172,
      "learning_rate": 1.0721868365180468e-05,
      "loss": 0.9666,
      "step": 370
    },
    {
      "epoch": 2.42,
      "grad_norm": 74.93325805664062,
      "learning_rate": 9.660297239915075e-06,
      "loss": 1.0045,
      "step": 380
    },
    {
      "epoch": 2.48,
      "grad_norm": 67.56449127197266,
      "learning_rate": 8.598726114649681e-06,
      "loss": 0.9639,
      "step": 390
    },
    {
      "epoch": 2.55,
      "grad_norm": 34.843387603759766,
      "learning_rate": 7.537154989384289e-06,
      "loss": 0.8907,
      "step": 400
    },
    {
      "epoch": 2.55,
      "eval_accuracy": 0.5507142857142857,
      "eval_f1_macro": 0.4051157419199404,
      "eval_f1_micro": 0.5507142857142857,
      "eval_loss": 1.6124553680419922,
      "eval_runtime": 17.2236,
      "eval_samples_per_second": 81.284,
      "eval_steps_per_second": 2.555,
      "step": 400
    },
    {
      "epoch": 2.61,
      "grad_norm": 28.607370376586914,
      "learning_rate": 6.4755838641188965e-06,
      "loss": 1.0082,
      "step": 410
    },
    {
      "epoch": 2.68,
      "grad_norm": 57.64848327636719,
      "learning_rate": 5.414012738853504e-06,
      "loss": 1.0316,
      "step": 420
    },
    {
      "epoch": 2.74,
      "grad_norm": 37.1605224609375,
      "learning_rate": 4.35244161358811e-06,
      "loss": 0.9497,
      "step": 430
    },
    {
      "epoch": 2.8,
      "grad_norm": 53.83654022216797,
      "learning_rate": 3.2908704883227177e-06,
      "loss": 0.8637,
      "step": 440
    },
    {
      "epoch": 2.87,
      "grad_norm": 43.51070022583008,
      "learning_rate": 2.229299363057325e-06,
      "loss": 0.9012,
      "step": 450
    },
    {
      "epoch": 2.87,
      "eval_accuracy": 0.5528571428571428,
      "eval_f1_macro": 0.4087663891394323,
      "eval_f1_micro": 0.5528571428571428,
      "eval_loss": 1.644508957862854,
      "eval_runtime": 17.2223,
      "eval_samples_per_second": 81.29,
      "eval_steps_per_second": 2.555,
      "step": 450
    },
    {
      "epoch": 2.93,
      "grad_norm": 41.90240478515625,
      "learning_rate": 1.167728237791932e-06,
      "loss": 0.9139,
      "step": 460
    },
    {
      "epoch": 2.99,
      "grad_norm": 49.19029235839844,
      "learning_rate": 1.0615711252653928e-07,
      "loss": 0.9156,
      "step": 470
    },
    {
      "epoch": 3.0,
      "step": 471,
      "total_flos": 7.867250325323776e+16,
      "train_loss": 1.6644585157908705,
      "train_runtime": 739.0576,
      "train_samples_per_second": 20.296,
      "train_steps_per_second": 0.637
    }
  ],
  "logging_steps": 10,
  "max_steps": 471,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 50,
  "total_flos": 7.867250325323776e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}