SLM_vs_LLM_experiments/max_seq_length_128_experiments/google_bert/bert_base_uncased_twitter/trainer_state.json
{
  "best_metric": 0.47801119089126587,
  "best_model_checkpoint": "../../experiments_checkpoints/MAdAiLab/google_bert/bert_base_uncased_twitter/checkpoint-100",
  "epoch": 3.0,
  "eval_steps": 50,
  "global_step": 408,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.07,
      "grad_norm": 2.030212163925171,
      "learning_rate": 1.950980392156863e-05,
      "loss": 0.6218,
      "step": 10
    },
    {
      "epoch": 0.15,
      "grad_norm": 2.344938278198242,
      "learning_rate": 1.9019607843137255e-05,
      "loss": 0.5741,
      "step": 20
    },
    {
      "epoch": 0.22,
      "grad_norm": 5.662298679351807,
      "learning_rate": 1.8529411764705884e-05,
      "loss": 0.4956,
      "step": 30
    },
    {
      "epoch": 0.29,
      "grad_norm": 2.6259591579437256,
      "learning_rate": 1.8039215686274513e-05,
      "loss": 0.4936,
      "step": 40
    },
    {
      "epoch": 0.37,
      "grad_norm": 3.3229475021362305,
      "learning_rate": 1.7549019607843138e-05,
      "loss": 0.4689,
      "step": 50
    },
    {
      "epoch": 0.37,
      "eval_accuracy": 0.7582720588235294,
      "eval_f1_macro": 0.7184904910459111,
      "eval_f1_micro": 0.7582720588235294,
      "eval_loss": 0.48763757944107056,
      "eval_runtime": 1.0259,
      "eval_samples_per_second": 1060.508,
      "eval_steps_per_second": 16.57,
      "step": 50
    },
    {
      "epoch": 0.44,
      "grad_norm": 2.529259443283081,
      "learning_rate": 1.7058823529411767e-05,
      "loss": 0.4913,
      "step": 60
    },
    {
      "epoch": 0.51,
      "grad_norm": 3.275548219680786,
      "learning_rate": 1.6568627450980395e-05,
      "loss": 0.4695,
      "step": 70
    },
    {
      "epoch": 0.59,
      "grad_norm": 3.004589319229126,
      "learning_rate": 1.607843137254902e-05,
      "loss": 0.4803,
      "step": 80
    },
    {
      "epoch": 0.66,
      "grad_norm": 5.80615758895874,
      "learning_rate": 1.558823529411765e-05,
      "loss": 0.5044,
      "step": 90
    },
    {
      "epoch": 0.74,
      "grad_norm": 2.453000545501709,
      "learning_rate": 1.5098039215686276e-05,
      "loss": 0.4675,
      "step": 100
    },
    {
      "epoch": 0.74,
      "eval_accuracy": 0.7766544117647058,
      "eval_f1_macro": 0.7415457166235069,
      "eval_f1_micro": 0.7766544117647058,
      "eval_loss": 0.47801119089126587,
      "eval_runtime": 1.024,
      "eval_samples_per_second": 1062.456,
      "eval_steps_per_second": 16.601,
      "step": 100
    },
    {
      "epoch": 0.81,
      "grad_norm": 2.884126901626587,
      "learning_rate": 1.4607843137254903e-05,
      "loss": 0.4792,
      "step": 110
    },
    {
      "epoch": 0.88,
      "grad_norm": 2.7994229793548584,
      "learning_rate": 1.4117647058823532e-05,
      "loss": 0.4469,
      "step": 120
    },
    {
      "epoch": 0.96,
      "grad_norm": 3.1235368251800537,
      "learning_rate": 1.3627450980392158e-05,
      "loss": 0.4522,
      "step": 130
    },
    {
      "epoch": 1.03,
      "grad_norm": 2.8174312114715576,
      "learning_rate": 1.3137254901960785e-05,
      "loss": 0.4587,
      "step": 140
    },
    {
      "epoch": 1.1,
      "grad_norm": 2.3226852416992188,
      "learning_rate": 1.2647058823529412e-05,
      "loss": 0.4489,
      "step": 150
    },
    {
      "epoch": 1.1,
      "eval_accuracy": 0.7775735294117647,
      "eval_f1_macro": 0.7440019912339039,
      "eval_f1_micro": 0.7775735294117647,
      "eval_loss": 0.48032334446907043,
      "eval_runtime": 1.0301,
      "eval_samples_per_second": 1056.244,
      "eval_steps_per_second": 16.504,
      "step": 150
    },
    {
      "epoch": 1.18,
      "grad_norm": 2.9358010292053223,
      "learning_rate": 1.215686274509804e-05,
      "loss": 0.4203,
      "step": 160
    },
    {
      "epoch": 1.25,
      "grad_norm": 2.282562017440796,
      "learning_rate": 1.1666666666666668e-05,
      "loss": 0.4169,
      "step": 170
    },
    {
      "epoch": 1.32,
      "grad_norm": 4.00754976272583,
      "learning_rate": 1.1176470588235295e-05,
      "loss": 0.3986,
      "step": 180
    },
    {
      "epoch": 1.4,
      "grad_norm": 4.509845733642578,
      "learning_rate": 1.0686274509803922e-05,
      "loss": 0.4052,
      "step": 190
    },
    {
      "epoch": 1.47,
      "grad_norm": 2.2166378498077393,
      "learning_rate": 1.0196078431372549e-05,
      "loss": 0.457,
      "step": 200
    },
    {
      "epoch": 1.47,
      "eval_accuracy": 0.7757352941176471,
      "eval_f1_macro": 0.7481634387711735,
      "eval_f1_micro": 0.7757352941176471,
      "eval_loss": 0.48201388120651245,
      "eval_runtime": 1.0355,
      "eval_samples_per_second": 1050.674,
      "eval_steps_per_second": 16.417,
      "step": 200
    },
    {
      "epoch": 1.54,
      "grad_norm": 3.562760591506958,
      "learning_rate": 9.705882352941177e-06,
      "loss": 0.4195,
      "step": 210
    },
    {
      "epoch": 1.62,
      "grad_norm": 2.116307258605957,
      "learning_rate": 9.215686274509804e-06,
      "loss": 0.484,
      "step": 220
    },
    {
      "epoch": 1.69,
      "grad_norm": 1.9556690454483032,
      "learning_rate": 8.725490196078433e-06,
      "loss": 0.383,
      "step": 230
    },
    {
      "epoch": 1.76,
      "grad_norm": 2.6515090465545654,
      "learning_rate": 8.23529411764706e-06,
      "loss": 0.4325,
      "step": 240
    },
    {
      "epoch": 1.84,
      "grad_norm": 1.9742746353149414,
      "learning_rate": 7.745098039215687e-06,
      "loss": 0.44,
      "step": 250
    },
    {
      "epoch": 1.84,
      "eval_accuracy": 0.7830882352941176,
      "eval_f1_macro": 0.7429348326665626,
      "eval_f1_micro": 0.7830882352941176,
      "eval_loss": 0.48569220304489136,
      "eval_runtime": 1.0419,
      "eval_samples_per_second": 1044.29,
      "eval_steps_per_second": 16.317,
      "step": 250
    },
    {
      "epoch": 1.91,
      "grad_norm": 2.2601194381713867,
      "learning_rate": 7.2549019607843145e-06,
      "loss": 0.4035,
      "step": 260
    },
    {
      "epoch": 1.99,
      "grad_norm": 2.3561835289001465,
      "learning_rate": 6.764705882352942e-06,
      "loss": 0.3891,
      "step": 270
    },
    {
      "epoch": 2.06,
      "grad_norm": 2.1189048290252686,
      "learning_rate": 6.274509803921569e-06,
      "loss": 0.4035,
      "step": 280
    },
    {
      "epoch": 2.13,
      "grad_norm": 3.31033992767334,
      "learning_rate": 5.784313725490197e-06,
      "loss": 0.4191,
      "step": 290
    },
    {
      "epoch": 2.21,
      "grad_norm": 2.7653210163116455,
      "learning_rate": 5.294117647058824e-06,
      "loss": 0.3905,
      "step": 300
    },
    {
      "epoch": 2.21,
      "eval_accuracy": 0.7738970588235294,
      "eval_f1_macro": 0.7405616666214314,
      "eval_f1_micro": 0.7738970588235294,
      "eval_loss": 0.4835166931152344,
      "eval_runtime": 1.0487,
      "eval_samples_per_second": 1037.452,
      "eval_steps_per_second": 16.21,
      "step": 300
    },
    {
      "epoch": 2.28,
      "grad_norm": 3.1273460388183594,
      "learning_rate": 4.803921568627452e-06,
      "loss": 0.3902,
      "step": 310
    },
    {
      "epoch": 2.35,
      "grad_norm": 3.2511351108551025,
      "learning_rate": 4.313725490196079e-06,
      "loss": 0.37,
      "step": 320
    },
    {
      "epoch": 2.43,
      "grad_norm": 2.523207426071167,
      "learning_rate": 3.8235294117647055e-06,
      "loss": 0.4135,
      "step": 330
    },
    {
      "epoch": 2.5,
      "grad_norm": 2.4263217449188232,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.3874,
      "step": 340
    },
    {
      "epoch": 2.57,
      "grad_norm": 3.537816047668457,
      "learning_rate": 2.843137254901961e-06,
      "loss": 0.4276,
      "step": 350
    },
    {
      "epoch": 2.57,
      "eval_accuracy": 0.7711397058823529,
      "eval_f1_macro": 0.7452242237864831,
      "eval_f1_micro": 0.7711397058823529,
      "eval_loss": 0.48979371786117554,
      "eval_runtime": 1.0517,
      "eval_samples_per_second": 1034.501,
      "eval_steps_per_second": 16.164,
      "step": 350
    },
    {
      "epoch": 2.65,
      "grad_norm": 2.6675004959106445,
      "learning_rate": 2.3529411764705885e-06,
      "loss": 0.3986,
      "step": 360
    },
    {
      "epoch": 2.72,
      "grad_norm": 2.420177936553955,
      "learning_rate": 1.8627450980392158e-06,
      "loss": 0.3675,
      "step": 370
    },
    {
      "epoch": 2.79,
      "grad_norm": 2.717481851577759,
      "learning_rate": 1.3725490196078434e-06,
      "loss": 0.3728,
      "step": 380
    },
    {
      "epoch": 2.87,
      "grad_norm": 2.846869468688965,
      "learning_rate": 8.823529411764707e-07,
      "loss": 0.405,
      "step": 390
    },
    {
      "epoch": 2.94,
      "grad_norm": 2.993429660797119,
      "learning_rate": 3.921568627450981e-07,
      "loss": 0.3413,
      "step": 400
    },
    {
      "epoch": 2.94,
      "eval_accuracy": 0.7757352941176471,
      "eval_f1_macro": 0.7467643467643468,
      "eval_f1_micro": 0.7757352941176471,
      "eval_loss": 0.4929259121417999,
      "eval_runtime": 1.056,
      "eval_samples_per_second": 1030.33,
      "eval_steps_per_second": 16.099,
      "step": 400
    },
    {
      "epoch": 3.0,
      "step": 408,
      "total_flos": 1717588929282048.0,
      "train_loss": 0.43563563099094466,
      "train_runtime": 91.812,
      "train_samples_per_second": 284.276,
      "train_steps_per_second": 4.444
    }
  ],
  "logging_steps": 10,
  "max_steps": 408,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 50,
  "total_flos": 1717588929282048.0,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}