adapters-llama2-bnb8-QLORA-super_glue-boolq / trainer_state-llama2-bnb8-QLORA-super_glue-boolq-sequence_classification.json
RMHalak's picture
Task: SequenceClassification
d1847d2 verified
raw
history blame contribute delete
No virus
49 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.984,
"eval_steps": 1,
"global_step": 124,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.016,
"grad_norm": NaN,
"learning_rate": 0.0,
"loss": 1.1733,
"step": 1
},
{
"epoch": 0.016,
"eval_accuracy": 0.364,
"eval_loss": 1.2354755401611328,
"eval_runtime": 11.4949,
"eval_samples_per_second": 21.749,
"eval_steps_per_second": 2.784,
"step": 1
},
{
"epoch": 0.032,
"grad_norm": NaN,
"learning_rate": 0.0,
"loss": 1.1385,
"step": 2
},
{
"epoch": 0.032,
"eval_accuracy": 0.364,
"eval_loss": 1.2354755401611328,
"eval_runtime": 11.3512,
"eval_samples_per_second": 22.024,
"eval_steps_per_second": 2.819,
"step": 2
},
{
"epoch": 0.048,
"grad_norm": NaN,
"learning_rate": 0.0,
"loss": 1.1504,
"step": 3
},
{
"epoch": 0.048,
"eval_accuracy": 0.364,
"eval_loss": 1.2354755401611328,
"eval_runtime": 11.554,
"eval_samples_per_second": 21.638,
"eval_steps_per_second": 2.77,
"step": 3
},
{
"epoch": 0.064,
"grad_norm": NaN,
"learning_rate": 0.0,
"loss": 1.4467,
"step": 4
},
{
"epoch": 0.064,
"eval_accuracy": 0.364,
"eval_loss": 1.2354755401611328,
"eval_runtime": 11.2846,
"eval_samples_per_second": 22.154,
"eval_steps_per_second": 2.836,
"step": 4
},
{
"epoch": 0.08,
"grad_norm": Infinity,
"learning_rate": 0.0,
"loss": 1.0915,
"step": 5
},
{
"epoch": 0.08,
"eval_accuracy": 0.364,
"eval_loss": 1.2354755401611328,
"eval_runtime": 11.2925,
"eval_samples_per_second": 22.138,
"eval_steps_per_second": 2.834,
"step": 5
},
{
"epoch": 0.096,
"grad_norm": 87.5323715209961,
"learning_rate": 2.5e-05,
"loss": 1.3424,
"step": 6
},
{
"epoch": 0.096,
"eval_accuracy": 0.364,
"eval_loss": 1.2354755401611328,
"eval_runtime": 11.311,
"eval_samples_per_second": 22.102,
"eval_steps_per_second": 2.829,
"step": 6
},
{
"epoch": 0.112,
"grad_norm": 50.48843765258789,
"learning_rate": 5e-05,
"loss": 1.1041,
"step": 7
},
{
"epoch": 0.112,
"eval_accuracy": 0.432,
"eval_loss": 1.0213314294815063,
"eval_runtime": 11.2832,
"eval_samples_per_second": 22.157,
"eval_steps_per_second": 2.836,
"step": 7
},
{
"epoch": 0.128,
"grad_norm": 12.972390174865723,
"learning_rate": 4.959016393442623e-05,
"loss": 0.751,
"step": 8
},
{
"epoch": 0.128,
"eval_accuracy": 0.456,
"eval_loss": 0.9333825707435608,
"eval_runtime": 11.3015,
"eval_samples_per_second": 22.121,
"eval_steps_per_second": 2.831,
"step": 8
},
{
"epoch": 0.144,
"grad_norm": 38.46497344970703,
"learning_rate": 4.918032786885246e-05,
"loss": 0.9293,
"step": 9
},
{
"epoch": 0.144,
"eval_accuracy": 0.536,
"eval_loss": 0.9040337800979614,
"eval_runtime": 11.3267,
"eval_samples_per_second": 22.072,
"eval_steps_per_second": 2.825,
"step": 9
},
{
"epoch": 0.16,
"grad_norm": 9.813628196716309,
"learning_rate": 4.8770491803278687e-05,
"loss": 0.6036,
"step": 10
},
{
"epoch": 0.16,
"eval_accuracy": 0.58,
"eval_loss": 1.0835610628128052,
"eval_runtime": 11.3275,
"eval_samples_per_second": 22.07,
"eval_steps_per_second": 2.825,
"step": 10
},
{
"epoch": 0.176,
"grad_norm": 11.096491813659668,
"learning_rate": 4.836065573770492e-05,
"loss": 0.9184,
"step": 11
},
{
"epoch": 0.176,
"eval_accuracy": 0.596,
"eval_loss": 1.2577338218688965,
"eval_runtime": 11.2856,
"eval_samples_per_second": 22.152,
"eval_steps_per_second": 2.835,
"step": 11
},
{
"epoch": 0.192,
"grad_norm": 45.111083984375,
"learning_rate": 4.795081967213115e-05,
"loss": 0.8972,
"step": 12
},
{
"epoch": 0.192,
"eval_accuracy": 0.612,
"eval_loss": 1.280572533607483,
"eval_runtime": 11.296,
"eval_samples_per_second": 22.132,
"eval_steps_per_second": 2.833,
"step": 12
},
{
"epoch": 0.208,
"grad_norm": 54.95970153808594,
"learning_rate": 4.754098360655738e-05,
"loss": 1.1253,
"step": 13
},
{
"epoch": 0.208,
"eval_accuracy": 0.612,
"eval_loss": 1.1115046739578247,
"eval_runtime": 11.3216,
"eval_samples_per_second": 22.082,
"eval_steps_per_second": 2.826,
"step": 13
},
{
"epoch": 0.224,
"grad_norm": 19.976964950561523,
"learning_rate": 4.713114754098361e-05,
"loss": 0.9591,
"step": 14
},
{
"epoch": 0.224,
"eval_accuracy": 0.584,
"eval_loss": 0.9410788416862488,
"eval_runtime": 11.3197,
"eval_samples_per_second": 22.085,
"eval_steps_per_second": 2.827,
"step": 14
},
{
"epoch": 0.24,
"grad_norm": 21.54231834411621,
"learning_rate": 4.672131147540984e-05,
"loss": 0.4318,
"step": 15
},
{
"epoch": 0.24,
"eval_accuracy": 0.504,
"eval_loss": 0.8608736395835876,
"eval_runtime": 11.2959,
"eval_samples_per_second": 22.132,
"eval_steps_per_second": 2.833,
"step": 15
},
{
"epoch": 0.256,
"grad_norm": 31.807493209838867,
"learning_rate": 4.631147540983607e-05,
"loss": 0.7278,
"step": 16
},
{
"epoch": 0.256,
"eval_accuracy": 0.52,
"eval_loss": 0.8945115804672241,
"eval_runtime": 11.3237,
"eval_samples_per_second": 22.078,
"eval_steps_per_second": 2.826,
"step": 16
},
{
"epoch": 0.272,
"grad_norm": 27.370149612426758,
"learning_rate": 4.59016393442623e-05,
"loss": 0.8711,
"step": 17
},
{
"epoch": 0.272,
"eval_accuracy": 0.524,
"eval_loss": 0.8897060751914978,
"eval_runtime": 11.3355,
"eval_samples_per_second": 22.055,
"eval_steps_per_second": 2.823,
"step": 17
},
{
"epoch": 0.288,
"grad_norm": 40.707698822021484,
"learning_rate": 4.549180327868853e-05,
"loss": 0.9991,
"step": 18
},
{
"epoch": 0.288,
"eval_accuracy": 0.548,
"eval_loss": 0.8214626312255859,
"eval_runtime": 11.3233,
"eval_samples_per_second": 22.078,
"eval_steps_per_second": 2.826,
"step": 18
},
{
"epoch": 0.304,
"grad_norm": 13.598702430725098,
"learning_rate": 4.508196721311476e-05,
"loss": 0.807,
"step": 19
},
{
"epoch": 0.304,
"eval_accuracy": 0.556,
"eval_loss": 0.7961764931678772,
"eval_runtime": 11.3396,
"eval_samples_per_second": 22.047,
"eval_steps_per_second": 2.822,
"step": 19
},
{
"epoch": 0.32,
"grad_norm": 18.774343490600586,
"learning_rate": 4.467213114754098e-05,
"loss": 0.617,
"step": 20
},
{
"epoch": 0.32,
"eval_accuracy": 0.516,
"eval_loss": 0.807204008102417,
"eval_runtime": 11.3337,
"eval_samples_per_second": 22.058,
"eval_steps_per_second": 2.823,
"step": 20
},
{
"epoch": 0.336,
"grad_norm": 24.550052642822266,
"learning_rate": 4.426229508196721e-05,
"loss": 0.6701,
"step": 21
},
{
"epoch": 0.336,
"eval_accuracy": 0.552,
"eval_loss": 0.7810255289077759,
"eval_runtime": 11.3902,
"eval_samples_per_second": 21.949,
"eval_steps_per_second": 2.809,
"step": 21
},
{
"epoch": 0.352,
"grad_norm": 28.407976150512695,
"learning_rate": 4.3852459016393444e-05,
"loss": 0.823,
"step": 22
},
{
"epoch": 0.352,
"eval_accuracy": 0.552,
"eval_loss": 0.764638364315033,
"eval_runtime": 11.3671,
"eval_samples_per_second": 21.993,
"eval_steps_per_second": 2.815,
"step": 22
},
{
"epoch": 0.368,
"grad_norm": 31.69023895263672,
"learning_rate": 4.3442622950819674e-05,
"loss": 0.7332,
"step": 23
},
{
"epoch": 0.368,
"eval_accuracy": 0.6,
"eval_loss": 0.7719610333442688,
"eval_runtime": 11.3313,
"eval_samples_per_second": 22.063,
"eval_steps_per_second": 2.824,
"step": 23
},
{
"epoch": 0.384,
"grad_norm": 51.598724365234375,
"learning_rate": 4.3032786885245904e-05,
"loss": 1.0789,
"step": 24
},
{
"epoch": 0.384,
"eval_accuracy": 0.604,
"eval_loss": 0.792045533657074,
"eval_runtime": 11.3212,
"eval_samples_per_second": 22.082,
"eval_steps_per_second": 2.827,
"step": 24
},
{
"epoch": 0.4,
"grad_norm": 8.370189666748047,
"learning_rate": 4.262295081967213e-05,
"loss": 0.5899,
"step": 25
},
{
"epoch": 0.4,
"eval_accuracy": 0.588,
"eval_loss": 0.8152350187301636,
"eval_runtime": 11.2787,
"eval_samples_per_second": 22.166,
"eval_steps_per_second": 2.837,
"step": 25
},
{
"epoch": 0.416,
"grad_norm": 8.866107940673828,
"learning_rate": 4.2213114754098365e-05,
"loss": 0.6057,
"step": 26
},
{
"epoch": 0.416,
"eval_accuracy": 0.604,
"eval_loss": 0.8338910937309265,
"eval_runtime": 11.3283,
"eval_samples_per_second": 22.069,
"eval_steps_per_second": 2.825,
"step": 26
},
{
"epoch": 0.432,
"grad_norm": 32.09278106689453,
"learning_rate": 4.1803278688524595e-05,
"loss": 0.7418,
"step": 27
},
{
"epoch": 0.432,
"eval_accuracy": 0.616,
"eval_loss": 0.8316658735275269,
"eval_runtime": 11.3272,
"eval_samples_per_second": 22.071,
"eval_steps_per_second": 2.825,
"step": 27
},
{
"epoch": 0.448,
"grad_norm": 22.857614517211914,
"learning_rate": 4.1393442622950826e-05,
"loss": 0.8383,
"step": 28
},
{
"epoch": 0.448,
"eval_accuracy": 0.616,
"eval_loss": 0.7918907999992371,
"eval_runtime": 11.3212,
"eval_samples_per_second": 22.082,
"eval_steps_per_second": 2.827,
"step": 28
},
{
"epoch": 0.464,
"grad_norm": 22.1180362701416,
"learning_rate": 4.098360655737705e-05,
"loss": 0.8923,
"step": 29
},
{
"epoch": 0.464,
"eval_accuracy": 0.62,
"eval_loss": 0.7275803685188293,
"eval_runtime": 11.3478,
"eval_samples_per_second": 22.031,
"eval_steps_per_second": 2.82,
"step": 29
},
{
"epoch": 0.48,
"grad_norm": 43.09483337402344,
"learning_rate": 4.057377049180328e-05,
"loss": 0.9467,
"step": 30
},
{
"epoch": 0.48,
"eval_accuracy": 0.616,
"eval_loss": 0.6892617344856262,
"eval_runtime": 11.348,
"eval_samples_per_second": 22.03,
"eval_steps_per_second": 2.82,
"step": 30
},
{
"epoch": 0.496,
"grad_norm": 8.46947956085205,
"learning_rate": 4.016393442622951e-05,
"loss": 0.7778,
"step": 31
},
{
"epoch": 0.496,
"eval_accuracy": 0.612,
"eval_loss": 0.6953690052032471,
"eval_runtime": 11.3372,
"eval_samples_per_second": 22.051,
"eval_steps_per_second": 2.823,
"step": 31
},
{
"epoch": 0.512,
"grad_norm": 33.367454528808594,
"learning_rate": 3.975409836065574e-05,
"loss": 0.6468,
"step": 32
},
{
"epoch": 0.512,
"eval_accuracy": 0.564,
"eval_loss": 0.6975896954536438,
"eval_runtime": 11.3103,
"eval_samples_per_second": 22.104,
"eval_steps_per_second": 2.829,
"step": 32
},
{
"epoch": 0.528,
"grad_norm": 14.80160903930664,
"learning_rate": 3.934426229508197e-05,
"loss": 0.7333,
"step": 33
},
{
"epoch": 0.528,
"eval_accuracy": 0.58,
"eval_loss": 0.6965731978416443,
"eval_runtime": 11.3519,
"eval_samples_per_second": 22.023,
"eval_steps_per_second": 2.819,
"step": 33
},
{
"epoch": 0.544,
"grad_norm": 38.160823822021484,
"learning_rate": 3.89344262295082e-05,
"loss": 0.6591,
"step": 34
},
{
"epoch": 0.544,
"eval_accuracy": 0.588,
"eval_loss": 0.6874374747276306,
"eval_runtime": 11.3235,
"eval_samples_per_second": 22.078,
"eval_steps_per_second": 2.826,
"step": 34
},
{
"epoch": 0.56,
"grad_norm": 33.589561462402344,
"learning_rate": 3.8524590163934424e-05,
"loss": 0.7186,
"step": 35
},
{
"epoch": 0.56,
"eval_accuracy": 0.608,
"eval_loss": 0.6751595139503479,
"eval_runtime": 11.3159,
"eval_samples_per_second": 22.093,
"eval_steps_per_second": 2.828,
"step": 35
},
{
"epoch": 0.576,
"grad_norm": 12.282697677612305,
"learning_rate": 3.8114754098360655e-05,
"loss": 0.4988,
"step": 36
},
{
"epoch": 0.576,
"eval_accuracy": 0.616,
"eval_loss": 0.6890735030174255,
"eval_runtime": 11.3573,
"eval_samples_per_second": 22.012,
"eval_steps_per_second": 2.818,
"step": 36
},
{
"epoch": 0.592,
"grad_norm": 15.36685562133789,
"learning_rate": 3.7704918032786885e-05,
"loss": 0.8962,
"step": 37
},
{
"epoch": 0.592,
"eval_accuracy": 0.576,
"eval_loss": 0.7174173593521118,
"eval_runtime": 11.3449,
"eval_samples_per_second": 22.036,
"eval_steps_per_second": 2.821,
"step": 37
},
{
"epoch": 0.608,
"grad_norm": 37.992069244384766,
"learning_rate": 3.729508196721312e-05,
"loss": 0.7407,
"step": 38
},
{
"epoch": 0.608,
"eval_accuracy": 0.528,
"eval_loss": 0.7468773126602173,
"eval_runtime": 11.3013,
"eval_samples_per_second": 22.121,
"eval_steps_per_second": 2.832,
"step": 38
},
{
"epoch": 0.624,
"grad_norm": 43.514469146728516,
"learning_rate": 3.6885245901639346e-05,
"loss": 0.7984,
"step": 39
},
{
"epoch": 0.624,
"eval_accuracy": 0.536,
"eval_loss": 0.7447397708892822,
"eval_runtime": 11.3203,
"eval_samples_per_second": 22.084,
"eval_steps_per_second": 2.827,
"step": 39
},
{
"epoch": 0.64,
"grad_norm": 43.61343765258789,
"learning_rate": 3.6475409836065576e-05,
"loss": 0.6023,
"step": 40
},
{
"epoch": 0.64,
"eval_accuracy": 0.564,
"eval_loss": 0.7195525765419006,
"eval_runtime": 11.3049,
"eval_samples_per_second": 22.114,
"eval_steps_per_second": 2.831,
"step": 40
},
{
"epoch": 0.656,
"grad_norm": 59.5920295715332,
"learning_rate": 3.6065573770491806e-05,
"loss": 0.7771,
"step": 41
},
{
"epoch": 0.656,
"eval_accuracy": 0.604,
"eval_loss": 0.686376690864563,
"eval_runtime": 11.276,
"eval_samples_per_second": 22.171,
"eval_steps_per_second": 2.838,
"step": 41
},
{
"epoch": 0.672,
"grad_norm": 32.0897216796875,
"learning_rate": 3.5655737704918037e-05,
"loss": 0.6586,
"step": 42
},
{
"epoch": 0.672,
"eval_accuracy": 0.62,
"eval_loss": 0.6858681440353394,
"eval_runtime": 11.2967,
"eval_samples_per_second": 22.13,
"eval_steps_per_second": 2.833,
"step": 42
},
{
"epoch": 0.688,
"grad_norm": 7.754942893981934,
"learning_rate": 3.524590163934427e-05,
"loss": 0.5622,
"step": 43
},
{
"epoch": 0.688,
"eval_accuracy": 0.616,
"eval_loss": 0.7191852331161499,
"eval_runtime": 11.3359,
"eval_samples_per_second": 22.054,
"eval_steps_per_second": 2.823,
"step": 43
},
{
"epoch": 0.704,
"grad_norm": 13.65731143951416,
"learning_rate": 3.483606557377049e-05,
"loss": 0.6567,
"step": 44
},
{
"epoch": 0.704,
"eval_accuracy": 0.604,
"eval_loss": 0.7964421510696411,
"eval_runtime": 11.3043,
"eval_samples_per_second": 22.116,
"eval_steps_per_second": 2.831,
"step": 44
},
{
"epoch": 0.72,
"grad_norm": 13.19625473022461,
"learning_rate": 3.442622950819672e-05,
"loss": 0.637,
"step": 45
},
{
"epoch": 0.72,
"eval_accuracy": 0.612,
"eval_loss": 0.8056248426437378,
"eval_runtime": 11.331,
"eval_samples_per_second": 22.063,
"eval_steps_per_second": 2.824,
"step": 45
},
{
"epoch": 0.736,
"grad_norm": 10.382162094116211,
"learning_rate": 3.401639344262295e-05,
"loss": 0.5964,
"step": 46
},
{
"epoch": 0.736,
"eval_accuracy": 0.62,
"eval_loss": 0.8342519402503967,
"eval_runtime": 11.3384,
"eval_samples_per_second": 22.049,
"eval_steps_per_second": 2.822,
"step": 46
},
{
"epoch": 0.752,
"grad_norm": 31.208406448364258,
"learning_rate": 3.360655737704918e-05,
"loss": 0.9646,
"step": 47
},
{
"epoch": 0.752,
"eval_accuracy": 0.612,
"eval_loss": 0.7941861748695374,
"eval_runtime": 11.3217,
"eval_samples_per_second": 22.081,
"eval_steps_per_second": 2.826,
"step": 47
},
{
"epoch": 0.768,
"grad_norm": 28.09980583190918,
"learning_rate": 3.319672131147541e-05,
"loss": 0.778,
"step": 48
},
{
"epoch": 0.768,
"eval_accuracy": 0.62,
"eval_loss": 0.727009117603302,
"eval_runtime": 11.3124,
"eval_samples_per_second": 22.1,
"eval_steps_per_second": 2.829,
"step": 48
},
{
"epoch": 0.784,
"grad_norm": 26.889928817749023,
"learning_rate": 3.2786885245901635e-05,
"loss": 0.8173,
"step": 49
},
{
"epoch": 0.784,
"eval_accuracy": 0.62,
"eval_loss": 0.6923478841781616,
"eval_runtime": 11.3715,
"eval_samples_per_second": 21.985,
"eval_steps_per_second": 2.814,
"step": 49
},
{
"epoch": 0.8,
"grad_norm": 12.534849166870117,
"learning_rate": 3.237704918032787e-05,
"loss": 0.6164,
"step": 50
},
{
"epoch": 0.8,
"eval_accuracy": 0.66,
"eval_loss": 0.6402404308319092,
"eval_runtime": 11.6238,
"eval_samples_per_second": 21.508,
"eval_steps_per_second": 2.753,
"step": 50
},
{
"epoch": 0.816,
"grad_norm": 14.15957260131836,
"learning_rate": 3.19672131147541e-05,
"loss": 0.6124,
"step": 51
},
{
"epoch": 0.816,
"eval_accuracy": 0.648,
"eval_loss": 0.6378893852233887,
"eval_runtime": 11.4856,
"eval_samples_per_second": 21.766,
"eval_steps_per_second": 2.786,
"step": 51
},
{
"epoch": 0.832,
"grad_norm": 19.737197875976562,
"learning_rate": 3.155737704918033e-05,
"loss": 0.6773,
"step": 52
},
{
"epoch": 0.832,
"eval_accuracy": 0.584,
"eval_loss": 0.6686127781867981,
"eval_runtime": 11.4137,
"eval_samples_per_second": 21.903,
"eval_steps_per_second": 2.804,
"step": 52
},
{
"epoch": 0.848,
"grad_norm": 19.23349952697754,
"learning_rate": 3.114754098360656e-05,
"loss": 0.6336,
"step": 53
},
{
"epoch": 0.848,
"eval_accuracy": 0.608,
"eval_loss": 0.6502300500869751,
"eval_runtime": 11.6377,
"eval_samples_per_second": 21.482,
"eval_steps_per_second": 2.75,
"step": 53
},
{
"epoch": 0.864,
"grad_norm": 40.20008087158203,
"learning_rate": 3.073770491803279e-05,
"loss": 0.7077,
"step": 54
},
{
"epoch": 0.864,
"eval_accuracy": 0.62,
"eval_loss": 0.6335379481315613,
"eval_runtime": 11.4293,
"eval_samples_per_second": 21.874,
"eval_steps_per_second": 2.8,
"step": 54
},
{
"epoch": 0.88,
"grad_norm": 9.706358909606934,
"learning_rate": 3.0327868852459017e-05,
"loss": 0.4935,
"step": 55
},
{
"epoch": 0.88,
"eval_accuracy": 0.616,
"eval_loss": 0.640767514705658,
"eval_runtime": 11.367,
"eval_samples_per_second": 21.993,
"eval_steps_per_second": 2.815,
"step": 55
},
{
"epoch": 0.896,
"grad_norm": 7.35679817199707,
"learning_rate": 2.9918032786885248e-05,
"loss": 0.4311,
"step": 56
},
{
"epoch": 0.896,
"eval_accuracy": 0.604,
"eval_loss": 0.6300995349884033,
"eval_runtime": 11.4719,
"eval_samples_per_second": 21.792,
"eval_steps_per_second": 2.789,
"step": 56
},
{
"epoch": 0.912,
"grad_norm": 9.2598876953125,
"learning_rate": 2.9508196721311478e-05,
"loss": 0.4558,
"step": 57
},
{
"epoch": 0.912,
"eval_accuracy": 0.664,
"eval_loss": 0.630566418170929,
"eval_runtime": 11.5188,
"eval_samples_per_second": 21.704,
"eval_steps_per_second": 2.778,
"step": 57
},
{
"epoch": 0.928,
"grad_norm": 31.75694465637207,
"learning_rate": 2.9098360655737705e-05,
"loss": 0.6486,
"step": 58
},
{
"epoch": 0.928,
"eval_accuracy": 0.64,
"eval_loss": 0.614264726638794,
"eval_runtime": 11.6451,
"eval_samples_per_second": 21.468,
"eval_steps_per_second": 2.748,
"step": 58
},
{
"epoch": 0.944,
"grad_norm": 39.16770553588867,
"learning_rate": 2.8688524590163935e-05,
"loss": 0.6755,
"step": 59
},
{
"epoch": 0.944,
"eval_accuracy": 0.668,
"eval_loss": 0.5880586504936218,
"eval_runtime": 11.5144,
"eval_samples_per_second": 21.712,
"eval_steps_per_second": 2.779,
"step": 59
},
{
"epoch": 0.96,
"grad_norm": 17.8769474029541,
"learning_rate": 2.8278688524590162e-05,
"loss": 0.6925,
"step": 60
},
{
"epoch": 0.96,
"eval_accuracy": 0.636,
"eval_loss": 0.5895799398422241,
"eval_runtime": 11.4443,
"eval_samples_per_second": 21.845,
"eval_steps_per_second": 2.796,
"step": 60
},
{
"epoch": 0.976,
"grad_norm": 16.898263931274414,
"learning_rate": 2.7868852459016392e-05,
"loss": 0.3927,
"step": 61
},
{
"epoch": 0.976,
"eval_accuracy": 0.672,
"eval_loss": 0.6116553544998169,
"eval_runtime": 11.4294,
"eval_samples_per_second": 21.873,
"eval_steps_per_second": 2.8,
"step": 61
},
{
"epoch": 0.992,
"grad_norm": 11.950173377990723,
"learning_rate": 2.7459016393442626e-05,
"loss": 0.6678,
"step": 62
},
{
"epoch": 0.992,
"eval_accuracy": 0.676,
"eval_loss": 0.629517138004303,
"eval_runtime": 11.2631,
"eval_samples_per_second": 22.196,
"eval_steps_per_second": 2.841,
"step": 62
},
{
"epoch": 1.008,
"grad_norm": 14.72547435760498,
"learning_rate": 2.7049180327868856e-05,
"loss": 0.4718,
"step": 63
},
{
"epoch": 1.008,
"eval_accuracy": 0.676,
"eval_loss": 0.6391622424125671,
"eval_runtime": 11.4024,
"eval_samples_per_second": 21.925,
"eval_steps_per_second": 2.806,
"step": 63
},
{
"epoch": 1.024,
"grad_norm": 21.045801162719727,
"learning_rate": 2.6639344262295087e-05,
"loss": 0.4525,
"step": 64
},
{
"epoch": 1.024,
"eval_accuracy": 0.676,
"eval_loss": 0.6506518721580505,
"eval_runtime": 11.4065,
"eval_samples_per_second": 21.917,
"eval_steps_per_second": 2.805,
"step": 64
},
{
"epoch": 1.04,
"grad_norm": NaN,
"learning_rate": 2.6639344262295087e-05,
"loss": 0.5411,
"step": 65
},
{
"epoch": 1.04,
"eval_accuracy": 0.676,
"eval_loss": 0.6506518721580505,
"eval_runtime": 11.483,
"eval_samples_per_second": 21.771,
"eval_steps_per_second": 2.787,
"step": 65
},
{
"epoch": 1.056,
"grad_norm": 35.55463409423828,
"learning_rate": 2.6229508196721314e-05,
"loss": 0.5345,
"step": 66
},
{
"epoch": 1.056,
"eval_accuracy": 0.68,
"eval_loss": 0.64942467212677,
"eval_runtime": 11.2992,
"eval_samples_per_second": 22.125,
"eval_steps_per_second": 2.832,
"step": 66
},
{
"epoch": 1.072,
"grad_norm": 36.09001922607422,
"learning_rate": 2.5819672131147544e-05,
"loss": 0.5968,
"step": 67
},
{
"epoch": 1.072,
"eval_accuracy": 0.696,
"eval_loss": 0.6280552744865417,
"eval_runtime": 11.47,
"eval_samples_per_second": 21.796,
"eval_steps_per_second": 2.79,
"step": 67
},
{
"epoch": 1.088,
"grad_norm": 42.95037078857422,
"learning_rate": 2.540983606557377e-05,
"loss": 0.7288,
"step": 68
},
{
"epoch": 1.088,
"eval_accuracy": 0.684,
"eval_loss": 0.6100922226905823,
"eval_runtime": 11.395,
"eval_samples_per_second": 21.939,
"eval_steps_per_second": 2.808,
"step": 68
},
{
"epoch": 1.104,
"grad_norm": 18.122161865234375,
"learning_rate": 2.5e-05,
"loss": 0.3666,
"step": 69
},
{
"epoch": 1.104,
"eval_accuracy": 0.708,
"eval_loss": 0.5811479687690735,
"eval_runtime": 11.467,
"eval_samples_per_second": 21.802,
"eval_steps_per_second": 2.791,
"step": 69
},
{
"epoch": 1.12,
"grad_norm": 17.941131591796875,
"learning_rate": 2.459016393442623e-05,
"loss": 0.5333,
"step": 70
},
{
"epoch": 1.12,
"eval_accuracy": 0.684,
"eval_loss": 0.5954810976982117,
"eval_runtime": 11.4815,
"eval_samples_per_second": 21.774,
"eval_steps_per_second": 2.787,
"step": 70
},
{
"epoch": 1.1360000000000001,
"grad_norm": 10.752734184265137,
"learning_rate": 2.418032786885246e-05,
"loss": 0.4274,
"step": 71
},
{
"epoch": 1.1360000000000001,
"eval_accuracy": 0.672,
"eval_loss": 0.5998041033744812,
"eval_runtime": 11.5172,
"eval_samples_per_second": 21.707,
"eval_steps_per_second": 2.778,
"step": 71
},
{
"epoch": 1.152,
"grad_norm": 21.44332504272461,
"learning_rate": 2.377049180327869e-05,
"loss": 0.4109,
"step": 72
},
{
"epoch": 1.152,
"eval_accuracy": 0.668,
"eval_loss": 0.6016911864280701,
"eval_runtime": 11.4217,
"eval_samples_per_second": 21.888,
"eval_steps_per_second": 2.802,
"step": 72
},
{
"epoch": 1.168,
"grad_norm": 40.88154220581055,
"learning_rate": 2.336065573770492e-05,
"loss": 0.576,
"step": 73
},
{
"epoch": 1.168,
"eval_accuracy": 0.672,
"eval_loss": 0.6131250262260437,
"eval_runtime": 11.4732,
"eval_samples_per_second": 21.79,
"eval_steps_per_second": 2.789,
"step": 73
},
{
"epoch": 1.184,
"grad_norm": 21.387557983398438,
"learning_rate": 2.295081967213115e-05,
"loss": 0.598,
"step": 74
},
{
"epoch": 1.184,
"eval_accuracy": 0.688,
"eval_loss": 0.5768781900405884,
"eval_runtime": 11.3223,
"eval_samples_per_second": 22.08,
"eval_steps_per_second": 2.826,
"step": 74
},
{
"epoch": 1.2,
"grad_norm": 22.257291793823242,
"learning_rate": 2.254098360655738e-05,
"loss": 0.4916,
"step": 75
},
{
"epoch": 1.2,
"eval_accuracy": 0.704,
"eval_loss": 0.5493154525756836,
"eval_runtime": 11.4048,
"eval_samples_per_second": 21.921,
"eval_steps_per_second": 2.806,
"step": 75
},
{
"epoch": 1.216,
"grad_norm": 8.411641120910645,
"learning_rate": 2.2131147540983607e-05,
"loss": 0.3723,
"step": 76
},
{
"epoch": 1.216,
"eval_accuracy": 0.716,
"eval_loss": 0.5425886511802673,
"eval_runtime": 11.4678,
"eval_samples_per_second": 21.8,
"eval_steps_per_second": 2.79,
"step": 76
},
{
"epoch": 1.232,
"grad_norm": 10.33214282989502,
"learning_rate": 2.1721311475409837e-05,
"loss": 0.5423,
"step": 77
},
{
"epoch": 1.232,
"eval_accuracy": 0.704,
"eval_loss": 0.5367762446403503,
"eval_runtime": 11.5849,
"eval_samples_per_second": 21.58,
"eval_steps_per_second": 2.762,
"step": 77
},
{
"epoch": 1.248,
"grad_norm": 8.413525581359863,
"learning_rate": 2.1311475409836064e-05,
"loss": 0.5154,
"step": 78
},
{
"epoch": 1.248,
"eval_accuracy": 0.728,
"eval_loss": 0.5338938236236572,
"eval_runtime": 11.3935,
"eval_samples_per_second": 21.942,
"eval_steps_per_second": 2.809,
"step": 78
},
{
"epoch": 1.264,
"grad_norm": 29.967487335205078,
"learning_rate": 2.0901639344262298e-05,
"loss": 0.5072,
"step": 79
},
{
"epoch": 1.264,
"eval_accuracy": 0.716,
"eval_loss": 0.5389543175697327,
"eval_runtime": 11.5734,
"eval_samples_per_second": 21.601,
"eval_steps_per_second": 2.765,
"step": 79
},
{
"epoch": 1.28,
"grad_norm": 15.605040550231934,
"learning_rate": 2.0491803278688525e-05,
"loss": 0.38,
"step": 80
},
{
"epoch": 1.28,
"eval_accuracy": 0.728,
"eval_loss": 0.5024056434631348,
"eval_runtime": 11.3479,
"eval_samples_per_second": 22.031,
"eval_steps_per_second": 2.82,
"step": 80
},
{
"epoch": 1.296,
"grad_norm": 35.906517028808594,
"learning_rate": 2.0081967213114755e-05,
"loss": 0.6005,
"step": 81
},
{
"epoch": 1.296,
"eval_accuracy": 0.744,
"eval_loss": 0.49823418259620667,
"eval_runtime": 11.4956,
"eval_samples_per_second": 21.747,
"eval_steps_per_second": 2.784,
"step": 81
},
{
"epoch": 1.312,
"grad_norm": 7.512831211090088,
"learning_rate": 1.9672131147540985e-05,
"loss": 0.5016,
"step": 82
},
{
"epoch": 1.312,
"eval_accuracy": 0.728,
"eval_loss": 0.4885072708129883,
"eval_runtime": 11.4788,
"eval_samples_per_second": 21.779,
"eval_steps_per_second": 2.788,
"step": 82
},
{
"epoch": 1.328,
"grad_norm": 16.890913009643555,
"learning_rate": 1.9262295081967212e-05,
"loss": 0.4624,
"step": 83
},
{
"epoch": 1.328,
"eval_accuracy": 0.736,
"eval_loss": 0.5011359453201294,
"eval_runtime": 11.451,
"eval_samples_per_second": 21.832,
"eval_steps_per_second": 2.795,
"step": 83
},
{
"epoch": 1.3439999999999999,
"grad_norm": 8.635043144226074,
"learning_rate": 1.8852459016393442e-05,
"loss": 0.3138,
"step": 84
},
{
"epoch": 1.3439999999999999,
"eval_accuracy": 0.748,
"eval_loss": 0.5005082488059998,
"eval_runtime": 11.5278,
"eval_samples_per_second": 21.687,
"eval_steps_per_second": 2.776,
"step": 84
},
{
"epoch": 1.3599999999999999,
"grad_norm": 28.464235305786133,
"learning_rate": 1.8442622950819673e-05,
"loss": 0.3379,
"step": 85
},
{
"epoch": 1.3599999999999999,
"eval_accuracy": 0.784,
"eval_loss": 0.49115338921546936,
"eval_runtime": 11.4489,
"eval_samples_per_second": 21.836,
"eval_steps_per_second": 2.795,
"step": 85
},
{
"epoch": 1.376,
"grad_norm": 6.288327217102051,
"learning_rate": 1.8032786885245903e-05,
"loss": 0.2329,
"step": 86
},
{
"epoch": 1.376,
"eval_accuracy": 0.76,
"eval_loss": 0.486227810382843,
"eval_runtime": 11.4903,
"eval_samples_per_second": 21.757,
"eval_steps_per_second": 2.785,
"step": 86
},
{
"epoch": 1.392,
"grad_norm": 9.165458679199219,
"learning_rate": 1.7622950819672133e-05,
"loss": 0.4698,
"step": 87
},
{
"epoch": 1.392,
"eval_accuracy": 0.764,
"eval_loss": 0.49401524662971497,
"eval_runtime": 11.6,
"eval_samples_per_second": 21.552,
"eval_steps_per_second": 2.759,
"step": 87
},
{
"epoch": 1.408,
"grad_norm": 30.95488166809082,
"learning_rate": 1.721311475409836e-05,
"loss": 0.386,
"step": 88
},
{
"epoch": 1.408,
"eval_accuracy": 0.788,
"eval_loss": 0.4993850588798523,
"eval_runtime": 11.4082,
"eval_samples_per_second": 21.914,
"eval_steps_per_second": 2.805,
"step": 88
},
{
"epoch": 1.424,
"grad_norm": 37.84017562866211,
"learning_rate": 1.680327868852459e-05,
"loss": 0.7592,
"step": 89
},
{
"epoch": 1.424,
"eval_accuracy": 0.768,
"eval_loss": 0.4701511263847351,
"eval_runtime": 11.4269,
"eval_samples_per_second": 21.878,
"eval_steps_per_second": 2.8,
"step": 89
},
{
"epoch": 1.44,
"grad_norm": 41.4830436706543,
"learning_rate": 1.6393442622950818e-05,
"loss": 0.5635,
"step": 90
},
{
"epoch": 1.44,
"eval_accuracy": 0.776,
"eval_loss": 0.4597744047641754,
"eval_runtime": 11.3414,
"eval_samples_per_second": 22.043,
"eval_steps_per_second": 2.822,
"step": 90
},
{
"epoch": 1.456,
"grad_norm": 8.639835357666016,
"learning_rate": 1.598360655737705e-05,
"loss": 0.4412,
"step": 91
},
{
"epoch": 1.456,
"eval_accuracy": 0.76,
"eval_loss": 0.44887205958366394,
"eval_runtime": 11.3759,
"eval_samples_per_second": 21.976,
"eval_steps_per_second": 2.813,
"step": 91
},
{
"epoch": 1.472,
"grad_norm": 13.933167457580566,
"learning_rate": 1.557377049180328e-05,
"loss": 0.4016,
"step": 92
},
{
"epoch": 1.472,
"eval_accuracy": 0.764,
"eval_loss": 0.4409584403038025,
"eval_runtime": 11.4154,
"eval_samples_per_second": 21.9,
"eval_steps_per_second": 2.803,
"step": 92
},
{
"epoch": 1.488,
"grad_norm": 25.79916000366211,
"learning_rate": 1.5163934426229509e-05,
"loss": 0.3462,
"step": 93
},
{
"epoch": 1.488,
"eval_accuracy": 0.76,
"eval_loss": 0.4534677267074585,
"eval_runtime": 11.4913,
"eval_samples_per_second": 21.756,
"eval_steps_per_second": 2.785,
"step": 93
},
{
"epoch": 1.504,
"grad_norm": 8.152263641357422,
"learning_rate": 1.4754098360655739e-05,
"loss": 0.2376,
"step": 94
},
{
"epoch": 1.504,
"eval_accuracy": 0.78,
"eval_loss": 0.4606277644634247,
"eval_runtime": 11.3366,
"eval_samples_per_second": 22.053,
"eval_steps_per_second": 2.823,
"step": 94
},
{
"epoch": 1.52,
"grad_norm": 50.013893127441406,
"learning_rate": 1.4344262295081968e-05,
"loss": 0.8243,
"step": 95
},
{
"epoch": 1.52,
"eval_accuracy": 0.76,
"eval_loss": 0.4745258092880249,
"eval_runtime": 11.4641,
"eval_samples_per_second": 21.807,
"eval_steps_per_second": 2.791,
"step": 95
},
{
"epoch": 1.536,
"grad_norm": 7.844508647918701,
"learning_rate": 1.3934426229508196e-05,
"loss": 0.2607,
"step": 96
},
{
"epoch": 1.536,
"eval_accuracy": 0.768,
"eval_loss": 0.46744146943092346,
"eval_runtime": 11.4558,
"eval_samples_per_second": 21.823,
"eval_steps_per_second": 2.793,
"step": 96
},
{
"epoch": 1.552,
"grad_norm": 36.814781188964844,
"learning_rate": 1.3524590163934428e-05,
"loss": 0.6977,
"step": 97
},
{
"epoch": 1.552,
"eval_accuracy": 0.768,
"eval_loss": 0.46450626850128174,
"eval_runtime": 11.6232,
"eval_samples_per_second": 21.509,
"eval_steps_per_second": 2.753,
"step": 97
},
{
"epoch": 1.568,
"grad_norm": 12.111028671264648,
"learning_rate": 1.3114754098360657e-05,
"loss": 0.9507,
"step": 98
},
{
"epoch": 1.568,
"eval_accuracy": 0.78,
"eval_loss": 0.45364972949028015,
"eval_runtime": 11.495,
"eval_samples_per_second": 21.749,
"eval_steps_per_second": 2.784,
"step": 98
},
{
"epoch": 1.584,
"grad_norm": 8.090563774108887,
"learning_rate": 1.2704918032786885e-05,
"loss": 0.376,
"step": 99
},
{
"epoch": 1.584,
"eval_accuracy": 0.78,
"eval_loss": 0.45443812012672424,
"eval_runtime": 11.4699,
"eval_samples_per_second": 21.796,
"eval_steps_per_second": 2.79,
"step": 99
},
{
"epoch": 1.6,
"grad_norm": 27.078815460205078,
"learning_rate": 1.2295081967213116e-05,
"loss": 0.4708,
"step": 100
},
{
"epoch": 1.6,
"eval_accuracy": 0.792,
"eval_loss": 0.43265777826309204,
"eval_runtime": 11.434,
"eval_samples_per_second": 21.865,
"eval_steps_per_second": 2.799,
"step": 100
},
{
"epoch": 1.616,
"grad_norm": 31.743221282958984,
"learning_rate": 1.1885245901639344e-05,
"loss": 0.4244,
"step": 101
},
{
"epoch": 1.616,
"eval_accuracy": 0.8,
"eval_loss": 0.4210461378097534,
"eval_runtime": 11.605,
"eval_samples_per_second": 21.543,
"eval_steps_per_second": 2.757,
"step": 101
},
{
"epoch": 1.6320000000000001,
"grad_norm": 5.887348175048828,
"learning_rate": 1.1475409836065575e-05,
"loss": 0.2502,
"step": 102
},
{
"epoch": 1.6320000000000001,
"eval_accuracy": 0.816,
"eval_loss": 0.40375572443008423,
"eval_runtime": 11.4278,
"eval_samples_per_second": 21.876,
"eval_steps_per_second": 2.8,
"step": 102
},
{
"epoch": 1.6480000000000001,
"grad_norm": 13.274320602416992,
"learning_rate": 1.1065573770491803e-05,
"loss": 0.8468,
"step": 103
},
{
"epoch": 1.6480000000000001,
"eval_accuracy": 0.816,
"eval_loss": 0.39257147908210754,
"eval_runtime": 11.4088,
"eval_samples_per_second": 21.913,
"eval_steps_per_second": 2.805,
"step": 103
},
{
"epoch": 1.6640000000000001,
"grad_norm": 22.40627098083496,
"learning_rate": 1.0655737704918032e-05,
"loss": 0.5753,
"step": 104
},
{
"epoch": 1.6640000000000001,
"eval_accuracy": 0.816,
"eval_loss": 0.39645299315452576,
"eval_runtime": 11.469,
"eval_samples_per_second": 21.798,
"eval_steps_per_second": 2.79,
"step": 104
},
{
"epoch": 1.6800000000000002,
"grad_norm": 7.59094762802124,
"learning_rate": 1.0245901639344262e-05,
"loss": 0.2518,
"step": 105
},
{
"epoch": 1.6800000000000002,
"eval_accuracy": 0.812,
"eval_loss": 0.3954884707927704,
"eval_runtime": 11.452,
"eval_samples_per_second": 21.83,
"eval_steps_per_second": 2.794,
"step": 105
},
{
"epoch": 1.696,
"grad_norm": 10.119157791137695,
"learning_rate": 9.836065573770493e-06,
"loss": 0.4945,
"step": 106
},
{
"epoch": 1.696,
"eval_accuracy": 0.796,
"eval_loss": 0.39326009154319763,
"eval_runtime": 11.4475,
"eval_samples_per_second": 21.839,
"eval_steps_per_second": 2.795,
"step": 106
},
{
"epoch": 1.712,
"grad_norm": 12.016386032104492,
"learning_rate": 9.426229508196721e-06,
"loss": 0.2257,
"step": 107
},
{
"epoch": 1.712,
"eval_accuracy": 0.82,
"eval_loss": 0.38625067472457886,
"eval_runtime": 11.4441,
"eval_samples_per_second": 21.845,
"eval_steps_per_second": 2.796,
"step": 107
},
{
"epoch": 1.728,
"grad_norm": 37.06383514404297,
"learning_rate": 9.016393442622952e-06,
"loss": 0.8669,
"step": 108
},
{
"epoch": 1.728,
"eval_accuracy": 0.804,
"eval_loss": 0.388757586479187,
"eval_runtime": 11.59,
"eval_samples_per_second": 21.57,
"eval_steps_per_second": 2.761,
"step": 108
},
{
"epoch": 1.744,
"grad_norm": 19.463485717773438,
"learning_rate": 8.60655737704918e-06,
"loss": 0.4496,
"step": 109
},
{
"epoch": 1.744,
"eval_accuracy": 0.82,
"eval_loss": 0.3835999667644501,
"eval_runtime": 11.3837,
"eval_samples_per_second": 21.961,
"eval_steps_per_second": 2.811,
"step": 109
},
{
"epoch": 1.76,
"grad_norm": 9.631560325622559,
"learning_rate": 8.196721311475409e-06,
"loss": 0.3514,
"step": 110
},
{
"epoch": 1.76,
"eval_accuracy": 0.804,
"eval_loss": 0.3807854652404785,
"eval_runtime": 11.3909,
"eval_samples_per_second": 21.947,
"eval_steps_per_second": 2.809,
"step": 110
},
{
"epoch": 1.776,
"grad_norm": 15.87090015411377,
"learning_rate": 7.78688524590164e-06,
"loss": 0.2494,
"step": 111
},
{
"epoch": 1.776,
"eval_accuracy": 0.784,
"eval_loss": 0.39890381693840027,
"eval_runtime": 11.3662,
"eval_samples_per_second": 21.995,
"eval_steps_per_second": 2.815,
"step": 111
},
{
"epoch": 1.792,
"grad_norm": 13.411320686340332,
"learning_rate": 7.3770491803278695e-06,
"loss": 0.5852,
"step": 112
},
{
"epoch": 1.792,
"eval_accuracy": 0.812,
"eval_loss": 0.37704914808273315,
"eval_runtime": 11.496,
"eval_samples_per_second": 21.747,
"eval_steps_per_second": 2.784,
"step": 112
},
{
"epoch": 1.808,
"grad_norm": 10.2308931350708,
"learning_rate": 6.967213114754098e-06,
"loss": 0.2353,
"step": 113
},
{
"epoch": 1.808,
"eval_accuracy": 0.804,
"eval_loss": 0.3881246745586395,
"eval_runtime": 11.6252,
"eval_samples_per_second": 21.505,
"eval_steps_per_second": 2.753,
"step": 113
},
{
"epoch": 1.8239999999999998,
"grad_norm": 35.862152099609375,
"learning_rate": 6.557377049180328e-06,
"loss": 0.347,
"step": 114
},
{
"epoch": 1.8239999999999998,
"eval_accuracy": 0.82,
"eval_loss": 0.38348227739334106,
"eval_runtime": 11.3665,
"eval_samples_per_second": 21.995,
"eval_steps_per_second": 2.815,
"step": 114
},
{
"epoch": 1.8399999999999999,
"grad_norm": 16.910600662231445,
"learning_rate": 6.147540983606558e-06,
"loss": 0.5338,
"step": 115
},
{
"epoch": 1.8399999999999999,
"eval_accuracy": 0.788,
"eval_loss": 0.39259979128837585,
"eval_runtime": 11.3572,
"eval_samples_per_second": 22.013,
"eval_steps_per_second": 2.818,
"step": 115
},
{
"epoch": 1.8559999999999999,
"grad_norm": 14.902595520019531,
"learning_rate": 5.737704918032787e-06,
"loss": 0.3718,
"step": 116
},
{
"epoch": 1.8559999999999999,
"eval_accuracy": 0.812,
"eval_loss": 0.37997427582740784,
"eval_runtime": 11.3538,
"eval_samples_per_second": 22.019,
"eval_steps_per_second": 2.818,
"step": 116
},
{
"epoch": 1.8719999999999999,
"grad_norm": 13.893790245056152,
"learning_rate": 5.327868852459016e-06,
"loss": 0.1954,
"step": 117
},
{
"epoch": 1.8719999999999999,
"eval_accuracy": 0.82,
"eval_loss": 0.3953521251678467,
"eval_runtime": 11.6285,
"eval_samples_per_second": 21.499,
"eval_steps_per_second": 2.752,
"step": 117
},
{
"epoch": 1.888,
"grad_norm": 20.8792667388916,
"learning_rate": 4.918032786885246e-06,
"loss": 0.3679,
"step": 118
},
{
"epoch": 1.888,
"eval_accuracy": 0.788,
"eval_loss": 0.4041662812232971,
"eval_runtime": 11.7007,
"eval_samples_per_second": 21.366,
"eval_steps_per_second": 2.735,
"step": 118
},
{
"epoch": 1.904,
"grad_norm": 10.44711685180664,
"learning_rate": 4.508196721311476e-06,
"loss": 0.2971,
"step": 119
},
{
"epoch": 1.904,
"eval_accuracy": 0.796,
"eval_loss": 0.39307668805122375,
"eval_runtime": 11.5223,
"eval_samples_per_second": 21.697,
"eval_steps_per_second": 2.777,
"step": 119
},
{
"epoch": 1.92,
"grad_norm": 17.487539291381836,
"learning_rate": 4.098360655737704e-06,
"loss": 0.2742,
"step": 120
},
{
"epoch": 1.92,
"eval_accuracy": 0.804,
"eval_loss": 0.38854384422302246,
"eval_runtime": 11.3276,
"eval_samples_per_second": 22.07,
"eval_steps_per_second": 2.825,
"step": 120
},
{
"epoch": 1.936,
"grad_norm": 33.492523193359375,
"learning_rate": 3.6885245901639347e-06,
"loss": 0.5166,
"step": 121
},
{
"epoch": 1.936,
"eval_accuracy": 0.776,
"eval_loss": 0.39685389399528503,
"eval_runtime": 11.3571,
"eval_samples_per_second": 22.013,
"eval_steps_per_second": 2.818,
"step": 121
},
{
"epoch": 1.952,
"grad_norm": 16.691682815551758,
"learning_rate": 3.278688524590164e-06,
"loss": 0.2628,
"step": 122
},
{
"epoch": 1.952,
"eval_accuracy": 0.796,
"eval_loss": 0.39687633514404297,
"eval_runtime": 11.3028,
"eval_samples_per_second": 22.118,
"eval_steps_per_second": 2.831,
"step": 122
},
{
"epoch": 1.968,
"grad_norm": 6.603124618530273,
"learning_rate": 2.8688524590163937e-06,
"loss": 0.2998,
"step": 123
},
{
"epoch": 1.968,
"eval_accuracy": 0.824,
"eval_loss": 0.3869865834712982,
"eval_runtime": 11.2907,
"eval_samples_per_second": 22.142,
"eval_steps_per_second": 2.834,
"step": 123
},
{
"epoch": 1.984,
"grad_norm": 20.69419288635254,
"learning_rate": 2.459016393442623e-06,
"loss": 0.3104,
"step": 124
},
{
"epoch": 1.984,
"eval_accuracy": 0.824,
"eval_loss": 0.38347548246383667,
"eval_runtime": 11.3038,
"eval_samples_per_second": 22.117,
"eval_steps_per_second": 2.831,
"step": 124
},
{
"epoch": 1.984,
"step": 124,
"total_flos": 1.6196776411267072e+16,
"train_loss": 0.6168801505719462,
"train_runtime": 1813.1247,
"train_samples_per_second": 1.103,
"train_steps_per_second": 0.068
}
],
"logging_steps": 1,
"max_steps": 124,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"total_flos": 1.6196776411267072e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}