{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.0,
"eval_steps": 500,
"global_step": 5058,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.18,
"grad_norm": 48.51668930053711,
"learning_rate": 1.98220640569395e-06,
"loss": 10.5512,
"step": 100
},
{
"epoch": 0.36,
"grad_norm": 33.89441680908203,
"learning_rate": 1.9644128113879e-06,
"loss": 6.656,
"step": 200
},
{
"epoch": 0.53,
"grad_norm": 54.47207260131836,
"learning_rate": 1.9466192170818503e-06,
"loss": 4.8936,
"step": 300
},
{
"epoch": 0.71,
"grad_norm": 37.983856201171875,
"learning_rate": 1.9288256227758005e-06,
"loss": 3.5277,
"step": 400
},
{
"epoch": 0.89,
"grad_norm": 23.24921417236328,
"learning_rate": 1.9110320284697506e-06,
"loss": 2.8756,
"step": 500
},
{
"epoch": 1.07,
"grad_norm": 26.32309913635254,
"learning_rate": 1.8932384341637008e-06,
"loss": 2.4437,
"step": 600
},
{
"epoch": 1.25,
"grad_norm": 154.72117614746094,
"learning_rate": 1.8754448398576511e-06,
"loss": 2.2582,
"step": 700
},
{
"epoch": 1.42,
"grad_norm": 16.39977264404297,
"learning_rate": 1.8576512455516013e-06,
"loss": 2.2289,
"step": 800
},
{
"epoch": 1.6,
"grad_norm": 22.533836364746094,
"learning_rate": 1.8398576512455514e-06,
"loss": 2.091,
"step": 900
},
{
"epoch": 1.78,
"grad_norm": 16.523881912231445,
"learning_rate": 1.8220640569395016e-06,
"loss": 2.0129,
"step": 1000
},
{
"epoch": 1.96,
"grad_norm": 69.78620147705078,
"learning_rate": 1.804270462633452e-06,
"loss": 1.926,
"step": 1100
},
{
"epoch": 2.14,
"grad_norm": 21.385257720947266,
"learning_rate": 1.786476868327402e-06,
"loss": 1.8193,
"step": 1200
},
{
"epoch": 2.31,
"grad_norm": 23.061298370361328,
"learning_rate": 1.7686832740213522e-06,
"loss": 1.804,
"step": 1300
},
{
"epoch": 2.49,
"grad_norm": 21.625669479370117,
"learning_rate": 1.7508896797153024e-06,
"loss": 1.7568,
"step": 1400
},
{
"epoch": 2.67,
"grad_norm": 13.98591136932373,
"learning_rate": 1.7330960854092527e-06,
"loss": 1.7313,
"step": 1500
},
{
"epoch": 2.85,
"grad_norm": 18.47169303894043,
"learning_rate": 1.7153024911032029e-06,
"loss": 1.733,
"step": 1600
},
{
"epoch": 3.02,
"grad_norm": 20.671327590942383,
"learning_rate": 1.697508896797153e-06,
"loss": 1.698,
"step": 1700
},
{
"epoch": 3.2,
"grad_norm": 20.78021812438965,
"learning_rate": 1.6797153024911032e-06,
"loss": 1.6192,
"step": 1800
},
{
"epoch": 3.38,
"grad_norm": 35.38755416870117,
"learning_rate": 1.6619217081850533e-06,
"loss": 1.5752,
"step": 1900
},
{
"epoch": 3.56,
"grad_norm": 24.105249404907227,
"learning_rate": 1.6441281138790034e-06,
"loss": 1.5921,
"step": 2000
},
{
"epoch": 3.74,
"grad_norm": 11.530924797058105,
"learning_rate": 1.6263345195729536e-06,
"loss": 1.5497,
"step": 2100
},
{
"epoch": 3.91,
"grad_norm": 17.551040649414062,
"learning_rate": 1.6085409252669037e-06,
"loss": 1.5751,
"step": 2200
},
{
"epoch": 4.09,
"grad_norm": 22.44804573059082,
"learning_rate": 1.590747330960854e-06,
"loss": 1.6072,
"step": 2300
},
{
"epoch": 4.27,
"grad_norm": 14.482297897338867,
"learning_rate": 1.5729537366548042e-06,
"loss": 1.552,
"step": 2400
},
{
"epoch": 4.45,
"grad_norm": 17.7537899017334,
"learning_rate": 1.5551601423487544e-06,
"loss": 1.4403,
"step": 2500
},
{
"epoch": 4.63,
"grad_norm": 23.001920700073242,
"learning_rate": 1.5373665480427045e-06,
"loss": 1.4955,
"step": 2600
},
{
"epoch": 4.8,
"grad_norm": 14.721695899963379,
"learning_rate": 1.5195729537366549e-06,
"loss": 1.4456,
"step": 2700
},
{
"epoch": 4.98,
"grad_norm": 15.371649742126465,
"learning_rate": 1.501779359430605e-06,
"loss": 1.4303,
"step": 2800
},
{
"epoch": 5.16,
"grad_norm": 14.734794616699219,
"learning_rate": 1.4839857651245552e-06,
"loss": 1.4544,
"step": 2900
},
{
"epoch": 5.34,
"grad_norm": 13.686590194702148,
"learning_rate": 1.4661921708185053e-06,
"loss": 1.401,
"step": 3000
},
{
"epoch": 5.52,
"grad_norm": 18.93415641784668,
"learning_rate": 1.4483985765124555e-06,
"loss": 1.4612,
"step": 3100
},
{
"epoch": 5.69,
"grad_norm": 9.70661735534668,
"learning_rate": 1.4306049822064056e-06,
"loss": 1.3558,
"step": 3200
},
{
"epoch": 5.87,
"grad_norm": 16.12574577331543,
"learning_rate": 1.4128113879003557e-06,
"loss": 1.3686,
"step": 3300
},
{
"epoch": 6.05,
"grad_norm": 29.739870071411133,
"learning_rate": 1.3950177935943059e-06,
"loss": 1.3703,
"step": 3400
},
{
"epoch": 6.23,
"grad_norm": 22.152677536010742,
"learning_rate": 1.377224199288256e-06,
"loss": 1.2662,
"step": 3500
},
{
"epoch": 6.41,
"grad_norm": 24.051326751708984,
"learning_rate": 1.3594306049822064e-06,
"loss": 1.35,
"step": 3600
},
{
"epoch": 6.58,
"grad_norm": 11.552955627441406,
"learning_rate": 1.3416370106761565e-06,
"loss": 1.3592,
"step": 3700
},
{
"epoch": 6.76,
"grad_norm": 16.08234977722168,
"learning_rate": 1.3238434163701067e-06,
"loss": 1.3566,
"step": 3800
},
{
"epoch": 6.94,
"grad_norm": 14.58088493347168,
"learning_rate": 1.3060498220640568e-06,
"loss": 1.3257,
"step": 3900
},
{
"epoch": 7.12,
"grad_norm": 12.278518676757812,
"learning_rate": 1.2882562277580072e-06,
"loss": 1.3254,
"step": 4000
},
{
"epoch": 7.3,
"grad_norm": 17.330495834350586,
"learning_rate": 1.2704626334519573e-06,
"loss": 1.2095,
"step": 4100
},
{
"epoch": 7.47,
"grad_norm": 13.842063903808594,
"learning_rate": 1.2526690391459075e-06,
"loss": 1.3475,
"step": 4200
},
{
"epoch": 7.65,
"grad_norm": 13.967167854309082,
"learning_rate": 1.2348754448398574e-06,
"loss": 1.2757,
"step": 4300
},
{
"epoch": 7.83,
"grad_norm": 18.25010871887207,
"learning_rate": 1.2170818505338078e-06,
"loss": 1.2795,
"step": 4400
},
{
"epoch": 8.01,
"grad_norm": 11.46198558807373,
"learning_rate": 1.199288256227758e-06,
"loss": 1.2648,
"step": 4500
},
{
"epoch": 8.19,
"grad_norm": 18.330867767333984,
"learning_rate": 1.181494661921708e-06,
"loss": 1.2345,
"step": 4600
},
{
"epoch": 8.36,
"grad_norm": 27.236454010009766,
"learning_rate": 1.1637010676156582e-06,
"loss": 1.2998,
"step": 4700
},
{
"epoch": 8.54,
"grad_norm": 15.08573055267334,
"learning_rate": 1.1459074733096086e-06,
"loss": 1.2578,
"step": 4800
},
{
"epoch": 8.72,
"grad_norm": 15.639131546020508,
"learning_rate": 1.1281138790035587e-06,
"loss": 1.2062,
"step": 4900
},
{
"epoch": 8.9,
"grad_norm": 22.758560180664062,
"learning_rate": 1.1103202846975088e-06,
"loss": 1.2305,
"step": 5000
}
],
"logging_steps": 100,
"max_steps": 11240,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"total_flos": 1.3591805503085568e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}