rinapch's picture
Training in progress, step 50000, checkpoint
786aecf verified
raw
history blame
No virus
19 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 1000,
"global_step": 50000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02,
"grad_norm": 1.2623244524002075,
"learning_rate": 0.00019994965423831854,
"loss": 1.4022,
"step": 1000
},
{
"epoch": 0.02,
"eval_loss": 1.4888113737106323,
"eval_runtime": 118.6929,
"eval_samples_per_second": 53.415,
"eval_steps_per_second": 13.354,
"step": 1000
},
{
"epoch": 0.04,
"grad_norm": 1.0886244773864746,
"learning_rate": 0.00019954719225730847,
"loss": 1.39,
"step": 2000
},
{
"epoch": 0.04,
"eval_loss": 1.4779928922653198,
"eval_runtime": 119.2893,
"eval_samples_per_second": 53.148,
"eval_steps_per_second": 13.287,
"step": 2000
},
{
"epoch": 0.06,
"grad_norm": 1.0032905340194702,
"learning_rate": 0.00019874388886763944,
"loss": 1.3591,
"step": 3000
},
{
"epoch": 0.06,
"eval_loss": 1.4647990465164185,
"eval_runtime": 119.1474,
"eval_samples_per_second": 53.211,
"eval_steps_per_second": 13.303,
"step": 3000
},
{
"epoch": 0.08,
"grad_norm": 1.1680448055267334,
"learning_rate": 0.00019754297868854073,
"loss": 1.3373,
"step": 4000
},
{
"epoch": 0.08,
"eval_loss": 1.4628264904022217,
"eval_runtime": 118.5565,
"eval_samples_per_second": 53.477,
"eval_steps_per_second": 13.369,
"step": 4000
},
{
"epoch": 0.1,
"grad_norm": 1.1541577577590942,
"learning_rate": 0.00019594929736144976,
"loss": 1.3124,
"step": 5000
},
{
"epoch": 0.1,
"eval_loss": 1.4525669813156128,
"eval_runtime": 119.2106,
"eval_samples_per_second": 53.183,
"eval_steps_per_second": 13.296,
"step": 5000
},
{
"epoch": 0.12,
"grad_norm": 1.252061367034912,
"learning_rate": 0.00019396926207859084,
"loss": 1.3781,
"step": 6000
},
{
"epoch": 0.12,
"eval_loss": 1.4199129343032837,
"eval_runtime": 119.1684,
"eval_samples_per_second": 53.202,
"eval_steps_per_second": 13.301,
"step": 6000
},
{
"epoch": 0.14,
"grad_norm": 1.1096590757369995,
"learning_rate": 0.00019161084574320696,
"loss": 1.4167,
"step": 7000
},
{
"epoch": 0.14,
"eval_loss": 1.399798035621643,
"eval_runtime": 117.7451,
"eval_samples_per_second": 53.845,
"eval_steps_per_second": 13.461,
"step": 7000
},
{
"epoch": 0.16,
"grad_norm": 1.014760971069336,
"learning_rate": 0.00018888354486549237,
"loss": 1.4106,
"step": 8000
},
{
"epoch": 0.16,
"eval_loss": 1.3875452280044556,
"eval_runtime": 117.8417,
"eval_samples_per_second": 53.801,
"eval_steps_per_second": 13.45,
"step": 8000
},
{
"epoch": 0.18,
"grad_norm": 0.991398274898529,
"learning_rate": 0.00018579834132349772,
"loss": 1.3985,
"step": 9000
},
{
"epoch": 0.18,
"eval_loss": 1.3735474348068237,
"eval_runtime": 118.1423,
"eval_samples_per_second": 53.664,
"eval_steps_per_second": 13.416,
"step": 9000
},
{
"epoch": 0.2,
"grad_norm": 0.9979888796806335,
"learning_rate": 0.0001823676581429833,
"loss": 1.3924,
"step": 10000
},
{
"epoch": 0.2,
"eval_loss": 1.3612616062164307,
"eval_runtime": 119.0319,
"eval_samples_per_second": 53.263,
"eval_steps_per_second": 13.316,
"step": 10000
},
{
"epoch": 0.22,
"grad_norm": 1.1004694700241089,
"learning_rate": 0.00017860530947427875,
"loss": 1.3758,
"step": 11000
},
{
"epoch": 0.22,
"eval_loss": 1.3603907823562622,
"eval_runtime": 119.1776,
"eval_samples_per_second": 53.198,
"eval_steps_per_second": 13.299,
"step": 11000
},
{
"epoch": 0.24,
"grad_norm": 1.1753897666931152,
"learning_rate": 0.0001745264449675755,
"loss": 1.3609,
"step": 12000
},
{
"epoch": 0.24,
"eval_loss": 1.3465052843093872,
"eval_runtime": 118.067,
"eval_samples_per_second": 53.698,
"eval_steps_per_second": 13.425,
"step": 12000
},
{
"epoch": 0.26,
"grad_norm": 1.2619237899780273,
"learning_rate": 0.00017014748877063214,
"loss": 1.344,
"step": 13000
},
{
"epoch": 0.26,
"eval_loss": 1.3324896097183228,
"eval_runtime": 118.0485,
"eval_samples_per_second": 53.707,
"eval_steps_per_second": 13.427,
"step": 13000
},
{
"epoch": 0.28,
"grad_norm": 0.9625117778778076,
"learning_rate": 0.00016548607339452853,
"loss": 1.3335,
"step": 14000
},
{
"epoch": 0.28,
"eval_loss": 1.323933482170105,
"eval_runtime": 117.505,
"eval_samples_per_second": 53.955,
"eval_steps_per_second": 13.489,
"step": 14000
},
{
"epoch": 0.3,
"grad_norm": 1.3606470823287964,
"learning_rate": 0.00016056096871376667,
"loss": 1.3241,
"step": 15000
},
{
"epoch": 0.3,
"eval_loss": 1.314150333404541,
"eval_runtime": 117.8746,
"eval_samples_per_second": 53.786,
"eval_steps_per_second": 13.446,
"step": 15000
},
{
"epoch": 0.32,
"grad_norm": 0.9183652400970459,
"learning_rate": 0.00015539200638661104,
"loss": 1.3029,
"step": 16000
},
{
"epoch": 0.32,
"eval_loss": 1.3047397136688232,
"eval_runtime": 119.2254,
"eval_samples_per_second": 53.177,
"eval_steps_per_second": 13.294,
"step": 16000
},
{
"epoch": 0.34,
"grad_norm": 0.9226890206336975,
"learning_rate": 0.00015000000000000001,
"loss": 1.3005,
"step": 17000
},
{
"epoch": 0.34,
"eval_loss": 1.297969102859497,
"eval_runtime": 119.8633,
"eval_samples_per_second": 52.894,
"eval_steps_per_second": 13.223,
"step": 17000
},
{
"epoch": 0.36,
"grad_norm": 0.9322838187217712,
"learning_rate": 0.00014440666126057744,
"loss": 1.2951,
"step": 18000
},
{
"epoch": 0.36,
"eval_loss": 1.2912102937698364,
"eval_runtime": 119.9122,
"eval_samples_per_second": 52.872,
"eval_steps_per_second": 13.218,
"step": 18000
},
{
"epoch": 0.38,
"grad_norm": 1.102362871170044,
"learning_rate": 0.00013863451256931287,
"loss": 1.313,
"step": 19000
},
{
"epoch": 0.38,
"eval_loss": 1.282011866569519,
"eval_runtime": 117.9359,
"eval_samples_per_second": 53.758,
"eval_steps_per_second": 13.44,
"step": 19000
},
{
"epoch": 0.4,
"grad_norm": 1.0014485120773315,
"learning_rate": 0.00013270679633174218,
"loss": 1.2773,
"step": 20000
},
{
"epoch": 0.4,
"eval_loss": 1.2778606414794922,
"eval_runtime": 120.0646,
"eval_samples_per_second": 52.805,
"eval_steps_per_second": 13.201,
"step": 20000
},
{
"epoch": 0.42,
"grad_norm": 0.8490633964538574,
"learning_rate": 0.00012664738136900348,
"loss": 1.2734,
"step": 21000
},
{
"epoch": 0.42,
"eval_loss": 1.2667981386184692,
"eval_runtime": 117.9405,
"eval_samples_per_second": 53.756,
"eval_steps_per_second": 13.439,
"step": 21000
},
{
"epoch": 0.44,
"grad_norm": 0.9456785321235657,
"learning_rate": 0.00012048066680651908,
"loss": 1.2656,
"step": 22000
},
{
"epoch": 0.44,
"eval_loss": 1.258193850517273,
"eval_runtime": 121.0875,
"eval_samples_per_second": 52.359,
"eval_steps_per_second": 13.09,
"step": 22000
},
{
"epoch": 0.46,
"grad_norm": 1.1480209827423096,
"learning_rate": 0.00011423148382732853,
"loss": 1.2522,
"step": 23000
},
{
"epoch": 0.46,
"eval_loss": 1.2538591623306274,
"eval_runtime": 118.1025,
"eval_samples_per_second": 53.682,
"eval_steps_per_second": 13.421,
"step": 23000
},
{
"epoch": 0.48,
"grad_norm": 0.8957574963569641,
"learning_rate": 0.00010792499568567884,
"loss": 1.2519,
"step": 24000
},
{
"epoch": 0.48,
"eval_loss": 1.2467232942581177,
"eval_runtime": 118.0714,
"eval_samples_per_second": 53.696,
"eval_steps_per_second": 13.424,
"step": 24000
},
{
"epoch": 0.5,
"grad_norm": 1.2852896451950073,
"learning_rate": 0.00010158659638348081,
"loss": 1.24,
"step": 25000
},
{
"epoch": 0.5,
"eval_loss": 1.2399760484695435,
"eval_runtime": 118.556,
"eval_samples_per_second": 53.477,
"eval_steps_per_second": 13.369,
"step": 25000
},
{
"epoch": 0.52,
"grad_norm": 0.9100021719932556,
"learning_rate": 9.524180841762577e-05,
"loss": 1.2653,
"step": 26000
},
{
"epoch": 0.52,
"eval_loss": 1.2347520589828491,
"eval_runtime": 118.0581,
"eval_samples_per_second": 53.702,
"eval_steps_per_second": 13.426,
"step": 26000
},
{
"epoch": 0.54,
"grad_norm": 1.3319469690322876,
"learning_rate": 8.891618000989891e-05,
"loss": 1.2313,
"step": 27000
},
{
"epoch": 0.54,
"eval_loss": 1.2284266948699951,
"eval_runtime": 118.5886,
"eval_samples_per_second": 53.462,
"eval_steps_per_second": 13.366,
"step": 27000
},
{
"epoch": 0.56,
"grad_norm": 1.1901592016220093,
"learning_rate": 8.263518223330697e-05,
"loss": 1.2218,
"step": 28000
},
{
"epoch": 0.56,
"eval_loss": 1.2233468294143677,
"eval_runtime": 118.009,
"eval_samples_per_second": 53.725,
"eval_steps_per_second": 13.431,
"step": 28000
},
{
"epoch": 0.58,
"grad_norm": 1.026314616203308,
"learning_rate": 7.642410644905726e-05,
"loss": 1.2275,
"step": 29000
},
{
"epoch": 0.58,
"eval_loss": 1.2184290885925293,
"eval_runtime": 119.1308,
"eval_samples_per_second": 53.219,
"eval_steps_per_second": 13.305,
"step": 29000
},
{
"epoch": 0.6,
"grad_norm": 1.3237501382827759,
"learning_rate": 7.030796246717255e-05,
"loss": 1.2395,
"step": 30000
},
{
"epoch": 0.6,
"eval_loss": 1.213285207748413,
"eval_runtime": 118.8571,
"eval_samples_per_second": 53.341,
"eval_steps_per_second": 13.335,
"step": 30000
},
{
"epoch": 0.62,
"grad_norm": 1.1643166542053223,
"learning_rate": 6.431137784081282e-05,
"loss": 1.2064,
"step": 31000
},
{
"epoch": 0.62,
"eval_loss": 1.2104227542877197,
"eval_runtime": 120.4291,
"eval_samples_per_second": 52.645,
"eval_steps_per_second": 13.161,
"step": 31000
},
{
"epoch": 0.64,
"grad_norm": 0.7978019118309021,
"learning_rate": 5.845849869981137e-05,
"loss": 1.2141,
"step": 32000
},
{
"epoch": 0.64,
"eval_loss": 1.2048578262329102,
"eval_runtime": 119.224,
"eval_samples_per_second": 53.177,
"eval_steps_per_second": 13.294,
"step": 32000
},
{
"epoch": 0.66,
"grad_norm": 1.1302543878555298,
"learning_rate": 5.277289252273174e-05,
"loss": 1.2054,
"step": 33000
},
{
"epoch": 0.66,
"eval_loss": 1.2011561393737793,
"eval_runtime": 118.3268,
"eval_samples_per_second": 53.58,
"eval_steps_per_second": 13.395,
"step": 33000
},
{
"epoch": 0.68,
"grad_norm": 1.2881643772125244,
"learning_rate": 4.727745323894976e-05,
"loss": 1.2136,
"step": 34000
},
{
"epoch": 0.68,
"eval_loss": 1.1976003646850586,
"eval_runtime": 118.2934,
"eval_samples_per_second": 53.596,
"eval_steps_per_second": 13.399,
"step": 34000
},
{
"epoch": 0.7,
"grad_norm": 0.8197779655456543,
"learning_rate": 4.19943090428802e-05,
"loss": 1.1883,
"step": 35000
},
{
"epoch": 0.7,
"eval_loss": 1.1930813789367676,
"eval_runtime": 117.6868,
"eval_samples_per_second": 53.872,
"eval_steps_per_second": 13.468,
"step": 35000
},
{
"epoch": 0.72,
"grad_norm": 0.9713582992553711,
"learning_rate": 3.694473329154778e-05,
"loss": 1.2058,
"step": 36000
},
{
"epoch": 0.72,
"eval_loss": 1.1900451183319092,
"eval_runtime": 119.439,
"eval_samples_per_second": 53.081,
"eval_steps_per_second": 13.27,
"step": 36000
},
{
"epoch": 0.74,
"grad_norm": 1.4913629293441772,
"learning_rate": 3.21490588442868e-05,
"loss": 1.1864,
"step": 37000
},
{
"epoch": 0.74,
"eval_loss": 1.1863234043121338,
"eval_runtime": 117.5219,
"eval_samples_per_second": 53.947,
"eval_steps_per_second": 13.487,
"step": 37000
},
{
"epoch": 0.76,
"grad_norm": 1.2373270988464355,
"learning_rate": 2.7626596189492983e-05,
"loss": 1.1854,
"step": 38000
},
{
"epoch": 0.76,
"eval_loss": 1.1844661235809326,
"eval_runtime": 117.3667,
"eval_samples_per_second": 54.019,
"eval_steps_per_second": 13.505,
"step": 38000
},
{
"epoch": 0.78,
"grad_norm": 1.1346243619918823,
"learning_rate": 2.339555568810221e-05,
"loss": 1.1954,
"step": 39000
},
{
"epoch": 0.78,
"eval_loss": 1.1816102266311646,
"eval_runtime": 118.4828,
"eval_samples_per_second": 53.51,
"eval_steps_per_second": 13.377,
"step": 39000
},
{
"epoch": 0.8,
"grad_norm": 1.3110140562057495,
"learning_rate": 1.947297424689414e-05,
"loss": 1.1663,
"step": 40000
},
{
"epoch": 0.8,
"eval_loss": 1.178844690322876,
"eval_runtime": 118.8225,
"eval_samples_per_second": 53.357,
"eval_steps_per_second": 13.339,
"step": 40000
},
{
"epoch": 0.82,
"grad_norm": 1.2288557291030884,
"learning_rate": 1.587464671688187e-05,
"loss": 1.1912,
"step": 41000
},
{
"epoch": 0.82,
"eval_loss": 1.177311897277832,
"eval_runtime": 119.099,
"eval_samples_per_second": 53.233,
"eval_steps_per_second": 13.308,
"step": 41000
},
{
"epoch": 0.84,
"grad_norm": 1.0926482677459717,
"learning_rate": 1.2615062293021507e-05,
"loss": 1.1855,
"step": 42000
},
{
"epoch": 0.84,
"eval_loss": 1.1756287813186646,
"eval_runtime": 118.243,
"eval_samples_per_second": 53.618,
"eval_steps_per_second": 13.405,
"step": 42000
},
{
"epoch": 0.86,
"grad_norm": 1.0959358215332031,
"learning_rate": 9.707346171337894e-06,
"loss": 1.1773,
"step": 43000
},
{
"epoch": 0.86,
"eval_loss": 1.1744451522827148,
"eval_runtime": 118.507,
"eval_samples_per_second": 53.499,
"eval_steps_per_second": 13.375,
"step": 43000
},
{
"epoch": 0.88,
"grad_norm": 0.8658304810523987,
"learning_rate": 7.163206698392744e-06,
"loss": 1.1874,
"step": 44000
},
{
"epoch": 0.88,
"eval_loss": 1.1730809211730957,
"eval_runtime": 118.6435,
"eval_samples_per_second": 53.437,
"eval_steps_per_second": 13.359,
"step": 44000
},
{
"epoch": 0.9,
"grad_norm": 1.18503737449646,
"learning_rate": 4.992888225905468e-06,
"loss": 1.1679,
"step": 45000
},
{
"epoch": 0.9,
"eval_loss": 1.171962857246399,
"eval_runtime": 119.251,
"eval_samples_per_second": 53.165,
"eval_steps_per_second": 13.291,
"step": 45000
},
{
"epoch": 0.92,
"grad_norm": 1.209384560585022,
"learning_rate": 3.2051298603643753e-06,
"loss": 1.1776,
"step": 46000
},
{
"epoch": 0.92,
"eval_loss": 1.1715712547302246,
"eval_runtime": 120.3589,
"eval_samples_per_second": 52.676,
"eval_steps_per_second": 13.169,
"step": 46000
},
{
"epoch": 0.94,
"grad_norm": 1.157520055770874,
"learning_rate": 1.8071302737293295e-06,
"loss": 1.1708,
"step": 47000
},
{
"epoch": 0.94,
"eval_loss": 1.1712528467178345,
"eval_runtime": 118.7182,
"eval_samples_per_second": 53.404,
"eval_steps_per_second": 13.351,
"step": 47000
},
{
"epoch": 0.96,
"grad_norm": 1.0842524766921997,
"learning_rate": 8.04518716920466e-07,
"loss": 1.2052,
"step": 48000
},
{
"epoch": 0.96,
"eval_loss": 1.170965313911438,
"eval_runtime": 118.8637,
"eval_samples_per_second": 53.338,
"eval_steps_per_second": 13.335,
"step": 48000
},
{
"epoch": 0.98,
"grad_norm": 1.1883560419082642,
"learning_rate": 2.0133235281156736e-07,
"loss": 1.168,
"step": 49000
},
{
"epoch": 0.98,
"eval_loss": 1.1708762645721436,
"eval_runtime": 118.2509,
"eval_samples_per_second": 53.615,
"eval_steps_per_second": 13.404,
"step": 49000
},
{
"epoch": 1.0,
"grad_norm": 1.4387354850769043,
"learning_rate": 0.0,
"loss": 1.1727,
"step": 50000
},
{
"epoch": 1.0,
"eval_loss": 1.1708616018295288,
"eval_runtime": 118.0864,
"eval_samples_per_second": 53.689,
"eval_steps_per_second": 13.422,
"step": 50000
}
],
"logging_steps": 1000,
"max_steps": 50000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.132424105984e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}