“Sara
adding model files
be06458
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.8,
"eval_steps": 50,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02,
"grad_norm": 6.376349449157715,
"learning_rate": 2.5e-06,
"loss": 1.4789,
"step": 50
},
{
"epoch": 0.02,
"eval_loss": 1.0021060705184937,
"eval_runtime": 2.0714,
"eval_samples_per_second": 55.034,
"eval_steps_per_second": 2.897,
"step": 50
},
{
"epoch": 0.04,
"grad_norm": 2.7532732486724854,
"learning_rate": 5e-06,
"loss": 1.125,
"step": 100
},
{
"epoch": 0.04,
"eval_loss": 0.8670538067817688,
"eval_runtime": 2.0412,
"eval_samples_per_second": 55.848,
"eval_steps_per_second": 2.939,
"step": 100
},
{
"epoch": 0.06,
"grad_norm": 2.5737497806549072,
"learning_rate": 7.5e-06,
"loss": 0.9982,
"step": 150
},
{
"epoch": 0.06,
"eval_loss": 0.85986328125,
"eval_runtime": 2.066,
"eval_samples_per_second": 55.18,
"eval_steps_per_second": 2.904,
"step": 150
},
{
"epoch": 0.08,
"grad_norm": 3.524716377258301,
"learning_rate": 1e-05,
"loss": 0.9746,
"step": 200
},
{
"epoch": 0.08,
"eval_loss": 0.8621886968612671,
"eval_runtime": 2.0577,
"eval_samples_per_second": 55.402,
"eval_steps_per_second": 2.916,
"step": 200
},
{
"epoch": 0.1,
"grad_norm": 2.9173636436462402,
"learning_rate": 1.25e-05,
"loss": 0.9435,
"step": 250
},
{
"epoch": 0.1,
"eval_loss": 0.8596158623695374,
"eval_runtime": 2.0562,
"eval_samples_per_second": 55.441,
"eval_steps_per_second": 2.918,
"step": 250
},
{
"epoch": 0.12,
"grad_norm": 3.21992564201355,
"learning_rate": 1.5e-05,
"loss": 0.9561,
"step": 300
},
{
"epoch": 0.12,
"eval_loss": 0.8649560809135437,
"eval_runtime": 2.0472,
"eval_samples_per_second": 55.687,
"eval_steps_per_second": 2.931,
"step": 300
},
{
"epoch": 0.14,
"grad_norm": 2.842764139175415,
"learning_rate": 1.75e-05,
"loss": 0.9625,
"step": 350
},
{
"epoch": 0.14,
"eval_loss": 0.8620312213897705,
"eval_runtime": 2.0407,
"eval_samples_per_second": 55.863,
"eval_steps_per_second": 2.94,
"step": 350
},
{
"epoch": 0.16,
"grad_norm": 3.064265012741089,
"learning_rate": 2e-05,
"loss": 0.9561,
"step": 400
},
{
"epoch": 0.16,
"eval_loss": 0.8754067420959473,
"eval_runtime": 2.0491,
"eval_samples_per_second": 55.634,
"eval_steps_per_second": 2.928,
"step": 400
},
{
"epoch": 0.18,
"grad_norm": 3.680624008178711,
"learning_rate": 2.25e-05,
"loss": 0.9811,
"step": 450
},
{
"epoch": 0.18,
"eval_loss": 0.8749663829803467,
"eval_runtime": 2.0344,
"eval_samples_per_second": 56.036,
"eval_steps_per_second": 2.949,
"step": 450
},
{
"epoch": 0.2,
"grad_norm": 2.928382396697998,
"learning_rate": 2.5e-05,
"loss": 0.9841,
"step": 500
},
{
"epoch": 0.2,
"eval_loss": 0.8785499930381775,
"eval_runtime": 2.0435,
"eval_samples_per_second": 55.786,
"eval_steps_per_second": 2.936,
"step": 500
},
{
"epoch": 0.22,
"grad_norm": 3.388023853302002,
"learning_rate": 2.7500000000000004e-05,
"loss": 1.0304,
"step": 550
},
{
"epoch": 0.22,
"eval_loss": 0.8839182257652283,
"eval_runtime": 2.0365,
"eval_samples_per_second": 55.979,
"eval_steps_per_second": 2.946,
"step": 550
},
{
"epoch": 0.24,
"grad_norm": 3.6457326412200928,
"learning_rate": 3e-05,
"loss": 1.0091,
"step": 600
},
{
"epoch": 0.24,
"eval_loss": 0.8980669975280762,
"eval_runtime": 2.0434,
"eval_samples_per_second": 55.79,
"eval_steps_per_second": 2.936,
"step": 600
},
{
"epoch": 0.26,
"grad_norm": 2.8867459297180176,
"learning_rate": 3.2500000000000004e-05,
"loss": 1.0373,
"step": 650
},
{
"epoch": 0.26,
"eval_loss": 0.8973696231842041,
"eval_runtime": 2.0404,
"eval_samples_per_second": 55.871,
"eval_steps_per_second": 2.941,
"step": 650
},
{
"epoch": 0.28,
"grad_norm": 2.924246311187744,
"learning_rate": 3.5e-05,
"loss": 1.042,
"step": 700
},
{
"epoch": 0.28,
"eval_loss": 0.9169337153434753,
"eval_runtime": 2.0757,
"eval_samples_per_second": 54.922,
"eval_steps_per_second": 2.891,
"step": 700
},
{
"epoch": 0.3,
"grad_norm": 3.8138821125030518,
"learning_rate": 3.7500000000000003e-05,
"loss": 1.0676,
"step": 750
},
{
"epoch": 0.3,
"eval_loss": 0.9358024001121521,
"eval_runtime": 2.0481,
"eval_samples_per_second": 55.661,
"eval_steps_per_second": 2.93,
"step": 750
},
{
"epoch": 0.32,
"grad_norm": 2.544848918914795,
"learning_rate": 4e-05,
"loss": 1.1405,
"step": 800
},
{
"epoch": 0.32,
"eval_loss": 0.9551197290420532,
"eval_runtime": 2.0608,
"eval_samples_per_second": 55.319,
"eval_steps_per_second": 2.912,
"step": 800
},
{
"epoch": 0.34,
"grad_norm": 3.607945203781128,
"learning_rate": 4.25e-05,
"loss": 1.1238,
"step": 850
},
{
"epoch": 0.34,
"eval_loss": 0.9666525721549988,
"eval_runtime": 2.0401,
"eval_samples_per_second": 55.879,
"eval_steps_per_second": 2.941,
"step": 850
},
{
"epoch": 0.36,
"grad_norm": 2.847774028778076,
"learning_rate": 4.5e-05,
"loss": 1.134,
"step": 900
},
{
"epoch": 0.36,
"eval_loss": 1.0043387413024902,
"eval_runtime": 2.0654,
"eval_samples_per_second": 55.196,
"eval_steps_per_second": 2.905,
"step": 900
},
{
"epoch": 0.38,
"grad_norm": 3.035200595855713,
"learning_rate": 4.75e-05,
"loss": 1.1589,
"step": 950
},
{
"epoch": 0.38,
"eval_loss": 1.0095112323760986,
"eval_runtime": 2.103,
"eval_samples_per_second": 54.208,
"eval_steps_per_second": 2.853,
"step": 950
},
{
"epoch": 0.4,
"grad_norm": 2.9163053035736084,
"learning_rate": 5e-05,
"loss": 1.1372,
"step": 1000
},
{
"epoch": 0.4,
"eval_loss": 1.0114259719848633,
"eval_runtime": 2.0471,
"eval_samples_per_second": 55.688,
"eval_steps_per_second": 2.931,
"step": 1000
},
{
"epoch": 0.42,
"grad_norm": 2.7075846195220947,
"learning_rate": 4.9996192378909786e-05,
"loss": 1.2279,
"step": 1050
},
{
"epoch": 0.42,
"eval_loss": 1.0414971113204956,
"eval_runtime": 2.0438,
"eval_samples_per_second": 55.778,
"eval_steps_per_second": 2.936,
"step": 1050
},
{
"epoch": 0.44,
"grad_norm": 3.5521507263183594,
"learning_rate": 4.99847706754774e-05,
"loss": 1.2282,
"step": 1100
},
{
"epoch": 0.44,
"eval_loss": 1.0306421518325806,
"eval_runtime": 2.0353,
"eval_samples_per_second": 56.013,
"eval_steps_per_second": 2.948,
"step": 1100
},
{
"epoch": 0.46,
"grad_norm": 2.973623037338257,
"learning_rate": 4.996573836886435e-05,
"loss": 1.2439,
"step": 1150
},
{
"epoch": 0.46,
"eval_loss": 1.0502970218658447,
"eval_runtime": 2.0477,
"eval_samples_per_second": 55.673,
"eval_steps_per_second": 2.93,
"step": 1150
},
{
"epoch": 0.48,
"grad_norm": 3.2443981170654297,
"learning_rate": 4.993910125649561e-05,
"loss": 1.2674,
"step": 1200
},
{
"epoch": 0.48,
"eval_loss": 1.052767038345337,
"eval_runtime": 2.0476,
"eval_samples_per_second": 55.676,
"eval_steps_per_second": 2.93,
"step": 1200
},
{
"epoch": 0.5,
"grad_norm": 2.528109073638916,
"learning_rate": 4.990486745229364e-05,
"loss": 1.2429,
"step": 1250
},
{
"epoch": 0.5,
"eval_loss": 1.0944527387619019,
"eval_runtime": 2.0463,
"eval_samples_per_second": 55.711,
"eval_steps_per_second": 2.932,
"step": 1250
},
{
"epoch": 0.52,
"grad_norm": 3.5652294158935547,
"learning_rate": 4.9863047384206835e-05,
"loss": 1.2405,
"step": 1300
},
{
"epoch": 0.52,
"eval_loss": 1.0788123607635498,
"eval_runtime": 2.0584,
"eval_samples_per_second": 55.383,
"eval_steps_per_second": 2.915,
"step": 1300
},
{
"epoch": 0.54,
"grad_norm": 3.1418027877807617,
"learning_rate": 4.9813653791033057e-05,
"loss": 1.2664,
"step": 1350
},
{
"epoch": 0.54,
"eval_loss": 1.077215552330017,
"eval_runtime": 2.0417,
"eval_samples_per_second": 55.836,
"eval_steps_per_second": 2.939,
"step": 1350
},
{
"epoch": 0.56,
"grad_norm": 3.247063159942627,
"learning_rate": 4.975670171853926e-05,
"loss": 1.2368,
"step": 1400
},
{
"epoch": 0.56,
"eval_loss": 1.0988303422927856,
"eval_runtime": 2.0525,
"eval_samples_per_second": 55.543,
"eval_steps_per_second": 2.923,
"step": 1400
},
{
"epoch": 0.58,
"grad_norm": 2.791402816772461,
"learning_rate": 4.9692208514878444e-05,
"loss": 1.214,
"step": 1450
},
{
"epoch": 0.58,
"eval_loss": 1.093959093093872,
"eval_runtime": 2.0478,
"eval_samples_per_second": 55.67,
"eval_steps_per_second": 2.93,
"step": 1450
},
{
"epoch": 0.6,
"grad_norm": 3.39119815826416,
"learning_rate": 4.962019382530521e-05,
"loss": 1.2605,
"step": 1500
},
{
"epoch": 0.6,
"eval_loss": 1.0913000106811523,
"eval_runtime": 2.0609,
"eval_samples_per_second": 55.317,
"eval_steps_per_second": 2.911,
"step": 1500
},
{
"epoch": 0.62,
"grad_norm": 2.8593010902404785,
"learning_rate": 4.9540679586191605e-05,
"loss": 1.2856,
"step": 1550
},
{
"epoch": 0.62,
"eval_loss": 1.1060646772384644,
"eval_runtime": 2.0505,
"eval_samples_per_second": 55.597,
"eval_steps_per_second": 2.926,
"step": 1550
},
{
"epoch": 0.64,
"grad_norm": 3.9253203868865967,
"learning_rate": 4.9453690018345144e-05,
"loss": 1.2385,
"step": 1600
},
{
"epoch": 0.64,
"eval_loss": 1.1065127849578857,
"eval_runtime": 2.0451,
"eval_samples_per_second": 55.743,
"eval_steps_per_second": 2.934,
"step": 1600
},
{
"epoch": 0.66,
"grad_norm": 3.433211326599121,
"learning_rate": 4.9359251619630886e-05,
"loss": 1.2696,
"step": 1650
},
{
"epoch": 0.66,
"eval_loss": 1.1171408891677856,
"eval_runtime": 2.0491,
"eval_samples_per_second": 55.635,
"eval_steps_per_second": 2.928,
"step": 1650
},
{
"epoch": 0.68,
"grad_norm": 2.958655595779419,
"learning_rate": 4.925739315689991e-05,
"loss": 1.2774,
"step": 1700
},
{
"epoch": 0.68,
"eval_loss": 1.1090198755264282,
"eval_runtime": 2.0549,
"eval_samples_per_second": 55.476,
"eval_steps_per_second": 2.92,
"step": 1700
},
{
"epoch": 0.7,
"grad_norm": 2.845395565032959,
"learning_rate": 4.914814565722671e-05,
"loss": 1.2598,
"step": 1750
},
{
"epoch": 0.7,
"eval_loss": 1.1252377033233643,
"eval_runtime": 2.0564,
"eval_samples_per_second": 55.437,
"eval_steps_per_second": 2.918,
"step": 1750
},
{
"epoch": 0.72,
"grad_norm": 3.3043181896209717,
"learning_rate": 4.9031542398457974e-05,
"loss": 1.2897,
"step": 1800
},
{
"epoch": 0.72,
"eval_loss": 1.1197612285614014,
"eval_runtime": 2.0511,
"eval_samples_per_second": 55.58,
"eval_steps_per_second": 2.925,
"step": 1800
},
{
"epoch": 0.74,
"grad_norm": 2.8986546993255615,
"learning_rate": 4.890761889907589e-05,
"loss": 1.2801,
"step": 1850
},
{
"epoch": 0.74,
"eval_loss": 1.0936975479125977,
"eval_runtime": 2.0514,
"eval_samples_per_second": 55.57,
"eval_steps_per_second": 2.925,
"step": 1850
},
{
"epoch": 0.76,
"grad_norm": 2.980234384536743,
"learning_rate": 4.877641290737884e-05,
"loss": 1.2732,
"step": 1900
},
{
"epoch": 0.76,
"eval_loss": 1.1040586233139038,
"eval_runtime": 2.0664,
"eval_samples_per_second": 55.169,
"eval_steps_per_second": 2.904,
"step": 1900
},
{
"epoch": 0.78,
"grad_norm": 2.337164878845215,
"learning_rate": 4.8637964389982926e-05,
"loss": 1.2395,
"step": 1950
},
{
"epoch": 0.78,
"eval_loss": 1.1020458936691284,
"eval_runtime": 2.0426,
"eval_samples_per_second": 55.811,
"eval_steps_per_second": 2.937,
"step": 1950
},
{
"epoch": 0.8,
"grad_norm": 2.535869836807251,
"learning_rate": 4.849231551964771e-05,
"loss": 1.2581,
"step": 2000
},
{
"epoch": 0.8,
"eval_loss": 1.106950283050537,
"eval_runtime": 2.0587,
"eval_samples_per_second": 55.375,
"eval_steps_per_second": 2.914,
"step": 2000
}
],
"logging_steps": 50,
"max_steps": 10000,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 2000,
"total_flos": 7.650574067145114e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}