“Sara
adding model files
d29f777
raw
history blame contribute delete
No virus
18.5 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.4,
"eval_steps": 10,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0016,
"eval_loss": 1.1534295082092285,
"eval_runtime": 3.0421,
"eval_samples_per_second": 51.28,
"eval_steps_per_second": 2.63,
"step": 2
},
{
"epoch": 0.008,
"grad_norm": 20.25132179260254,
"learning_rate": 2.4000000000000003e-06,
"loss": 2.0304,
"step": 10
},
{
"epoch": 0.008,
"eval_loss": 1.14567232131958,
"eval_runtime": 3.0142,
"eval_samples_per_second": 51.755,
"eval_steps_per_second": 2.654,
"step": 10
},
{
"epoch": 0.016,
"grad_norm": 13.503382682800293,
"learning_rate": 4.800000000000001e-06,
"loss": 1.5982,
"step": 20
},
{
"epoch": 0.016,
"eval_loss": 1.1035062074661255,
"eval_runtime": 3.0105,
"eval_samples_per_second": 51.818,
"eval_steps_per_second": 2.657,
"step": 20
},
{
"epoch": 0.024,
"grad_norm": 6.730069160461426,
"learning_rate": 7.2e-06,
"loss": 1.3721,
"step": 30
},
{
"epoch": 0.024,
"eval_loss": 1.038061499595642,
"eval_runtime": 3.0065,
"eval_samples_per_second": 51.887,
"eval_steps_per_second": 2.661,
"step": 30
},
{
"epoch": 0.032,
"grad_norm": 5.994906902313232,
"learning_rate": 9.600000000000001e-06,
"loss": 1.1836,
"step": 40
},
{
"epoch": 0.032,
"eval_loss": 0.9870163202285767,
"eval_runtime": 3.0074,
"eval_samples_per_second": 51.871,
"eval_steps_per_second": 2.66,
"step": 40
},
{
"epoch": 0.04,
"grad_norm": 3.078227996826172,
"learning_rate": 1.2e-05,
"loss": 1.1024,
"step": 50
},
{
"epoch": 0.04,
"eval_loss": 0.9939066767692566,
"eval_runtime": 3.0144,
"eval_samples_per_second": 51.751,
"eval_steps_per_second": 2.654,
"step": 50
},
{
"epoch": 0.048,
"grad_norm": 2.8209125995635986,
"learning_rate": 1.44e-05,
"loss": 1.0809,
"step": 60
},
{
"epoch": 0.048,
"eval_loss": 0.995045006275177,
"eval_runtime": 3.0074,
"eval_samples_per_second": 51.872,
"eval_steps_per_second": 2.66,
"step": 60
},
{
"epoch": 0.056,
"grad_norm": 2.684706926345825,
"learning_rate": 1.6800000000000002e-05,
"loss": 1.0544,
"step": 70
},
{
"epoch": 0.056,
"eval_loss": 0.9916963577270508,
"eval_runtime": 3.0125,
"eval_samples_per_second": 51.785,
"eval_steps_per_second": 2.656,
"step": 70
},
{
"epoch": 0.064,
"grad_norm": 2.3479392528533936,
"learning_rate": 1.9200000000000003e-05,
"loss": 1.0595,
"step": 80
},
{
"epoch": 0.064,
"eval_loss": 0.9863230586051941,
"eval_runtime": 3.0094,
"eval_samples_per_second": 51.838,
"eval_steps_per_second": 2.658,
"step": 80
},
{
"epoch": 0.072,
"grad_norm": 2.6608166694641113,
"learning_rate": 2.16e-05,
"loss": 1.0859,
"step": 90
},
{
"epoch": 0.072,
"eval_loss": 0.9974517822265625,
"eval_runtime": 3.012,
"eval_samples_per_second": 51.793,
"eval_steps_per_second": 2.656,
"step": 90
},
{
"epoch": 0.08,
"grad_norm": 2.8634560108184814,
"learning_rate": 2.4e-05,
"loss": 1.0728,
"step": 100
},
{
"epoch": 0.08,
"eval_loss": 1.0015454292297363,
"eval_runtime": 3.0065,
"eval_samples_per_second": 51.887,
"eval_steps_per_second": 2.661,
"step": 100
},
{
"epoch": 0.088,
"grad_norm": 2.4156150817871094,
"learning_rate": 2.64e-05,
"loss": 1.0752,
"step": 110
},
{
"epoch": 0.088,
"eval_loss": 1.0033469200134277,
"eval_runtime": 3.0157,
"eval_samples_per_second": 51.729,
"eval_steps_per_second": 2.653,
"step": 110
},
{
"epoch": 0.096,
"grad_norm": 2.4899582862854004,
"learning_rate": 2.88e-05,
"loss": 1.0586,
"step": 120
},
{
"epoch": 0.096,
"eval_loss": 1.0045567750930786,
"eval_runtime": 3.011,
"eval_samples_per_second": 51.81,
"eval_steps_per_second": 2.657,
"step": 120
},
{
"epoch": 0.104,
"grad_norm": 2.4391989707946777,
"learning_rate": 2.9998537860139564e-05,
"loss": 1.0549,
"step": 130
},
{
"epoch": 0.104,
"eval_loss": 1.0204839706420898,
"eval_runtime": 3.0118,
"eval_samples_per_second": 51.797,
"eval_steps_per_second": 2.656,
"step": 130
},
{
"epoch": 0.112,
"grad_norm": 2.5839991569519043,
"learning_rate": 2.9986842451482876e-05,
"loss": 1.137,
"step": 140
},
{
"epoch": 0.112,
"eval_loss": 1.0195070505142212,
"eval_runtime": 3.0108,
"eval_samples_per_second": 51.814,
"eval_steps_per_second": 2.657,
"step": 140
},
{
"epoch": 0.12,
"grad_norm": 4.575782299041748,
"learning_rate": 2.9963460753897364e-05,
"loss": 1.1321,
"step": 150
},
{
"epoch": 0.12,
"eval_loss": 1.031137228012085,
"eval_runtime": 3.0126,
"eval_samples_per_second": 51.782,
"eval_steps_per_second": 2.656,
"step": 150
},
{
"epoch": 0.128,
"grad_norm": 3.309042453765869,
"learning_rate": 2.992841099972747e-05,
"loss": 1.148,
"step": 160
},
{
"epoch": 0.128,
"eval_loss": 1.0269774198532104,
"eval_runtime": 3.0093,
"eval_samples_per_second": 51.839,
"eval_steps_per_second": 2.658,
"step": 160
},
{
"epoch": 0.136,
"grad_norm": 2.796513319015503,
"learning_rate": 2.988172051971717e-05,
"loss": 1.0891,
"step": 170
},
{
"epoch": 0.136,
"eval_loss": 1.0196280479431152,
"eval_runtime": 3.0133,
"eval_samples_per_second": 51.77,
"eval_steps_per_second": 2.655,
"step": 170
},
{
"epoch": 0.144,
"grad_norm": 3.2158987522125244,
"learning_rate": 2.9823425721698293e-05,
"loss": 1.1017,
"step": 180
},
{
"epoch": 0.144,
"eval_loss": 1.031247615814209,
"eval_runtime": 3.0065,
"eval_samples_per_second": 51.888,
"eval_steps_per_second": 2.661,
"step": 180
},
{
"epoch": 0.152,
"grad_norm": 2.686189889907837,
"learning_rate": 2.975357206220079e-05,
"loss": 1.0981,
"step": 190
},
{
"epoch": 0.152,
"eval_loss": 1.0395549535751343,
"eval_runtime": 3.0239,
"eval_samples_per_second": 51.589,
"eval_steps_per_second": 2.646,
"step": 190
},
{
"epoch": 0.16,
"grad_norm": 2.959608316421509,
"learning_rate": 2.9672214011007087e-05,
"loss": 1.0892,
"step": 200
},
{
"epoch": 0.16,
"eval_loss": 1.0621881484985352,
"eval_runtime": 3.01,
"eval_samples_per_second": 51.827,
"eval_steps_per_second": 2.658,
"step": 200
},
{
"epoch": 0.168,
"grad_norm": 2.3781332969665527,
"learning_rate": 2.9579415008678196e-05,
"loss": 1.1321,
"step": 210
},
{
"epoch": 0.168,
"eval_loss": 1.0873780250549316,
"eval_runtime": 3.0093,
"eval_samples_per_second": 51.839,
"eval_steps_per_second": 2.658,
"step": 210
},
{
"epoch": 0.176,
"grad_norm": 3.1061549186706543,
"learning_rate": 2.9475247417084672e-05,
"loss": 1.1245,
"step": 220
},
{
"epoch": 0.176,
"eval_loss": 1.0836576223373413,
"eval_runtime": 3.0092,
"eval_samples_per_second": 51.842,
"eval_steps_per_second": 2.659,
"step": 220
},
{
"epoch": 0.184,
"grad_norm": 3.747018337249756,
"learning_rate": 2.9359792462981007e-05,
"loss": 1.1511,
"step": 230
},
{
"epoch": 0.184,
"eval_loss": 1.0861718654632568,
"eval_runtime": 3.0167,
"eval_samples_per_second": 51.713,
"eval_steps_per_second": 2.652,
"step": 230
},
{
"epoch": 0.192,
"grad_norm": 2.8736958503723145,
"learning_rate": 2.923314017466745e-05,
"loss": 1.1321,
"step": 240
},
{
"epoch": 0.192,
"eval_loss": 1.0627102851867676,
"eval_runtime": 3.0136,
"eval_samples_per_second": 51.765,
"eval_steps_per_second": 2.655,
"step": 240
},
{
"epoch": 0.2,
"grad_norm": 2.3447535037994385,
"learning_rate": 2.9095389311788626e-05,
"loss": 1.0972,
"step": 250
},
{
"epoch": 0.2,
"eval_loss": 1.0548475980758667,
"eval_runtime": 3.011,
"eval_samples_per_second": 51.811,
"eval_steps_per_second": 2.657,
"step": 250
},
{
"epoch": 0.208,
"grad_norm": 2.6532089710235596,
"learning_rate": 2.894664728832377e-05,
"loss": 1.1624,
"step": 260
},
{
"epoch": 0.208,
"eval_loss": 1.059614896774292,
"eval_runtime": 3.0099,
"eval_samples_per_second": 51.829,
"eval_steps_per_second": 2.658,
"step": 260
},
{
"epoch": 0.216,
"grad_norm": 3.049149751663208,
"learning_rate": 2.8787030088828517e-05,
"loss": 1.089,
"step": 270
},
{
"epoch": 0.216,
"eval_loss": 1.0867388248443604,
"eval_runtime": 3.0057,
"eval_samples_per_second": 51.901,
"eval_steps_per_second": 2.662,
"step": 270
},
{
"epoch": 0.224,
"grad_norm": 2.3557188510894775,
"learning_rate": 2.8616662177993633e-05,
"loss": 1.0937,
"step": 280
},
{
"epoch": 0.224,
"eval_loss": 1.0865511894226074,
"eval_runtime": 3.0136,
"eval_samples_per_second": 51.766,
"eval_steps_per_second": 2.655,
"step": 280
},
{
"epoch": 0.232,
"grad_norm": 2.5818262100219727,
"learning_rate": 2.8435676403591193e-05,
"loss": 1.0708,
"step": 290
},
{
"epoch": 0.232,
"eval_loss": 1.0967109203338623,
"eval_runtime": 3.0116,
"eval_samples_per_second": 51.8,
"eval_steps_per_second": 2.656,
"step": 290
},
{
"epoch": 0.24,
"grad_norm": 3.1398000717163086,
"learning_rate": 2.8244213892883907e-05,
"loss": 1.0921,
"step": 300
},
{
"epoch": 0.24,
"eval_loss": 1.1191679239273071,
"eval_runtime": 3.0116,
"eval_samples_per_second": 51.8,
"eval_steps_per_second": 2.656,
"step": 300
},
{
"epoch": 0.248,
"grad_norm": 2.4209301471710205,
"learning_rate": 2.8042423942578285e-05,
"loss": 1.1155,
"step": 310
},
{
"epoch": 0.248,
"eval_loss": 1.1559284925460815,
"eval_runtime": 3.0159,
"eval_samples_per_second": 51.725,
"eval_steps_per_second": 2.653,
"step": 310
},
{
"epoch": 0.256,
"grad_norm": 2.8402583599090576,
"learning_rate": 2.78304639024076e-05,
"loss": 1.1269,
"step": 320
},
{
"epoch": 0.256,
"eval_loss": 1.1549513339996338,
"eval_runtime": 3.0068,
"eval_samples_per_second": 51.882,
"eval_steps_per_second": 2.661,
"step": 320
},
{
"epoch": 0.264,
"grad_norm": 2.46132755279541,
"learning_rate": 2.7608499052435265e-05,
"loss": 1.1088,
"step": 330
},
{
"epoch": 0.264,
"eval_loss": 1.1405725479125977,
"eval_runtime": 3.0049,
"eval_samples_per_second": 51.914,
"eval_steps_per_second": 2.662,
"step": 330
},
{
"epoch": 0.272,
"grad_norm": 2.852588176727295,
"learning_rate": 2.7376702474174428e-05,
"loss": 1.0859,
"step": 340
},
{
"epoch": 0.272,
"eval_loss": 1.1068381071090698,
"eval_runtime": 3.0037,
"eval_samples_per_second": 51.936,
"eval_steps_per_second": 2.663,
"step": 340
},
{
"epoch": 0.28,
"grad_norm": 2.3993775844573975,
"learning_rate": 2.7135254915624213e-05,
"loss": 1.1132,
"step": 350
},
{
"epoch": 0.28,
"eval_loss": 1.1167492866516113,
"eval_runtime": 3.0024,
"eval_samples_per_second": 51.959,
"eval_steps_per_second": 2.665,
"step": 350
},
{
"epoch": 0.288,
"grad_norm": 2.4123260974884033,
"learning_rate": 2.688434465032786e-05,
"loss": 1.0702,
"step": 360
},
{
"epoch": 0.288,
"eval_loss": 1.0937358140945435,
"eval_runtime": 3.0087,
"eval_samples_per_second": 51.849,
"eval_steps_per_second": 2.659,
"step": 360
},
{
"epoch": 0.296,
"grad_norm": 2.667454719543457,
"learning_rate": 2.6624167330562697e-05,
"loss": 1.1427,
"step": 370
},
{
"epoch": 0.296,
"eval_loss": 1.1052156686782837,
"eval_runtime": 3.0045,
"eval_samples_per_second": 51.922,
"eval_steps_per_second": 2.663,
"step": 370
},
{
"epoch": 0.304,
"grad_norm": 2.4109561443328857,
"learning_rate": 2.6354925834776346e-05,
"loss": 1.1124,
"step": 380
},
{
"epoch": 0.304,
"eval_loss": 1.1146537065505981,
"eval_runtime": 3.0047,
"eval_samples_per_second": 51.918,
"eval_steps_per_second": 2.662,
"step": 380
},
{
"epoch": 0.312,
"grad_norm": 2.661153554916382,
"learning_rate": 2.607683010938826e-05,
"loss": 1.1431,
"step": 390
},
{
"epoch": 0.312,
"eval_loss": 1.139147400856018,
"eval_runtime": 3.0049,
"eval_samples_per_second": 51.915,
"eval_steps_per_second": 2.662,
"step": 390
},
{
"epoch": 0.32,
"grad_norm": 2.762219190597534,
"learning_rate": 2.5790097005079766e-05,
"loss": 1.1471,
"step": 400
},
{
"epoch": 0.32,
"eval_loss": 1.1363451480865479,
"eval_runtime": 3.012,
"eval_samples_per_second": 51.792,
"eval_steps_per_second": 2.656,
"step": 400
},
{
"epoch": 0.328,
"grad_norm": 2.473832368850708,
"learning_rate": 2.5494950107700482e-05,
"loss": 1.1466,
"step": 410
},
{
"epoch": 0.328,
"eval_loss": 1.1255215406417847,
"eval_runtime": 3.0084,
"eval_samples_per_second": 51.855,
"eval_steps_per_second": 2.659,
"step": 410
},
{
"epoch": 0.336,
"grad_norm": 2.8688554763793945,
"learning_rate": 2.519161956392275e-05,
"loss": 1.0485,
"step": 420
},
{
"epoch": 0.336,
"eval_loss": 1.1306232213974,
"eval_runtime": 3.0061,
"eval_samples_per_second": 51.894,
"eval_steps_per_second": 2.661,
"step": 420
},
{
"epoch": 0.344,
"grad_norm": 2.7669804096221924,
"learning_rate": 2.4880341901780205e-05,
"loss": 1.0817,
"step": 430
},
{
"epoch": 0.344,
"eval_loss": 1.124632716178894,
"eval_runtime": 3.0118,
"eval_samples_per_second": 51.797,
"eval_steps_per_second": 2.656,
"step": 430
},
{
"epoch": 0.352,
"grad_norm": 2.8019163608551025,
"learning_rate": 2.4561359846230346e-05,
"loss": 1.0918,
"step": 440
},
{
"epoch": 0.352,
"eval_loss": 1.1212003231048584,
"eval_runtime": 3.0142,
"eval_samples_per_second": 51.755,
"eval_steps_per_second": 2.654,
"step": 440
},
{
"epoch": 0.36,
"grad_norm": 3.220196485519409,
"learning_rate": 2.4234922129884873e-05,
"loss": 1.0958,
"step": 450
},
{
"epoch": 0.36,
"eval_loss": 1.1051884889602661,
"eval_runtime": 3.0146,
"eval_samples_per_second": 51.748,
"eval_steps_per_second": 2.654,
"step": 450
},
{
"epoch": 0.368,
"grad_norm": 2.7096750736236572,
"learning_rate": 2.3901283299055524e-05,
"loss": 1.0681,
"step": 460
},
{
"epoch": 0.368,
"eval_loss": 1.0787197351455688,
"eval_runtime": 3.0111,
"eval_samples_per_second": 51.809,
"eval_steps_per_second": 2.657,
"step": 460
},
{
"epoch": 0.376,
"grad_norm": 2.8523542881011963,
"learning_rate": 2.356070351526648e-05,
"loss": 1.095,
"step": 470
},
{
"epoch": 0.376,
"eval_loss": 1.0948532819747925,
"eval_runtime": 3.004,
"eval_samples_per_second": 51.931,
"eval_steps_per_second": 2.663,
"step": 470
},
{
"epoch": 0.384,
"grad_norm": 3.2367355823516846,
"learning_rate": 2.3213448352388256e-05,
"loss": 1.0575,
"step": 480
},
{
"epoch": 0.384,
"eval_loss": 1.082040786743164,
"eval_runtime": 3.0033,
"eval_samples_per_second": 51.943,
"eval_steps_per_second": 2.664,
"step": 480
},
{
"epoch": 0.392,
"grad_norm": 2.410609006881714,
"learning_rate": 2.285978858955119e-05,
"loss": 1.0265,
"step": 490
},
{
"epoch": 0.392,
"eval_loss": 1.0808920860290527,
"eval_runtime": 3.0115,
"eval_samples_per_second": 51.801,
"eval_steps_per_second": 2.656,
"step": 490
},
{
"epoch": 0.4,
"grad_norm": 3.2114365100860596,
"learning_rate": 2.25e-05,
"loss": 1.0819,
"step": 500
},
{
"epoch": 0.4,
"eval_loss": 1.0693764686584473,
"eval_runtime": 3.009,
"eval_samples_per_second": 51.844,
"eval_steps_per_second": 2.659,
"step": 500
}
],
"logging_steps": 10,
"max_steps": 1250,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"total_flos": 2.715405190483149e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}