|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.4, |
|
"eval_steps": 10, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0016, |
|
"eval_loss": 1.1534295082092285, |
|
"eval_runtime": 3.0421, |
|
"eval_samples_per_second": 51.28, |
|
"eval_steps_per_second": 2.63, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.008, |
|
"grad_norm": 20.25132179260254, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 2.0304, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.008, |
|
"eval_loss": 1.14567232131958, |
|
"eval_runtime": 3.0142, |
|
"eval_samples_per_second": 51.755, |
|
"eval_steps_per_second": 2.654, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 13.503382682800293, |
|
"learning_rate": 4.800000000000001e-06, |
|
"loss": 1.5982, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"eval_loss": 1.1035062074661255, |
|
"eval_runtime": 3.0105, |
|
"eval_samples_per_second": 51.818, |
|
"eval_steps_per_second": 2.657, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.024, |
|
"grad_norm": 6.730069160461426, |
|
"learning_rate": 7.2e-06, |
|
"loss": 1.3721, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.024, |
|
"eval_loss": 1.038061499595642, |
|
"eval_runtime": 3.0065, |
|
"eval_samples_per_second": 51.887, |
|
"eval_steps_per_second": 2.661, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 5.994906902313232, |
|
"learning_rate": 9.600000000000001e-06, |
|
"loss": 1.1836, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"eval_loss": 0.9870163202285767, |
|
"eval_runtime": 3.0074, |
|
"eval_samples_per_second": 51.871, |
|
"eval_steps_per_second": 2.66, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.078227996826172, |
|
"learning_rate": 1.2e-05, |
|
"loss": 1.1024, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 0.9939066767692566, |
|
"eval_runtime": 3.0144, |
|
"eval_samples_per_second": 51.751, |
|
"eval_steps_per_second": 2.654, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 2.8209125995635986, |
|
"learning_rate": 1.44e-05, |
|
"loss": 1.0809, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"eval_loss": 0.995045006275177, |
|
"eval_runtime": 3.0074, |
|
"eval_samples_per_second": 51.872, |
|
"eval_steps_per_second": 2.66, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.056, |
|
"grad_norm": 2.684706926345825, |
|
"learning_rate": 1.6800000000000002e-05, |
|
"loss": 1.0544, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.056, |
|
"eval_loss": 0.9916963577270508, |
|
"eval_runtime": 3.0125, |
|
"eval_samples_per_second": 51.785, |
|
"eval_steps_per_second": 2.656, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 2.3479392528533936, |
|
"learning_rate": 1.9200000000000003e-05, |
|
"loss": 1.0595, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"eval_loss": 0.9863230586051941, |
|
"eval_runtime": 3.0094, |
|
"eval_samples_per_second": 51.838, |
|
"eval_steps_per_second": 2.658, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.072, |
|
"grad_norm": 2.6608166694641113, |
|
"learning_rate": 2.16e-05, |
|
"loss": 1.0859, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.072, |
|
"eval_loss": 0.9974517822265625, |
|
"eval_runtime": 3.012, |
|
"eval_samples_per_second": 51.793, |
|
"eval_steps_per_second": 2.656, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.8634560108184814, |
|
"learning_rate": 2.4e-05, |
|
"loss": 1.0728, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 1.0015454292297363, |
|
"eval_runtime": 3.0065, |
|
"eval_samples_per_second": 51.887, |
|
"eval_steps_per_second": 2.661, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.088, |
|
"grad_norm": 2.4156150817871094, |
|
"learning_rate": 2.64e-05, |
|
"loss": 1.0752, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.088, |
|
"eval_loss": 1.0033469200134277, |
|
"eval_runtime": 3.0157, |
|
"eval_samples_per_second": 51.729, |
|
"eval_steps_per_second": 2.653, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 2.4899582862854004, |
|
"learning_rate": 2.88e-05, |
|
"loss": 1.0586, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"eval_loss": 1.0045567750930786, |
|
"eval_runtime": 3.011, |
|
"eval_samples_per_second": 51.81, |
|
"eval_steps_per_second": 2.657, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.104, |
|
"grad_norm": 2.4391989707946777, |
|
"learning_rate": 2.9998537860139564e-05, |
|
"loss": 1.0549, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.104, |
|
"eval_loss": 1.0204839706420898, |
|
"eval_runtime": 3.0118, |
|
"eval_samples_per_second": 51.797, |
|
"eval_steps_per_second": 2.656, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 2.5839991569519043, |
|
"learning_rate": 2.9986842451482876e-05, |
|
"loss": 1.137, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"eval_loss": 1.0195070505142212, |
|
"eval_runtime": 3.0108, |
|
"eval_samples_per_second": 51.814, |
|
"eval_steps_per_second": 2.657, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.575782299041748, |
|
"learning_rate": 2.9963460753897364e-05, |
|
"loss": 1.1321, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"eval_loss": 1.031137228012085, |
|
"eval_runtime": 3.0126, |
|
"eval_samples_per_second": 51.782, |
|
"eval_steps_per_second": 2.656, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 3.309042453765869, |
|
"learning_rate": 2.992841099972747e-05, |
|
"loss": 1.148, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"eval_loss": 1.0269774198532104, |
|
"eval_runtime": 3.0093, |
|
"eval_samples_per_second": 51.839, |
|
"eval_steps_per_second": 2.658, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.136, |
|
"grad_norm": 2.796513319015503, |
|
"learning_rate": 2.988172051971717e-05, |
|
"loss": 1.0891, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.136, |
|
"eval_loss": 1.0196280479431152, |
|
"eval_runtime": 3.0133, |
|
"eval_samples_per_second": 51.77, |
|
"eval_steps_per_second": 2.655, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 3.2158987522125244, |
|
"learning_rate": 2.9823425721698293e-05, |
|
"loss": 1.1017, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"eval_loss": 1.031247615814209, |
|
"eval_runtime": 3.0065, |
|
"eval_samples_per_second": 51.888, |
|
"eval_steps_per_second": 2.661, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.152, |
|
"grad_norm": 2.686189889907837, |
|
"learning_rate": 2.975357206220079e-05, |
|
"loss": 1.0981, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.152, |
|
"eval_loss": 1.0395549535751343, |
|
"eval_runtime": 3.0239, |
|
"eval_samples_per_second": 51.589, |
|
"eval_steps_per_second": 2.646, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 2.959608316421509, |
|
"learning_rate": 2.9672214011007087e-05, |
|
"loss": 1.0892, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_loss": 1.0621881484985352, |
|
"eval_runtime": 3.01, |
|
"eval_samples_per_second": 51.827, |
|
"eval_steps_per_second": 2.658, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.168, |
|
"grad_norm": 2.3781332969665527, |
|
"learning_rate": 2.9579415008678196e-05, |
|
"loss": 1.1321, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.168, |
|
"eval_loss": 1.0873780250549316, |
|
"eval_runtime": 3.0093, |
|
"eval_samples_per_second": 51.839, |
|
"eval_steps_per_second": 2.658, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 3.1061549186706543, |
|
"learning_rate": 2.9475247417084672e-05, |
|
"loss": 1.1245, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"eval_loss": 1.0836576223373413, |
|
"eval_runtime": 3.0092, |
|
"eval_samples_per_second": 51.842, |
|
"eval_steps_per_second": 2.659, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.184, |
|
"grad_norm": 3.747018337249756, |
|
"learning_rate": 2.9359792462981007e-05, |
|
"loss": 1.1511, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.184, |
|
"eval_loss": 1.0861718654632568, |
|
"eval_runtime": 3.0167, |
|
"eval_samples_per_second": 51.713, |
|
"eval_steps_per_second": 2.652, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 2.8736958503723145, |
|
"learning_rate": 2.923314017466745e-05, |
|
"loss": 1.1321, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"eval_loss": 1.0627102851867676, |
|
"eval_runtime": 3.0136, |
|
"eval_samples_per_second": 51.765, |
|
"eval_steps_per_second": 2.655, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 2.3447535037994385, |
|
"learning_rate": 2.9095389311788626e-05, |
|
"loss": 1.0972, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 1.0548475980758667, |
|
"eval_runtime": 3.011, |
|
"eval_samples_per_second": 51.811, |
|
"eval_steps_per_second": 2.657, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 2.6532089710235596, |
|
"learning_rate": 2.894664728832377e-05, |
|
"loss": 1.1624, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"eval_loss": 1.059614896774292, |
|
"eval_runtime": 3.0099, |
|
"eval_samples_per_second": 51.829, |
|
"eval_steps_per_second": 2.658, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.216, |
|
"grad_norm": 3.049149751663208, |
|
"learning_rate": 2.8787030088828517e-05, |
|
"loss": 1.089, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.216, |
|
"eval_loss": 1.0867388248443604, |
|
"eval_runtime": 3.0057, |
|
"eval_samples_per_second": 51.901, |
|
"eval_steps_per_second": 2.662, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 2.3557188510894775, |
|
"learning_rate": 2.8616662177993633e-05, |
|
"loss": 1.0937, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"eval_loss": 1.0865511894226074, |
|
"eval_runtime": 3.0136, |
|
"eval_samples_per_second": 51.766, |
|
"eval_steps_per_second": 2.655, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.232, |
|
"grad_norm": 2.5818262100219727, |
|
"learning_rate": 2.8435676403591193e-05, |
|
"loss": 1.0708, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.232, |
|
"eval_loss": 1.0967109203338623, |
|
"eval_runtime": 3.0116, |
|
"eval_samples_per_second": 51.8, |
|
"eval_steps_per_second": 2.656, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 3.1398000717163086, |
|
"learning_rate": 2.8244213892883907e-05, |
|
"loss": 1.0921, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_loss": 1.1191679239273071, |
|
"eval_runtime": 3.0116, |
|
"eval_samples_per_second": 51.8, |
|
"eval_steps_per_second": 2.656, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.248, |
|
"grad_norm": 2.4209301471710205, |
|
"learning_rate": 2.8042423942578285e-05, |
|
"loss": 1.1155, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.248, |
|
"eval_loss": 1.1559284925460815, |
|
"eval_runtime": 3.0159, |
|
"eval_samples_per_second": 51.725, |
|
"eval_steps_per_second": 2.653, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 2.8402583599090576, |
|
"learning_rate": 2.78304639024076e-05, |
|
"loss": 1.1269, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"eval_loss": 1.1549513339996338, |
|
"eval_runtime": 3.0068, |
|
"eval_samples_per_second": 51.882, |
|
"eval_steps_per_second": 2.661, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.264, |
|
"grad_norm": 2.46132755279541, |
|
"learning_rate": 2.7608499052435265e-05, |
|
"loss": 1.1088, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.264, |
|
"eval_loss": 1.1405725479125977, |
|
"eval_runtime": 3.0049, |
|
"eval_samples_per_second": 51.914, |
|
"eval_steps_per_second": 2.662, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 2.852588176727295, |
|
"learning_rate": 2.7376702474174428e-05, |
|
"loss": 1.0859, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"eval_loss": 1.1068381071090698, |
|
"eval_runtime": 3.0037, |
|
"eval_samples_per_second": 51.936, |
|
"eval_steps_per_second": 2.663, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 2.3993775844573975, |
|
"learning_rate": 2.7135254915624213e-05, |
|
"loss": 1.1132, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"eval_loss": 1.1167492866516113, |
|
"eval_runtime": 3.0024, |
|
"eval_samples_per_second": 51.959, |
|
"eval_steps_per_second": 2.665, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 2.4123260974884033, |
|
"learning_rate": 2.688434465032786e-05, |
|
"loss": 1.0702, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"eval_loss": 1.0937358140945435, |
|
"eval_runtime": 3.0087, |
|
"eval_samples_per_second": 51.849, |
|
"eval_steps_per_second": 2.659, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.296, |
|
"grad_norm": 2.667454719543457, |
|
"learning_rate": 2.6624167330562697e-05, |
|
"loss": 1.1427, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.296, |
|
"eval_loss": 1.1052156686782837, |
|
"eval_runtime": 3.0045, |
|
"eval_samples_per_second": 51.922, |
|
"eval_steps_per_second": 2.663, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 2.4109561443328857, |
|
"learning_rate": 2.6354925834776346e-05, |
|
"loss": 1.1124, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"eval_loss": 1.1146537065505981, |
|
"eval_runtime": 3.0047, |
|
"eval_samples_per_second": 51.918, |
|
"eval_steps_per_second": 2.662, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.312, |
|
"grad_norm": 2.661153554916382, |
|
"learning_rate": 2.607683010938826e-05, |
|
"loss": 1.1431, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.312, |
|
"eval_loss": 1.139147400856018, |
|
"eval_runtime": 3.0049, |
|
"eval_samples_per_second": 51.915, |
|
"eval_steps_per_second": 2.662, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 2.762219190597534, |
|
"learning_rate": 2.5790097005079766e-05, |
|
"loss": 1.1471, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_loss": 1.1363451480865479, |
|
"eval_runtime": 3.012, |
|
"eval_samples_per_second": 51.792, |
|
"eval_steps_per_second": 2.656, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.328, |
|
"grad_norm": 2.473832368850708, |
|
"learning_rate": 2.5494950107700482e-05, |
|
"loss": 1.1466, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.328, |
|
"eval_loss": 1.1255215406417847, |
|
"eval_runtime": 3.0084, |
|
"eval_samples_per_second": 51.855, |
|
"eval_steps_per_second": 2.659, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 2.8688554763793945, |
|
"learning_rate": 2.519161956392275e-05, |
|
"loss": 1.0485, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"eval_loss": 1.1306232213974, |
|
"eval_runtime": 3.0061, |
|
"eval_samples_per_second": 51.894, |
|
"eval_steps_per_second": 2.661, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.344, |
|
"grad_norm": 2.7669804096221924, |
|
"learning_rate": 2.4880341901780205e-05, |
|
"loss": 1.0817, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.344, |
|
"eval_loss": 1.124632716178894, |
|
"eval_runtime": 3.0118, |
|
"eval_samples_per_second": 51.797, |
|
"eval_steps_per_second": 2.656, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 2.8019163608551025, |
|
"learning_rate": 2.4561359846230346e-05, |
|
"loss": 1.0918, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"eval_loss": 1.1212003231048584, |
|
"eval_runtime": 3.0142, |
|
"eval_samples_per_second": 51.755, |
|
"eval_steps_per_second": 2.654, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 3.220196485519409, |
|
"learning_rate": 2.4234922129884873e-05, |
|
"loss": 1.0958, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"eval_loss": 1.1051884889602661, |
|
"eval_runtime": 3.0146, |
|
"eval_samples_per_second": 51.748, |
|
"eval_steps_per_second": 2.654, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 2.7096750736236572, |
|
"learning_rate": 2.3901283299055524e-05, |
|
"loss": 1.0681, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"eval_loss": 1.0787197351455688, |
|
"eval_runtime": 3.0111, |
|
"eval_samples_per_second": 51.809, |
|
"eval_steps_per_second": 2.657, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.376, |
|
"grad_norm": 2.8523542881011963, |
|
"learning_rate": 2.356070351526648e-05, |
|
"loss": 1.095, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.376, |
|
"eval_loss": 1.0948532819747925, |
|
"eval_runtime": 3.004, |
|
"eval_samples_per_second": 51.931, |
|
"eval_steps_per_second": 2.663, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 3.2367355823516846, |
|
"learning_rate": 2.3213448352388256e-05, |
|
"loss": 1.0575, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"eval_loss": 1.082040786743164, |
|
"eval_runtime": 3.0033, |
|
"eval_samples_per_second": 51.943, |
|
"eval_steps_per_second": 2.664, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.392, |
|
"grad_norm": 2.410609006881714, |
|
"learning_rate": 2.285978858955119e-05, |
|
"loss": 1.0265, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.392, |
|
"eval_loss": 1.0808920860290527, |
|
"eval_runtime": 3.0115, |
|
"eval_samples_per_second": 51.801, |
|
"eval_steps_per_second": 2.656, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 3.2114365100860596, |
|
"learning_rate": 2.25e-05, |
|
"loss": 1.0819, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 1.0693764686584473, |
|
"eval_runtime": 3.009, |
|
"eval_samples_per_second": 51.844, |
|
"eval_steps_per_second": 2.659, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1250, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"total_flos": 2.715405190483149e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|