|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9631901840490797, |
|
"eval_steps": 41, |
|
"global_step": 326, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.006134969325153374, |
|
"grad_norm": 0.10048680007457733, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.6143, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.006134969325153374, |
|
"eval_loss": 0.49658429622650146, |
|
"eval_runtime": 22.5462, |
|
"eval_samples_per_second": 8.161, |
|
"eval_steps_per_second": 1.02, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.012269938650306749, |
|
"grad_norm": 0.08257655799388885, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.4536, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.018404907975460124, |
|
"grad_norm": 0.10207108408212662, |
|
"learning_rate": 3e-06, |
|
"loss": 0.5374, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.024539877300613498, |
|
"grad_norm": 0.07208588719367981, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.4189, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.03067484662576687, |
|
"grad_norm": 0.08989891409873962, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4773, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.03680981595092025, |
|
"grad_norm": 0.07754336297512054, |
|
"learning_rate": 6e-06, |
|
"loss": 0.5562, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.04294478527607362, |
|
"grad_norm": 0.10505057871341705, |
|
"learning_rate": 7e-06, |
|
"loss": 0.5093, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.049079754601226995, |
|
"grad_norm": 0.07071765512228012, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.4337, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.05521472392638037, |
|
"grad_norm": 0.0959077998995781, |
|
"learning_rate": 9e-06, |
|
"loss": 0.5716, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.06134969325153374, |
|
"grad_norm": 0.08926673978567123, |
|
"learning_rate": 1e-05, |
|
"loss": 0.5525, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.06748466257668712, |
|
"grad_norm": 0.09146387130022049, |
|
"learning_rate": 9.999752906107043e-06, |
|
"loss": 0.4638, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.0736196319018405, |
|
"grad_norm": 0.08329001069068909, |
|
"learning_rate": 9.999011648850328e-06, |
|
"loss": 0.3994, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.07975460122699386, |
|
"grad_norm": 0.09074167907238007, |
|
"learning_rate": 9.997776301493914e-06, |
|
"loss": 0.6013, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.08588957055214724, |
|
"grad_norm": 0.08691710978746414, |
|
"learning_rate": 9.99604698613651e-06, |
|
"loss": 0.4411, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.09202453987730061, |
|
"grad_norm": 0.09161891788244247, |
|
"learning_rate": 9.993823873699427e-06, |
|
"loss": 0.5256, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.09815950920245399, |
|
"grad_norm": 0.10627757757902145, |
|
"learning_rate": 9.991107183909665e-06, |
|
"loss": 0.6318, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.10429447852760736, |
|
"grad_norm": 0.09262832999229431, |
|
"learning_rate": 9.98789718527821e-06, |
|
"loss": 0.5003, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.11042944785276074, |
|
"grad_norm": 0.11589068174362183, |
|
"learning_rate": 9.98419419507348e-06, |
|
"loss": 0.7078, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.1165644171779141, |
|
"grad_norm": 0.10624095797538757, |
|
"learning_rate": 9.979998579289985e-06, |
|
"loss": 0.6702, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.12269938650306748, |
|
"grad_norm": 0.06414097547531128, |
|
"learning_rate": 9.975310752612138e-06, |
|
"loss": 0.353, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.12883435582822086, |
|
"grad_norm": 0.11902766674757004, |
|
"learning_rate": 9.970131178373276e-06, |
|
"loss": 0.5749, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.13496932515337423, |
|
"grad_norm": 0.09129296988248825, |
|
"learning_rate": 9.964460368509868e-06, |
|
"loss": 0.4984, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.1411042944785276, |
|
"grad_norm": 0.09726156294345856, |
|
"learning_rate": 9.958298883510904e-06, |
|
"loss": 0.476, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.147239263803681, |
|
"grad_norm": 0.09123346209526062, |
|
"learning_rate": 9.951647332362511e-06, |
|
"loss": 0.4627, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.15337423312883436, |
|
"grad_norm": 0.11005334556102753, |
|
"learning_rate": 9.944506372487754e-06, |
|
"loss": 0.6342, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.15950920245398773, |
|
"grad_norm": 0.08414352685213089, |
|
"learning_rate": 9.936876709681668e-06, |
|
"loss": 0.4311, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.1656441717791411, |
|
"grad_norm": 0.07989637553691864, |
|
"learning_rate": 9.928759098041482e-06, |
|
"loss": 0.5064, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.17177914110429449, |
|
"grad_norm": 0.08479982614517212, |
|
"learning_rate": 9.920154339892104e-06, |
|
"loss": 0.406, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.17791411042944785, |
|
"grad_norm": 0.09588364511728287, |
|
"learning_rate": 9.911063285706808e-06, |
|
"loss": 0.5324, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.18404907975460122, |
|
"grad_norm": 0.10101877897977829, |
|
"learning_rate": 9.901486834023182e-06, |
|
"loss": 0.5151, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.1901840490797546, |
|
"grad_norm": 0.10507562756538391, |
|
"learning_rate": 9.891425931354316e-06, |
|
"loss": 0.5903, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.19631901840490798, |
|
"grad_norm": 0.09090801328420639, |
|
"learning_rate": 9.880881572095255e-06, |
|
"loss": 0.326, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.20245398773006135, |
|
"grad_norm": 0.08976439386606216, |
|
"learning_rate": 9.869854798424709e-06, |
|
"loss": 0.3729, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.2085889570552147, |
|
"grad_norm": 0.10010894387960434, |
|
"learning_rate": 9.85834670020205e-06, |
|
"loss": 0.4083, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.2147239263803681, |
|
"grad_norm": 0.11611686646938324, |
|
"learning_rate": 9.846358414859598e-06, |
|
"loss": 0.46, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.22085889570552147, |
|
"grad_norm": 0.08281069248914719, |
|
"learning_rate": 9.833891127290186e-06, |
|
"loss": 0.4027, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.22699386503067484, |
|
"grad_norm": 0.07023394107818604, |
|
"learning_rate": 9.820946069730067e-06, |
|
"loss": 0.3775, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.2331288343558282, |
|
"grad_norm": 0.07550126314163208, |
|
"learning_rate": 9.807524521637103e-06, |
|
"loss": 0.3316, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.2392638036809816, |
|
"grad_norm": 0.09305860102176666, |
|
"learning_rate": 9.793627809564324e-06, |
|
"loss": 0.5796, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.24539877300613497, |
|
"grad_norm": 0.07182462513446808, |
|
"learning_rate": 9.779257307028805e-06, |
|
"loss": 0.389, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.25153374233128833, |
|
"grad_norm": 0.0985904186964035, |
|
"learning_rate": 9.76441443437591e-06, |
|
"loss": 0.5509, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.25153374233128833, |
|
"eval_loss": 0.4608064889907837, |
|
"eval_runtime": 22.8858, |
|
"eval_samples_per_second": 8.04, |
|
"eval_steps_per_second": 1.005, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.25766871165644173, |
|
"grad_norm": 0.07043192535638809, |
|
"learning_rate": 9.749100658638914e-06, |
|
"loss": 0.3804, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.26380368098159507, |
|
"grad_norm": 0.0967957079410553, |
|
"learning_rate": 9.733317493394004e-06, |
|
"loss": 0.5316, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.26993865030674846, |
|
"grad_norm": 0.07897109538316727, |
|
"learning_rate": 9.717066498610673e-06, |
|
"loss": 0.4192, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.27607361963190186, |
|
"grad_norm": 0.10683301091194153, |
|
"learning_rate": 9.700349280497552e-06, |
|
"loss": 0.5923, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.2822085889570552, |
|
"grad_norm": 0.07347994297742844, |
|
"learning_rate": 9.68316749134364e-06, |
|
"loss": 0.3696, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.2883435582822086, |
|
"grad_norm": 0.0915137529373169, |
|
"learning_rate": 9.665522829355005e-06, |
|
"loss": 0.5799, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.294478527607362, |
|
"grad_norm": 0.08099963515996933, |
|
"learning_rate": 9.647417038486936e-06, |
|
"loss": 0.4504, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.3006134969325153, |
|
"grad_norm": 0.11759970337152481, |
|
"learning_rate": 9.628851908271572e-06, |
|
"loss": 0.6128, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.3067484662576687, |
|
"grad_norm": 0.11251598596572876, |
|
"learning_rate": 9.609829273641034e-06, |
|
"loss": 0.5497, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3128834355828221, |
|
"grad_norm": 0.08890489488840103, |
|
"learning_rate": 9.590351014746059e-06, |
|
"loss": 0.5293, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.31901840490797545, |
|
"grad_norm": 0.08688578754663467, |
|
"learning_rate": 9.570419056770174e-06, |
|
"loss": 0.4923, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.32515337423312884, |
|
"grad_norm": 0.08014731109142303, |
|
"learning_rate": 9.550035369739416e-06, |
|
"loss": 0.4639, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.3312883435582822, |
|
"grad_norm": 0.08636078238487244, |
|
"learning_rate": 9.529201968327618e-06, |
|
"loss": 0.3038, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.3374233128834356, |
|
"grad_norm": 0.06179236248135567, |
|
"learning_rate": 9.50792091165728e-06, |
|
"loss": 0.3302, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.34355828220858897, |
|
"grad_norm": 0.06599953025579453, |
|
"learning_rate": 9.486194303096062e-06, |
|
"loss": 0.2632, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.3496932515337423, |
|
"grad_norm": 0.06800541281700134, |
|
"learning_rate": 9.464024290048879e-06, |
|
"loss": 0.4134, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.3558282208588957, |
|
"grad_norm": 0.07703054696321487, |
|
"learning_rate": 9.44141306374566e-06, |
|
"loss": 0.4175, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.3619631901840491, |
|
"grad_norm": 0.06939573585987091, |
|
"learning_rate": 9.418362859024781e-06, |
|
"loss": 0.3076, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.36809815950920244, |
|
"grad_norm": 0.07432107627391815, |
|
"learning_rate": 9.39487595411217e-06, |
|
"loss": 0.4667, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.37423312883435583, |
|
"grad_norm": 0.08580990135669708, |
|
"learning_rate": 9.37095467039613e-06, |
|
"loss": 0.4878, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.3803680981595092, |
|
"grad_norm": 0.07800393551588058, |
|
"learning_rate": 9.346601372197914e-06, |
|
"loss": 0.4484, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.38650306748466257, |
|
"grad_norm": 0.08790121227502823, |
|
"learning_rate": 9.32181846653802e-06, |
|
"loss": 0.5667, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.39263803680981596, |
|
"grad_norm": 0.07062779366970062, |
|
"learning_rate": 9.296608402898306e-06, |
|
"loss": 0.3125, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.3987730061349693, |
|
"grad_norm": 0.08514374494552612, |
|
"learning_rate": 9.270973672979877e-06, |
|
"loss": 0.5389, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.4049079754601227, |
|
"grad_norm": 0.09363362193107605, |
|
"learning_rate": 9.244916810456822e-06, |
|
"loss": 0.4613, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.4110429447852761, |
|
"grad_norm": 0.08358146995306015, |
|
"learning_rate": 9.218440390725772e-06, |
|
"loss": 0.422, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.4171779141104294, |
|
"grad_norm": 0.06681355088949203, |
|
"learning_rate": 9.191547030651383e-06, |
|
"loss": 0.4099, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.4233128834355828, |
|
"grad_norm": 0.07471272349357605, |
|
"learning_rate": 9.164239388307668e-06, |
|
"loss": 0.3876, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.4294478527607362, |
|
"grad_norm": 0.08724360167980194, |
|
"learning_rate": 9.136520162715288e-06, |
|
"loss": 0.493, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.43558282208588955, |
|
"grad_norm": 0.06159456446766853, |
|
"learning_rate": 9.108392093574785e-06, |
|
"loss": 0.2673, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.44171779141104295, |
|
"grad_norm": 0.0887589380145073, |
|
"learning_rate": 9.079857960995806e-06, |
|
"loss": 0.4282, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.44785276073619634, |
|
"grad_norm": 0.08436176180839539, |
|
"learning_rate": 9.050920585222309e-06, |
|
"loss": 0.503, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.4539877300613497, |
|
"grad_norm": 0.08223728090524673, |
|
"learning_rate": 9.021582826353825e-06, |
|
"loss": 0.5367, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.4601226993865031, |
|
"grad_norm": 0.07190907746553421, |
|
"learning_rate": 8.991847584062776e-06, |
|
"loss": 0.4085, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.4662576687116564, |
|
"grad_norm": 0.0850868821144104, |
|
"learning_rate": 8.961717797307872e-06, |
|
"loss": 0.4974, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.4723926380368098, |
|
"grad_norm": 0.1070251539349556, |
|
"learning_rate": 8.931196444043635e-06, |
|
"loss": 0.5619, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.4785276073619632, |
|
"grad_norm": 0.07099828869104385, |
|
"learning_rate": 8.900286540926062e-06, |
|
"loss": 0.431, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.48466257668711654, |
|
"grad_norm": 0.06902176886796951, |
|
"learning_rate": 8.868991143014469e-06, |
|
"loss": 0.3375, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.49079754601226994, |
|
"grad_norm": 0.11233729869127274, |
|
"learning_rate": 8.83731334346954e-06, |
|
"loss": 0.5159, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.49693251533742333, |
|
"grad_norm": 0.07816348224878311, |
|
"learning_rate": 8.805256273247597e-06, |
|
"loss": 0.4657, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.5030674846625767, |
|
"grad_norm": 0.08207116276025772, |
|
"learning_rate": 8.772823100791152e-06, |
|
"loss": 0.5891, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.5030674846625767, |
|
"eval_loss": 0.4219532012939453, |
|
"eval_runtime": 52.8567, |
|
"eval_samples_per_second": 3.481, |
|
"eval_steps_per_second": 0.435, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.50920245398773, |
|
"grad_norm": 0.1063007116317749, |
|
"learning_rate": 8.74001703171574e-06, |
|
"loss": 0.4719, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.5153374233128835, |
|
"grad_norm": 0.06525219976902008, |
|
"learning_rate": 8.706841308493092e-06, |
|
"loss": 0.2945, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.5214723926380368, |
|
"grad_norm": 0.084320567548275, |
|
"learning_rate": 8.673299210130647e-06, |
|
"loss": 0.5193, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.5276073619631901, |
|
"grad_norm": 0.09182199090719223, |
|
"learning_rate": 8.639394051847472e-06, |
|
"loss": 0.4065, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.5337423312883436, |
|
"grad_norm": 0.08341530710458755, |
|
"learning_rate": 8.605129184746586e-06, |
|
"loss": 0.5126, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.5398773006134969, |
|
"grad_norm": 0.0803583562374115, |
|
"learning_rate": 8.57050799548375e-06, |
|
"loss": 0.4048, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.5460122699386503, |
|
"grad_norm": 0.10489048063755035, |
|
"learning_rate": 8.535533905932739e-06, |
|
"loss": 0.4255, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.5521472392638037, |
|
"grad_norm": 0.09766017645597458, |
|
"learning_rate": 8.500210372847128e-06, |
|
"loss": 0.5091, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.558282208588957, |
|
"grad_norm": 0.06364760547876358, |
|
"learning_rate": 8.464540887518638e-06, |
|
"loss": 0.2591, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.5644171779141104, |
|
"grad_norm": 0.07464785873889923, |
|
"learning_rate": 8.428528975432067e-06, |
|
"loss": 0.3407, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.5705521472392638, |
|
"grad_norm": 0.08410008996725082, |
|
"learning_rate": 8.392178195916832e-06, |
|
"loss": 0.4741, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.5766871165644172, |
|
"grad_norm": 0.08058732002973557, |
|
"learning_rate": 8.355492141795185e-06, |
|
"loss": 0.373, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.5828220858895705, |
|
"grad_norm": 0.10042490810155869, |
|
"learning_rate": 8.318474439027096e-06, |
|
"loss": 0.4643, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.588957055214724, |
|
"grad_norm": 0.06268122047185898, |
|
"learning_rate": 8.281128746351878e-06, |
|
"loss": 0.3071, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.5950920245398773, |
|
"grad_norm": 0.08433736115694046, |
|
"learning_rate": 8.24345875492657e-06, |
|
"loss": 0.4042, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.6012269938650306, |
|
"grad_norm": 0.10664895176887512, |
|
"learning_rate": 8.2054681879611e-06, |
|
"loss": 0.5345, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.6073619631901841, |
|
"grad_norm": 0.08828828483819962, |
|
"learning_rate": 8.167160800350306e-06, |
|
"loss": 0.4128, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.6134969325153374, |
|
"grad_norm": 0.09366817027330399, |
|
"learning_rate": 8.1285403783028e-06, |
|
"loss": 0.6111, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6196319018404908, |
|
"grad_norm": 0.07751886546611786, |
|
"learning_rate": 8.089610738966754e-06, |
|
"loss": 0.4008, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.6257668711656442, |
|
"grad_norm": 0.09029467403888702, |
|
"learning_rate": 8.050375730052622e-06, |
|
"loss": 0.5109, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.6319018404907976, |
|
"grad_norm": 0.09237638860940933, |
|
"learning_rate": 8.010839229452843e-06, |
|
"loss": 0.3751, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.6380368098159509, |
|
"grad_norm": 0.09511607140302658, |
|
"learning_rate": 7.971005144858554e-06, |
|
"loss": 0.3808, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.6441717791411042, |
|
"grad_norm": 0.07342271506786346, |
|
"learning_rate": 7.930877413373369e-06, |
|
"loss": 0.3106, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.6503067484662577, |
|
"grad_norm": 0.07932678610086441, |
|
"learning_rate": 7.890460001124242e-06, |
|
"loss": 0.3932, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.656441717791411, |
|
"grad_norm": 0.07940706610679626, |
|
"learning_rate": 7.849756902869471e-06, |
|
"loss": 0.4068, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.6625766871165644, |
|
"grad_norm": 0.09030912816524506, |
|
"learning_rate": 7.808772141603855e-06, |
|
"loss": 0.431, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.6687116564417178, |
|
"grad_norm": 0.091177798807621, |
|
"learning_rate": 7.767509768161079e-06, |
|
"loss": 0.4878, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.6748466257668712, |
|
"grad_norm": 0.09804334491491318, |
|
"learning_rate": 7.725973860813338e-06, |
|
"loss": 0.4954, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.6809815950920245, |
|
"grad_norm": 0.06426946073770523, |
|
"learning_rate": 7.684168524868253e-06, |
|
"loss": 0.2363, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.6871165644171779, |
|
"grad_norm": 0.06771596521139145, |
|
"learning_rate": 7.642097892263098e-06, |
|
"loss": 0.3587, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.6932515337423313, |
|
"grad_norm": 0.08947895467281342, |
|
"learning_rate": 7.599766121156436e-06, |
|
"loss": 0.3753, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.6993865030674846, |
|
"grad_norm": 0.06724036484956741, |
|
"learning_rate": 7.5571773955171124e-06, |
|
"loss": 0.3099, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.7055214723926381, |
|
"grad_norm": 0.08312314003705978, |
|
"learning_rate": 7.5143359247107314e-06, |
|
"loss": 0.3134, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.7116564417177914, |
|
"grad_norm": 0.0804123803973198, |
|
"learning_rate": 7.471245943083615e-06, |
|
"loss": 0.3077, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.7177914110429447, |
|
"grad_norm": 0.08391325175762177, |
|
"learning_rate": 7.427911709544288e-06, |
|
"loss": 0.416, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.7239263803680982, |
|
"grad_norm": 0.067812480032444, |
|
"learning_rate": 7.3843375071425315e-06, |
|
"loss": 0.2535, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.7300613496932515, |
|
"grad_norm": 0.103757344186306, |
|
"learning_rate": 7.340527642646069e-06, |
|
"loss": 0.5859, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.7361963190184049, |
|
"grad_norm": 0.07385145872831345, |
|
"learning_rate": 7.2964864461148895e-06, |
|
"loss": 0.3527, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7423312883435583, |
|
"grad_norm": 0.08877512067556381, |
|
"learning_rate": 7.252218270473274e-06, |
|
"loss": 0.5138, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.7484662576687117, |
|
"grad_norm": 0.06157367303967476, |
|
"learning_rate": 7.2077274910795605e-06, |
|
"loss": 0.2458, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.754601226993865, |
|
"grad_norm": 0.17377746105194092, |
|
"learning_rate": 7.163018505293703e-06, |
|
"loss": 0.3547, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.754601226993865, |
|
"eval_loss": 0.40320152044296265, |
|
"eval_runtime": 22.6808, |
|
"eval_samples_per_second": 8.113, |
|
"eval_steps_per_second": 1.014, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.7607361963190185, |
|
"grad_norm": 0.08620224893093109, |
|
"learning_rate": 7.118095732042643e-06, |
|
"loss": 0.5084, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.7668711656441718, |
|
"grad_norm": 0.07054693251848221, |
|
"learning_rate": 7.072963611383545e-06, |
|
"loss": 0.2391, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.7730061349693251, |
|
"grad_norm": 0.1339060664176941, |
|
"learning_rate": 7.02762660406497e-06, |
|
"loss": 0.6351, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.7791411042944786, |
|
"grad_norm": 0.07386178523302078, |
|
"learning_rate": 6.982089191085971e-06, |
|
"loss": 0.3047, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.7852760736196319, |
|
"grad_norm": 0.10029296576976776, |
|
"learning_rate": 6.936355873253207e-06, |
|
"loss": 0.4328, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.7914110429447853, |
|
"grad_norm": 0.09643431752920151, |
|
"learning_rate": 6.8904311707360914e-06, |
|
"loss": 0.4234, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.7975460122699386, |
|
"grad_norm": 0.08629649877548218, |
|
"learning_rate": 6.844319622620039e-06, |
|
"loss": 0.3454, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.803680981595092, |
|
"grad_norm": 0.09917541593313217, |
|
"learning_rate": 6.798025786457825e-06, |
|
"loss": 0.4828, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.8098159509202454, |
|
"grad_norm": 0.08652715384960175, |
|
"learning_rate": 6.751554237819122e-06, |
|
"loss": 0.3921, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.8159509202453987, |
|
"grad_norm": 0.08848355710506439, |
|
"learning_rate": 6.704909569838281e-06, |
|
"loss": 0.3573, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.8220858895705522, |
|
"grad_norm": 0.09000125527381897, |
|
"learning_rate": 6.65809639276034e-06, |
|
"loss": 0.4486, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.8282208588957055, |
|
"grad_norm": 0.08975492417812347, |
|
"learning_rate": 6.611119333485364e-06, |
|
"loss": 0.3867, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.8343558282208589, |
|
"grad_norm": 0.0796804279088974, |
|
"learning_rate": 6.563983035111136e-06, |
|
"loss": 0.381, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.8404907975460123, |
|
"grad_norm": 0.08723866939544678, |
|
"learning_rate": 6.516692156474243e-06, |
|
"loss": 0.4316, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.8466257668711656, |
|
"grad_norm": 0.11449507623910904, |
|
"learning_rate": 6.469251371689606e-06, |
|
"loss": 0.5042, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.852760736196319, |
|
"grad_norm": 0.127399280667305, |
|
"learning_rate": 6.421665369688501e-06, |
|
"loss": 0.56, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.8588957055214724, |
|
"grad_norm": 0.0761680155992508, |
|
"learning_rate": 6.373938853755126e-06, |
|
"loss": 0.2881, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.8650306748466258, |
|
"grad_norm": 0.08487329632043839, |
|
"learning_rate": 6.326076541061729e-06, |
|
"loss": 0.3171, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.8711656441717791, |
|
"grad_norm": 0.09721650928258896, |
|
"learning_rate": 6.278083162202374e-06, |
|
"loss": 0.4289, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.8773006134969326, |
|
"grad_norm": 0.08022255450487137, |
|
"learning_rate": 6.22996346072539e-06, |
|
"loss": 0.327, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.8834355828220859, |
|
"grad_norm": 0.11367864906787872, |
|
"learning_rate": 6.181722192664526e-06, |
|
"loss": 0.4132, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.8895705521472392, |
|
"grad_norm": 0.08727509528398514, |
|
"learning_rate": 6.133364126068867e-06, |
|
"loss": 0.3467, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.8957055214723927, |
|
"grad_norm": 0.09527825564146042, |
|
"learning_rate": 6.084894040531591e-06, |
|
"loss": 0.5279, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.901840490797546, |
|
"grad_norm": 0.07519163191318512, |
|
"learning_rate": 6.036316726717546e-06, |
|
"loss": 0.3221, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.9079754601226994, |
|
"grad_norm": 0.08815211802721024, |
|
"learning_rate": 5.987636985889764e-06, |
|
"loss": 0.347, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.9141104294478528, |
|
"grad_norm": 0.10302853584289551, |
|
"learning_rate": 5.938859629434913e-06, |
|
"loss": 0.4378, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.9202453987730062, |
|
"grad_norm": 0.09466961026191711, |
|
"learning_rate": 5.8899894783877536e-06, |
|
"loss": 0.3053, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9263803680981595, |
|
"grad_norm": 0.09226138889789581, |
|
"learning_rate": 5.841031362954629e-06, |
|
"loss": 0.4522, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.9325153374233128, |
|
"grad_norm": 0.10703295469284058, |
|
"learning_rate": 5.791990122036075e-06, |
|
"loss": 0.4592, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.9386503067484663, |
|
"grad_norm": 0.1230417788028717, |
|
"learning_rate": 5.742870602748547e-06, |
|
"loss": 0.5402, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.9447852760736196, |
|
"grad_norm": 0.09710447490215302, |
|
"learning_rate": 5.693677659945343e-06, |
|
"loss": 0.4628, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.950920245398773, |
|
"grad_norm": 0.11172331869602203, |
|
"learning_rate": 5.6444161557367534e-06, |
|
"loss": 0.3569, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.9570552147239264, |
|
"grad_norm": 0.09823194146156311, |
|
"learning_rate": 5.595090959009525e-06, |
|
"loss": 0.4716, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.9631901840490797, |
|
"grad_norm": 0.08101484179496765, |
|
"learning_rate": 5.5457069449456055e-06, |
|
"loss": 0.4065, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.9693251533742331, |
|
"grad_norm": 0.08857468515634537, |
|
"learning_rate": 5.496268994540309e-06, |
|
"loss": 0.3606, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.9754601226993865, |
|
"grad_norm": 0.07157395780086517, |
|
"learning_rate": 5.446781994119886e-06, |
|
"loss": 0.2453, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.9815950920245399, |
|
"grad_norm": 0.07760554552078247, |
|
"learning_rate": 5.397250834858573e-06, |
|
"loss": 0.364, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.9877300613496932, |
|
"grad_norm": 0.07792805880308151, |
|
"learning_rate": 5.347680412295152e-06, |
|
"loss": 0.3496, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.9938650306748467, |
|
"grad_norm": 0.08258286118507385, |
|
"learning_rate": 5.2980756258491e-06, |
|
"loss": 0.3455, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.11803517490625381, |
|
"learning_rate": 5.2484413783363335e-06, |
|
"loss": 0.4871, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.0061349693251533, |
|
"grad_norm": 0.10544019937515259, |
|
"learning_rate": 5.19878257548463e-06, |
|
"loss": 0.4369, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.0061349693251533, |
|
"eval_loss": 0.39176106452941895, |
|
"eval_runtime": 22.8177, |
|
"eval_samples_per_second": 8.064, |
|
"eval_steps_per_second": 1.008, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.0122699386503067, |
|
"grad_norm": 0.09844586253166199, |
|
"learning_rate": 5.149104125448752e-06, |
|
"loss": 0.4385, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.01840490797546, |
|
"grad_norm": 0.07265810668468475, |
|
"learning_rate": 5.099410938325351e-06, |
|
"loss": 0.2595, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.0245398773006136, |
|
"grad_norm": 0.12025801837444305, |
|
"learning_rate": 5.04970792566765e-06, |
|
"loss": 0.5587, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.030674846625767, |
|
"grad_norm": 0.1306670904159546, |
|
"learning_rate": 5e-06, |
|
"loss": 0.603, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.0368098159509203, |
|
"grad_norm": 0.10355333238840103, |
|
"learning_rate": 4.9502920743323525e-06, |
|
"loss": 0.3555, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.0429447852760736,
|
"grad_norm": 0.10698702186346054, |
|
"learning_rate": 4.900589061674649e-06, |
|
"loss": 0.4452, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.049079754601227,
|
"grad_norm": 0.0719093456864357, |
|
"learning_rate": 4.850895874551248e-06, |
|
"loss": 0.3209, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.0552147239263803,
|
"grad_norm": 0.07391523569822311, |
|
"learning_rate": 4.801217424515373e-06, |
|
"loss": 0.2961, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.0613496932515338,
|
"grad_norm": 0.07925672829151154, |
|
"learning_rate": 4.751558621663668e-06, |
|
"loss": 0.3446, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.0674846625766872,
|
"grad_norm": 0.10210183262825012, |
|
"learning_rate": 4.701924374150901e-06, |
|
"loss": 0.49, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.0736196319018405,
|
"grad_norm": 0.11506979912519455, |
|
"learning_rate": 4.6523195877048495e-06, |
|
"loss": 0.2977, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.0797546012269938,
|
"grad_norm": 0.09373599290847778, |
|
"learning_rate": 4.602749165141429e-06, |
|
"loss": 0.3853, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.0858895705521472,
|
"grad_norm": 0.09830620139837265, |
|
"learning_rate": 4.5532180058801145e-06, |
|
"loss": 0.4363, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.0920245398773005,
|
"grad_norm": 0.08626256883144379, |
|
"learning_rate": 4.5037310054596936e-06, |
|
"loss": 0.4265, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.098159509202454,
|
"grad_norm": 0.11579606682062149, |
|
"learning_rate": 4.454293055054397e-06, |
|
"loss": 0.5565, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.1042944785276074,
|
"grad_norm": 0.10661870241165161, |
|
"learning_rate": 4.404909040990477e-06, |
|
"loss": 0.5136, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.1104294478527608,
|
"grad_norm": 0.09875867515802383, |
|
"learning_rate": 4.355583844263247e-06, |
|
"loss": 0.5337, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.116564417177914,
|
"grad_norm": 0.2183881402015686, |
|
"learning_rate": 4.30632234005466e-06, |
|
"loss": 0.3483, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.1226993865030674,
|
"grad_norm": 0.0771089717745781, |
|
"learning_rate": 4.257129397251453e-06, |
|
"loss": 0.3027, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.1288343558282208,
|
"grad_norm": 0.10682545602321625, |
|
"learning_rate": 4.2080098779639255e-06, |
|
"loss": 0.4408, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.1349693251533743,
|
"grad_norm": 0.08865021914243698, |
|
"learning_rate": 4.158968637045374e-06, |
|
"loss": 0.4054, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.1411042944785277,
|
"grad_norm": 0.11121812462806702, |
|
"learning_rate": 4.11001052161225e-06, |
|
"loss": 0.3993, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.147239263803681,
|
"grad_norm": 0.10253646969795227, |
|
"learning_rate": 4.061140370565088e-06, |
|
"loss": 0.4388, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.1533742331288344,
|
"grad_norm": 0.09401492774486542, |
|
"learning_rate": 4.012363014110237e-06, |
|
"loss": 0.4846, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.1595092024539877,
|
"grad_norm": 0.1021651104092598, |
|
"learning_rate": 3.9636832732824555e-06, |
|
"loss": 0.312, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.1288343558282208, |
|
"grad_norm": 0.12308547645807266, |
|
"learning_rate": 3.91510595946841e-06, |
|
"loss": 0.5238, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.1349693251533743, |
|
"grad_norm": 0.1113848015666008, |
|
"learning_rate": 3.866635873931133e-06, |
|
"loss": 0.4818, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.1411042944785277, |
|
"grad_norm": 0.10525127500295639, |
|
"learning_rate": 3.818277807335477e-06, |
|
"loss": 0.381, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.147239263803681, |
|
"grad_norm": 0.12201809883117676, |
|
"learning_rate": 3.7700365392746106e-06, |
|
"loss": 0.3412, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.1533742331288344, |
|
"grad_norm": 0.08116313070058823, |
|
"learning_rate": 3.721916837797627e-06, |
|
"loss": 0.3486, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.1595092024539877, |
|
"grad_norm": 0.11550577729940414, |
|
"learning_rate": 3.6739234589382722e-06, |
|
"loss": 0.3337, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.165644171779141, |
|
"grad_norm": 0.08885617554187775, |
|
"learning_rate": 3.6260611462448736e-06, |
|
"loss": 0.27, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.1717791411042944, |
|
"grad_norm": 0.1086370199918747, |
|
"learning_rate": 3.5783346303114986e-06, |
|
"loss": 0.3044, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.177914110429448, |
|
"grad_norm": 0.10726229846477509, |
|
"learning_rate": 3.5307486283103966e-06, |
|
"loss": 0.4503, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.1840490797546013, |
|
"grad_norm": 0.0863179937005043, |
|
"learning_rate": 3.4833078435257584e-06, |
|
"loss": 0.2563, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.1901840490797546, |
|
"grad_norm": 0.14450977742671967, |
|
"learning_rate": 3.4360169648888653e-06, |
|
"loss": 0.5502, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.196319018404908, |
|
"grad_norm": 0.10789606720209122, |
|
"learning_rate": 3.388880666514637e-06, |
|
"loss": 0.3767, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.2024539877300613, |
|
"grad_norm": 0.11021628230810165, |
|
"learning_rate": 3.3419036072396614e-06, |
|
"loss": 0.4526, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.2085889570552146, |
|
"grad_norm": 0.10493913292884827, |
|
"learning_rate": 3.29509043016172e-06, |
|
"loss": 0.3998, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.2147239263803682, |
|
"grad_norm": 0.12165309488773346, |
|
"learning_rate": 3.2484457621808787e-06, |
|
"loss": 0.383, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.2208588957055215, |
|
"grad_norm": 0.09394080936908722, |
|
"learning_rate": 3.201974213542178e-06, |
|
"loss": 0.4129, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.2208588957055215, |
|
"eval_loss": 0.38451722264289856, |
|
"eval_runtime": 22.6218, |
|
"eval_samples_per_second": 8.134, |
|
"eval_steps_per_second": 1.017, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.2269938650306749, |
|
"grad_norm": 0.09802668541669846, |
|
"learning_rate": 3.1556803773799616e-06, |
|
"loss": 0.408, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.2331288343558282, |
|
"grad_norm": 0.13217565417289734, |
|
"learning_rate": 3.1095688292639094e-06, |
|
"loss": 0.3198, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.2392638036809815, |
|
"grad_norm": 0.10313939303159714, |
|
"learning_rate": 3.0636441267467955e-06, |
|
"loss": 0.3128, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.2453987730061349, |
|
"grad_norm": 0.0762137621641159, |
|
"learning_rate": 3.01791080891403e-06, |
|
"loss": 0.2638, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.2515337423312882, |
|
"grad_norm": 0.11147941648960114, |
|
"learning_rate": 2.972373395935031e-06, |
|
"loss": 0.4534, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.2576687116564418, |
|
"grad_norm": 0.09862508624792099, |
|
"learning_rate": 2.927036388616457e-06, |
|
"loss": 0.4292, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.2638036809815951, |
|
"grad_norm": 0.09868728369474411, |
|
"learning_rate": 2.8819042679573618e-06, |
|
"loss": 0.3276, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.2699386503067485, |
|
"grad_norm": 0.08501134812831879, |
|
"learning_rate": 2.8369814947062994e-06, |
|
"loss": 0.3095, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.2760736196319018, |
|
"grad_norm": 0.11438091844320297, |
|
"learning_rate": 2.792272508920443e-06, |
|
"loss": 0.295, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.2822085889570551, |
|
"grad_norm": 0.11010921746492386, |
|
"learning_rate": 2.7477817295267273e-06, |
|
"loss": 0.433, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.2883435582822087, |
|
"grad_norm": 0.08820465952157974, |
|
"learning_rate": 2.70351355388511e-06, |
|
"loss": 0.3045, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.294478527607362, |
|
"grad_norm": 0.08259084820747375, |
|
"learning_rate": 2.6594723573539307e-06, |
|
"loss": 0.3812, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.3006134969325154, |
|
"grad_norm": 0.11154909431934357, |
|
"learning_rate": 2.615662492857471e-06, |
|
"loss": 0.3513, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.3067484662576687, |
|
"grad_norm": 0.0986664667725563, |
|
"learning_rate": 2.5720882904557156e-06, |
|
"loss": 0.3658, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.312883435582822, |
|
"grad_norm": 0.2315550446510315, |
|
"learning_rate": 2.528754056916386e-06, |
|
"loss": 0.5219, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.3190184049079754, |
|
"grad_norm": 0.10897372663021088, |
|
"learning_rate": 2.4856640752892702e-06, |
|
"loss": 0.4178, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.3251533742331287, |
|
"grad_norm": 0.14039960503578186, |
|
"learning_rate": 2.4428226044828896e-06, |
|
"loss": 0.4113, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.331288343558282, |
|
"grad_norm": 0.10824128240346909, |
|
"learning_rate": 2.4002338788435654e-06, |
|
"loss": 0.5079, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.3374233128834356, |
|
"grad_norm": 0.10082822293043137, |
|
"learning_rate": 2.3579021077369047e-06, |
|
"loss": 0.3356, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.343558282208589, |
|
"grad_norm": 0.1202516257762909, |
|
"learning_rate": 2.315831475131751e-06, |
|
"loss": 0.3236, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.3496932515337423, |
|
"grad_norm": 0.1029471606016159, |
|
"learning_rate": 2.2740261391866634e-06, |
|
"loss": 0.3491, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.3558282208588956, |
|
"grad_norm": 0.12975585460662842, |
|
"learning_rate": 2.232490231838923e-06, |
|
"loss": 0.402, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.3619631901840492, |
|
"grad_norm": 0.12482481449842453, |
|
"learning_rate": 2.1912278583961454e-06, |
|
"loss": 0.4321, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.3680981595092025, |
|
"grad_norm": 0.1507352739572525, |
|
"learning_rate": 2.1502430971305288e-06, |
|
"loss": 0.5751, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.3742331288343559, |
|
"grad_norm": 0.0954100489616394, |
|
"learning_rate": 2.1095399988757574e-06, |
|
"loss": 0.3933, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.3803680981595092, |
|
"grad_norm": 0.10602657496929169, |
|
"learning_rate": 2.0691225866266335e-06, |
|
"loss": 0.4803, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.3865030674846626, |
|
"grad_norm": 0.11936960369348526, |
|
"learning_rate": 2.0289948551414486e-06, |
|
"loss": 0.3983, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.392638036809816, |
|
"grad_norm": 0.08489519357681274, |
|
"learning_rate": 1.989160770547159e-06, |
|
"loss": 0.2552, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.3987730061349692, |
|
"grad_norm": 0.10064487159252167, |
|
"learning_rate": 1.949624269947378e-06, |
|
"loss": 0.4482, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.4049079754601226, |
|
"grad_norm": 0.10288871824741364, |
|
"learning_rate": 1.9103892610332467e-06, |
|
"loss": 0.3787, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.4110429447852761, |
|
"grad_norm": 0.09945371747016907, |
|
"learning_rate": 1.8714596216972008e-06, |
|
"loss": 0.3845, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.4171779141104295, |
|
"grad_norm": 0.11584262549877167, |
|
"learning_rate": 1.8328391996496942e-06, |
|
"loss": 0.5336, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.4233128834355828, |
|
"grad_norm": 0.08862798660993576, |
|
"learning_rate": 1.794531812038901e-06, |
|
"loss": 0.3253, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.4294478527607362, |
|
"grad_norm": 0.12273416668176651, |
|
"learning_rate": 1.756541245073432e-06, |
|
"loss": 0.4336, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.4355828220858895, |
|
"grad_norm": 0.08479359745979309, |
|
"learning_rate": 1.7188712536481233e-06, |
|
"loss": 0.3385, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.441717791411043, |
|
"grad_norm": 0.08785238116979599, |
|
"learning_rate": 1.6815255609729047e-06, |
|
"loss": 0.3856, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.4478527607361964, |
|
"grad_norm": 0.11194564402103424, |
|
"learning_rate": 1.6445078582048158e-06, |
|
"loss": 0.4059, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.4539877300613497, |
|
"grad_norm": 0.10265929996967316, |
|
"learning_rate": 1.6078218040831678e-06, |
|
"loss": 0.5095, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.460122699386503, |
|
"grad_norm": 0.1036793664097786, |
|
"learning_rate": 1.5714710245679348e-06, |
|
"loss": 0.3271, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.4662576687116564, |
|
"grad_norm": 0.16661570966243744, |
|
"learning_rate": 1.5354591124813628e-06, |
|
"loss": 0.4891, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.4723926380368098, |
|
"grad_norm": 0.08812650293111801, |
|
"learning_rate": 1.499789627152874e-06, |
|
"loss": 0.335, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.4723926380368098, |
|
"eval_loss": 0.3808572590351105, |
|
"eval_runtime": 22.6328, |
|
"eval_samples_per_second": 8.13, |
|
"eval_steps_per_second": 1.016, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.478527607361963, |
|
"grad_norm": 0.1271064132452011, |
|
"learning_rate": 1.4644660940672628e-06, |
|
"loss": 0.4646, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.4846625766871164, |
|
"grad_norm": 0.10755060613155365, |
|
"learning_rate": 1.4294920045162514e-06, |
|
"loss": 0.4369, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.49079754601227, |
|
"grad_norm": 0.10873299837112427, |
|
"learning_rate": 1.3948708152534163e-06, |
|
"loss": 0.3521, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.4969325153374233, |
|
"grad_norm": 0.15240046381950378, |
|
"learning_rate": 1.3606059481525296e-06, |
|
"loss": 0.4, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.5030674846625767, |
|
"grad_norm": 0.158911794424057, |
|
"learning_rate": 1.3267007898693552e-06, |
|
"loss": 0.3103, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.50920245398773, |
|
"grad_norm": 0.11236365884542465, |
|
"learning_rate": 1.2931586915069106e-06, |
|
"loss": 0.3715, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.5153374233128836, |
|
"grad_norm": 0.1483219861984253, |
|
"learning_rate": 1.2599829682842618e-06, |
|
"loss": 0.4046, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.521472392638037, |
|
"grad_norm": 0.09588169306516647, |
|
"learning_rate": 1.227176899208849e-06, |
|
"loss": 0.3077, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.5276073619631902, |
|
"grad_norm": 0.10631563514471054, |
|
"learning_rate": 1.194743726752403e-06, |
|
"loss": 0.4418, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.5337423312883436, |
|
"grad_norm": 0.10569003224372864, |
|
"learning_rate": 1.1626866565304594e-06, |
|
"loss": 0.4616, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.539877300613497, |
|
"grad_norm": 0.11209447681903839, |
|
"learning_rate": 1.1310088569855315e-06, |
|
"loss": 0.4556, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.5460122699386503, |
|
"grad_norm": 0.14903852343559265, |
|
"learning_rate": 1.09971345907394e-06, |
|
"loss": 0.3381, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.5521472392638036, |
|
"grad_norm": 0.10578689724206924, |
|
"learning_rate": 1.068803555956367e-06, |
|
"loss": 0.3798, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.558282208588957, |
|
"grad_norm": 0.12829791009426117, |
|
"learning_rate": 1.0382822026921291e-06, |
|
"loss": 0.5217, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.5644171779141103, |
|
"grad_norm": 0.12076660990715027, |
|
"learning_rate": 1.0081524159372246e-06, |
|
"loss": 0.4557, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.5705521472392638, |
|
"grad_norm": 0.11094032227993011, |
|
"learning_rate": 9.784171736461762e-07, |
|
"loss": 0.3339, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.5766871165644172, |
|
"grad_norm": 0.09728420525789261, |
|
"learning_rate": 9.490794147776927e-07, |
|
"loss": 0.3506, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.5828220858895705, |
|
"grad_norm": 0.1307501345872879, |
|
"learning_rate": 9.201420390041965e-07, |
|
"loss": 0.3652, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.588957055214724, |
|
"grad_norm": 0.11151021718978882, |
|
"learning_rate": 8.916079064252164e-07, |
|
"loss": 0.3796, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.5950920245398774, |
|
"grad_norm": 0.091631218791008, |
|
"learning_rate": 8.634798372847148e-07, |
|
"loss": 0.2796, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.6012269938650308, |
|
"grad_norm": 0.10448212176561356, |
|
"learning_rate": 8.357606116923328e-07, |
|
"loss": 0.2626, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 1.607361963190184, |
|
"grad_norm": 0.11236970871686935, |
|
"learning_rate": 8.084529693486171e-07, |
|
"loss": 0.3601, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.6134969325153374, |
|
"grad_norm": 0.11304070055484772, |
|
"learning_rate": 7.815596092742278e-07, |
|
"loss": 0.3641, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 1.6196319018404908, |
|
"grad_norm": 0.4070085883140564, |
|
"learning_rate": 7.550831895431799e-07, |
|
"loss": 0.4316, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.6257668711656441, |
|
"grad_norm": 0.10678707808256149, |
|
"learning_rate": 7.290263270201231e-07, |
|
"loss": 0.4281, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 1.6319018404907975, |
|
"grad_norm": 0.10431778430938721, |
|
"learning_rate": 7.033915971016952e-07, |
|
"loss": 0.3907, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.6380368098159508, |
|
"grad_norm": 0.10240163654088974, |
|
"learning_rate": 6.781815334619812e-07, |
|
"loss": 0.3344, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 1.6441717791411041, |
|
"grad_norm": 0.10372646898031235, |
|
"learning_rate": 6.533986278020876e-07, |
|
"loss": 0.373, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 1.6503067484662577, |
|
"grad_norm": 0.12202878296375275, |
|
"learning_rate": 6.290453296038702e-07, |
|
"loss": 0.4087, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.656441717791411, |
|
"grad_norm": 0.09939797222614288, |
|
"learning_rate": 6.051240458878316e-07, |
|
"loss": 0.3611, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.6625766871165644, |
|
"grad_norm": 0.12352439016103745, |
|
"learning_rate": 5.816371409752203e-07, |
|
"loss": 0.5242, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 1.668711656441718, |
|
"grad_norm": 0.11458908766508102, |
|
"learning_rate": 5.585869362543416e-07, |
|
"loss": 0.4827, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 1.6748466257668713, |
|
"grad_norm": 0.14145226776599884, |
|
"learning_rate": 5.359757099511237e-07, |
|
"loss": 0.3911, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 1.6809815950920246, |
|
"grad_norm": 0.09614825993776321, |
|
"learning_rate": 5.138056969039384e-07, |
|
"loss": 0.3729, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.687116564417178, |
|
"grad_norm": 0.10174663364887238, |
|
"learning_rate": 4.920790883427201e-07, |
|
"loss": 0.4666, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 1.6932515337423313, |
|
"grad_norm": 0.09838556498289108, |
|
"learning_rate": 4.707980316723837e-07, |
|
"loss": 0.4527, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.6993865030674846, |
|
"grad_norm": 0.09283680468797684, |
|
"learning_rate": 4.4996463026058476e-07, |
|
"loss": 0.3644, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 1.705521472392638, |
|
"grad_norm": 0.1122664213180542, |
|
"learning_rate": 4.2958094322982703e-07, |
|
"loss": 0.344, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 1.7116564417177913, |
|
"grad_norm": 0.15890415012836456, |
|
"learning_rate": 4.096489852539426e-07, |
|
"loss": 0.647, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.7177914110429446, |
|
"grad_norm": 0.13360188901424408, |
|
"learning_rate": 3.9017072635896716e-07, |
|
"loss": 0.5654, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 1.7239263803680982, |
|
"grad_norm": 0.10913598537445068, |
|
"learning_rate": 3.7114809172842827e-07, |
|
"loss": 0.3842, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 1.7239263803680982, |
|
"eval_loss": 0.379504919052124, |
|
"eval_runtime": 22.6056, |
|
"eval_samples_per_second": 8.14, |
|
"eval_steps_per_second": 1.017, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 1.7300613496932515, |
|
"grad_norm": 0.09505796432495117, |
|
"learning_rate": 3.5258296151306495e-07, |
|
"loss": 0.2472, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.7361963190184049, |
|
"grad_norm": 0.10274416208267212, |
|
"learning_rate": 3.3447717064499565e-07, |
|
"loss": 0.4665, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 1.7423312883435584, |
|
"grad_norm": 0.11168382316827774, |
|
"learning_rate": 3.168325086563612e-07, |
|
"loss": 0.4719, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.7484662576687118, |
|
"grad_norm": 0.121711865067482, |
|
"learning_rate": 2.996507195024495e-07, |
|
"loss": 0.3143, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 1.7546012269938651, |
|
"grad_norm": 0.0906308963894844, |
|
"learning_rate": 2.8293350138932805e-07, |
|
"loss": 0.4019, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 1.7607361963190185, |
|
"grad_norm": 0.12006894499063492, |
|
"learning_rate": 2.666825066059986e-07, |
|
"loss": 0.3327, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 1.7668711656441718, |
|
"grad_norm": 0.12550675868988037, |
|
"learning_rate": 2.5089934136108665e-07, |
|
"loss": 0.4567, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 1.7730061349693251, |
|
"grad_norm": 0.11931753158569336, |
|
"learning_rate": 2.3558556562409074e-07, |
|
"loss": 0.5051, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.7791411042944785, |
|
"grad_norm": 0.13427190482616425, |
|
"learning_rate": 2.2074269297119588e-07, |
|
"loss": 0.4541, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 1.7852760736196318, |
|
"grad_norm": 0.09217509627342224, |
|
"learning_rate": 2.0637219043567636e-07, |
|
"loss": 0.3891, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 1.7914110429447851, |
|
"grad_norm": 0.11418317258358002, |
|
"learning_rate": 1.9247547836289792e-07, |
|
"loss": 0.3991, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 1.7975460122699385, |
|
"grad_norm": 0.09878183901309967, |
|
"learning_rate": 1.7905393026993513e-07, |
|
"loss": 0.3447, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 1.803680981595092, |
|
"grad_norm": 0.10060778260231018, |
|
"learning_rate": 1.6610887270981425e-07, |
|
"loss": 0.3269, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.8098159509202454, |
|
"grad_norm": 0.09531107544898987, |
|
"learning_rate": 1.5364158514040328e-07, |
|
"loss": 0.3163, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 1.8159509202453987, |
|
"grad_norm": 0.11034092307090759, |
|
"learning_rate": 1.4165329979794972e-07, |
|
"loss": 0.349, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 1.8220858895705523, |
|
"grad_norm": 0.10453791916370392, |
|
"learning_rate": 1.3014520157529244e-07, |
|
"loss": 0.3591, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 1.8282208588957056, |
|
"grad_norm": 0.11782266944646835, |
|
"learning_rate": 1.1911842790474637e-07, |
|
"loss": 0.4248, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 1.834355828220859, |
|
"grad_norm": 0.13830620050430298, |
|
"learning_rate": 1.0857406864568488e-07, |
|
"loss": 0.5286, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.8404907975460123, |
|
"grad_norm": 0.08683877438306808, |
|
"learning_rate": 9.851316597681959e-08, |
|
"loss": 0.2167, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 1.8466257668711656, |
|
"grad_norm": 0.1016291156411171, |
|
"learning_rate": 8.893671429319294e-08, |
|
"loss": 0.3983, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 1.852760736196319, |
|
"grad_norm": 0.1077284961938858, |
|
"learning_rate": 7.984566010789673e-08, |
|
"loss": 0.3073, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 1.8588957055214723, |
|
"grad_norm": 0.08103854209184647, |
|
"learning_rate": 7.124090195851807e-08, |
|
"loss": 0.3116, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 1.8650306748466257, |
|
"grad_norm": 0.11006593704223633, |
|
"learning_rate": 6.31232903183332e-08, |
|
"loss": 0.4318, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.871165644171779, |
|
"grad_norm": 0.09934885799884796, |
|
"learning_rate": 5.549362751224585e-08, |
|
"loss": 0.3649, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 1.8773006134969326, |
|
"grad_norm": 0.11354987323284149, |
|
"learning_rate": 4.8352667637490694e-08, |
|
"loss": 0.4092, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 1.883435582822086, |
|
"grad_norm": 0.10538670420646667, |
|
"learning_rate": 4.170111648909736e-08, |
|
"loss": 0.3204, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 1.8895705521472392, |
|
"grad_norm": 0.11542137712240219, |
|
"learning_rate": 3.553963149013295e-08, |
|
"loss": 0.4189, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 1.8957055214723928, |
|
"grad_norm": 0.11087686568498611, |
|
"learning_rate": 2.986882162672344e-08, |
|
"loss": 0.4718, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.9018404907975461, |
|
"grad_norm": 0.11411860585212708, |
|
"learning_rate": 2.4689247387862934e-08, |
|
"loss": 0.3454, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 1.9079754601226995, |
|
"grad_norm": 0.08472780138254166, |
|
"learning_rate": 2.000142071001632e-08, |
|
"loss": 0.2456, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 1.9141104294478528, |
|
"grad_norm": 0.10252390056848526, |
|
"learning_rate": 1.580580492652084e-08, |
|
"loss": 0.4087, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 1.9202453987730062, |
|
"grad_norm": 0.12601801753044128, |
|
"learning_rate": 1.2102814721791645e-08, |
|
"loss": 0.4071, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 1.9263803680981595, |
|
"grad_norm": 0.08735030889511108, |
|
"learning_rate": 8.8928160903351e-09, |
|
"loss": 0.3744, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.9325153374233128, |
|
"grad_norm": 0.10152660310268402, |
|
"learning_rate": 6.176126300573848e-09, |
|
"loss": 0.4182, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 1.9386503067484662, |
|
"grad_norm": 0.11482395976781845, |
|
"learning_rate": 3.953013863490784e-09, |
|
"loss": 0.3801, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 1.9447852760736195, |
|
"grad_norm": 0.09311243146657944, |
|
"learning_rate": 2.223698506088612e-09, |
|
"loss": 0.2806, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 1.9509202453987728, |
|
"grad_norm": 0.1334279328584671, |
|
"learning_rate": 9.883511496722176e-10, |
|
"loss": 0.5107, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 1.9570552147239264, |
|
"grad_norm": 0.09790550917387009, |
|
"learning_rate": 2.470938929571842e-10, |
|
"loss": 0.2982, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.9631901840490797, |
|
"grad_norm": 0.11607832461595535, |
|
"learning_rate": 0.0, |
|
"loss": 0.3971, |
|
"step": 326 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 326, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 82, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.8033259439603057e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|