|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 6.746626686656672, |
|
"eval_steps": 500, |
|
"global_step": 4500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.005997001499250375, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 2.553191489361702e-05, |
|
"loss": 1.3604, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.01199400299850075, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 5.106382978723404e-05, |
|
"loss": 1.395, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.017991004497751123, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 7.659574468085105e-05, |
|
"loss": 1.2836, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0239880059970015, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00010212765957446807, |
|
"loss": 1.2673, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.029985007496251874, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 0.0001276595744680851, |
|
"loss": 1.1774, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.035982008995502246, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 0.0001531914893617021, |
|
"loss": 1.1907, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.041979010494752625, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 0.00017872340425531912, |
|
"loss": 1.1519, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.047976011994003, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 0.00020425531914893615, |
|
"loss": 1.1234, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.053973013493253376, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 0.00022978723404255317, |
|
"loss": 1.1176, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.05997001499250375, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.0002553191489361702, |
|
"loss": 1.1349, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.06596701649175413, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 0.0002808510638297872, |
|
"loss": 1.0681, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.07196401799100449, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 0.0002999999653501698, |
|
"loss": 1.035, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.07796101949025487, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 0.00029999913375504725, |
|
"loss": 1.0308, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.08395802098950525, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 0.0002999971933724042, |
|
"loss": 1.004, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.08995502248875563, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 0.00029999414421658403, |
|
"loss": 0.974, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.095952023988006, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 0.0002999899863101258, |
|
"loss": 0.9826, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.10194902548725637, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 0.0002999847196837647, |
|
"loss": 0.9721, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.10794602698650675, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 0.00029997834437643146, |
|
"loss": 0.9758, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.11394302848575712, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00029997086043525195, |
|
"loss": 0.9551, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.1199400299850075, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 0.00029996226791554725, |
|
"loss": 0.9514, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.12593703148425786, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.00029995256688083294, |
|
"loss": 0.971, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.13193403298350825, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0002999417574028187, |
|
"loss": 0.9505, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.13793103448275862, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.00029992983956140764, |
|
"loss": 0.9274, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.14392803598200898, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.00029991681344469605, |
|
"loss": 0.908, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.14992503748125938, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.0002999026791489724, |
|
"loss": 0.8855, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.15592203898050974, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.0002998874367787168, |
|
"loss": 0.9112, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.1619190404797601, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0002998710864466004, |
|
"loss": 0.8654, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.1679160419790105, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.00029985362827348406, |
|
"loss": 0.8824, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.17391304347826086, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.00029983506238841787, |
|
"loss": 0.8495, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.17991004497751126, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.0002998153889286402, |
|
"loss": 0.8686, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.18590704647676162, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.00029979460803957635, |
|
"loss": 0.8391, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.191904047976012, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00029977271987483787, |
|
"loss": 0.8058, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.19790104947526238, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.0002997497245962213, |
|
"loss": 0.792, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.20389805097451275, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0002997256223737066, |
|
"loss": 0.8186, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.2098950524737631, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.00029970041338545653, |
|
"loss": 0.7942, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2158920539730135, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0002996740978178149, |
|
"loss": 0.7686, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.22188905547226387, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.00029964667586530533, |
|
"loss": 0.7888, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.22788605697151423, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00029961814773062973, |
|
"loss": 0.7711, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.23388305847076463, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.000299588513624667, |
|
"loss": 0.7903, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.239880059970015, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.00029955777376647124, |
|
"loss": 0.7998, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.24587706146926536, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.00029952592838327014, |
|
"loss": 0.7503, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.2518740629685157, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.0002994929777104636, |
|
"loss": 0.7894, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.25787106446776614, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0002994589219916216, |
|
"loss": 0.7525, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.2638680659670165, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0002994237614784826, |
|
"loss": 0.7787, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.2698650674662669, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00029938749643095176, |
|
"loss": 0.7606, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.27586206896551724, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.0002993501271170988, |
|
"loss": 0.777, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.2818590704647676, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0002993116538131562, |
|
"loss": 0.7596, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.28785607196401797, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.000299272076803517, |
|
"loss": 0.7424, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.2938530734632684, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.000299231396380733, |
|
"loss": 0.7254, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.29985007496251875, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.0002991896128455121, |
|
"loss": 0.7353, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3058470764617691, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0002991467265067165, |
|
"loss": 0.7678, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.3118440779610195, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00029910273768136026, |
|
"loss": 0.7635, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.31784107946026985, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.0002990576466946072, |
|
"loss": 0.6941, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.3238380809595202, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.0002990114538797678, |
|
"loss": 0.7591, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.32983508245877063, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.0002989641595782977, |
|
"loss": 0.7628, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.335832083958021, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.0002989157641397943, |
|
"loss": 0.7194, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.34182908545727136, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.00029886626792199476, |
|
"loss": 0.7298, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.34782608695652173, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00029881567129077315, |
|
"loss": 0.7616, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.3538230884557721, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0002987639746201377, |
|
"loss": 0.7108, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.3598200899550225, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00029871117829222816, |
|
"loss": 0.6867, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.3658170914542729, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00029865728269731274, |
|
"loss": 0.7453, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.37181409295352325, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0002986022882337856, |
|
"loss": 0.6907, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.3778110944527736, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.0002985461953081635, |
|
"loss": 0.7118, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.383808095952024, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0002984890043350831, |
|
"loss": 0.6886, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.38980509745127434, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0002984307157372978, |
|
"loss": 0.7285, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.39580209895052476, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.0002983713299456745, |
|
"loss": 0.6622, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.4017991004497751, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00029831084739919057, |
|
"loss": 0.6718, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.4077961019490255, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0002982492685449306, |
|
"loss": 0.6862, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.41379310344827586, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0002981865938380829, |
|
"loss": 0.7048, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.4197901049475262, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0002981228237419365, |
|
"loss": 0.7153, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.4257871064467766, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0002980579587278771, |
|
"loss": 0.7046, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.431784107946027, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00029799199927538455, |
|
"loss": 0.687, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.43778110944527737, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0002979249458720284, |
|
"loss": 0.658, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.44377811094452774, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00029785679901346454, |
|
"loss": 0.6552, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.4497751124437781, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00029778755920343186, |
|
"loss": 0.6414, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.45577211394302847, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00029771722695374835, |
|
"loss": 0.6696, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.4617691154422789, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00029764580278430694, |
|
"loss": 0.6113, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.46776611694152925, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00029757328722307234, |
|
"loss": 0.6773, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.4737631184407796, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.0002974996808060766, |
|
"loss": 0.6691, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.47976011994003, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0002974249840774154, |
|
"loss": 0.6465, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.48575712143928035, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0002973491975892439, |
|
"loss": 0.6464, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.4917541229385307, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.0002972723219017727, |
|
"loss": 0.6439, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.49775112443778113, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0002971943575832639, |
|
"loss": 0.6623, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.5037481259370314, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0002971153052100265, |
|
"loss": 0.6793, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.5097451274362819, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0002970351653664125, |
|
"loss": 0.6144, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.5157421289355323, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.00029695393864481224, |
|
"loss": 0.5845, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.5217391304347826, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.0002968716256456505, |
|
"loss": 0.6055, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.527736131934033, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.00029678822697738153, |
|
"loss": 0.6746, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.5337331334332833, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.000296703743256485, |
|
"loss": 0.6383, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.5397301349325337, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0002966181751074611, |
|
"loss": 0.6634, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.545727136431784, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.00029653152316282615, |
|
"loss": 0.6992, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.5517241379310345, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.00029644378806310774, |
|
"loss": 0.6535, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.5577211394302849, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.0002963549704568403, |
|
"loss": 0.6474, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.5637181409295352, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.0002962650710005599, |
|
"loss": 0.6175, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.5697151424287856, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00029617409035879967, |
|
"loss": 0.7, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.5757121439280359, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0002960820292040848, |
|
"loss": 0.6635, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.5817091454272864, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00029598888821692776, |
|
"loss": 0.6896, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.5877061469265368, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00029589466808582277, |
|
"loss": 0.6824, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.5937031484257871, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00029579936950724134, |
|
"loss": 0.598, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.5997001499250375, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0002957029931856267, |
|
"loss": 0.6196, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6056971514242878, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.0002956055398333886, |
|
"loss": 0.682, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.6116941529235382, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00029550701017089844, |
|
"loss": 0.6669, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.6176911544227887, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00029540740492648343, |
|
"loss": 0.6382, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.623688155922039, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0002953067248364214, |
|
"loss": 0.6614, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.6296851574212894, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0002952049706449356, |
|
"loss": 0.7027, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6356821589205397, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00029510214310418887, |
|
"loss": 0.6834, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.6416791604197901, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.00029499824297427827, |
|
"loss": 0.6876, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.6476761619190404, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.00029489327102322926, |
|
"loss": 0.6574, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.6536731634182908, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0002947872280269904, |
|
"loss": 0.6296, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.6596701649175413, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.000294680114769427, |
|
"loss": 0.5848, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.6656671664167916, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0002945719320423161, |
|
"loss": 0.6623, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.671664167916042, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00029446268064534, |
|
"loss": 0.643, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.6776611694152923, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0002943523613860805, |
|
"loss": 0.5834, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.6836581709145427, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0002942409750800133, |
|
"loss": 0.6101, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.6896551724137931, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00029412852255050124, |
|
"loss": 0.6145, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.6956521739130435, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.000294015004628789, |
|
"loss": 0.5801, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.7016491754122939, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0002939004221539964, |
|
"loss": 0.6429, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.7076461769115442, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0002937847759731125, |
|
"loss": 0.6359, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.7136431784107946, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0002936680669409891, |
|
"loss": 0.6487, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.719640179910045, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00029355029592033474, |
|
"loss": 0.6244, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.7256371814092953, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.000293431463781708, |
|
"loss": 0.6023, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.7316341829085458, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0002933115714035112, |
|
"loss": 0.6105, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.7376311844077961, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00029319061967198395, |
|
"loss": 0.6146, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.7436281859070465, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0002930686094811966, |
|
"loss": 0.5759, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.7496251874062968, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.0002929455417330435, |
|
"loss": 0.6215, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.7556221889055472, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.0002928214173372364, |
|
"loss": 0.5969, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.7616191904047976, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00029269623721129797, |
|
"loss": 0.6657, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.767616191904048, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00029257000228055446, |
|
"loss": 0.5872, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.7736131934032984, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.00029244271347812946, |
|
"loss": 0.5736, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.7796101949025487, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00029231437174493654, |
|
"loss": 0.6027, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.7856071964017991, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00029218497802967273, |
|
"loss": 0.6296, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.7916041979010495, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0002920545332888111, |
|
"loss": 0.5929, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.7976011994002998, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.00029192303848659377, |
|
"loss": 0.636, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.8035982008995503, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0002917904945950252, |
|
"loss": 0.6177, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.8095952023988006, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.00029165690259386423, |
|
"loss": 0.6226, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.815592203898051, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0002915222634706177, |
|
"loss": 0.6155, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.8215892053973014, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00029138657822053247, |
|
"loss": 0.6098, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.8275862068965517, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.00029124984784658844, |
|
"loss": 0.5997, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.8335832083958021, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.000291112073359491, |
|
"loss": 0.6189, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.8395802098950524, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.00029097325577766357, |
|
"loss": 0.5949, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.8455772113943029, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00029083339612724006, |
|
"loss": 0.6277, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.8515742128935532, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 0.00029069249544205744, |
|
"loss": 0.5951, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.8575712143928036, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00029055055476364777, |
|
"loss": 0.624, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.863568215892054, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00029040757514123077, |
|
"loss": 0.6465, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.8695652173913043, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.00029026355763170613, |
|
"loss": 0.6299, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.8755622188905547, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.00029011850329964536, |
|
"loss": 0.6217, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.881559220389805, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0002899724132172842, |
|
"loss": 0.6225, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.8875562218890555, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00028982528846451466, |
|
"loss": 0.5979, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.8935532233883059, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.000289677130128877, |
|
"loss": 0.6094, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.8995502248875562, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00028952793930555156, |
|
"loss": 0.6134, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.9055472263868066, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00028937771709735085, |
|
"loss": 0.6125, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.9115442278860569, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.00028922646461471146, |
|
"loss": 0.6229, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.9175412293853074, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00028907418297568544, |
|
"loss": 0.6114, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.9235382308845578, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00028892087330593263, |
|
"loss": 0.6052, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.9295352323838081, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0002887665367387119, |
|
"loss": 0.5971, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.9355322338830585, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00028861117441487277, |
|
"loss": 0.563, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.9415292353823088, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.00028845478748284743, |
|
"loss": 0.5906, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.9475262368815592, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.0002882973770986416, |
|
"loss": 0.5841, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.9535232383808095, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00028813894442582656, |
|
"loss": 0.6249, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.95952023988006, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00028797949063553014, |
|
"loss": 0.5862, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.9655172413793104, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.00028781901690642833, |
|
"loss": 0.5564, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.9715142428785607, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0002876575244247364, |
|
"loss": 0.6202, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.9775112443778111, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00028749501438420034, |
|
"loss": 0.6844, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.9835082458770614, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00028733148798608767, |
|
"loss": 0.6133, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.9895052473763118, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.0002871669464391789, |
|
"loss": 0.5914, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.9955022488755623, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.0002870013909597586, |
|
"loss": 0.5781, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 1.0014992503748126, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.000286834822771606, |
|
"loss": 0.5998, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 1.0074962518740629, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00028666724310598657, |
|
"loss": 0.5466, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 1.0134932533733134, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0002864986532016423, |
|
"loss": 0.4778, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 1.0194902548725637, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.00028632905430478294, |
|
"loss": 0.4739, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.025487256371814, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0002861584476690767, |
|
"loss": 0.51, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 1.0314842578710646, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.0002859868345556409, |
|
"loss": 0.5517, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 1.0374812593703149, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00028581421623303274, |
|
"loss": 0.5065, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 1.0434782608695652, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.0002856405939772398, |
|
"loss": 0.5512, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 1.0494752623688155, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.00028546596907167094, |
|
"loss": 0.5293, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.055472263868066, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.0002852903428071462, |
|
"loss": 0.5048, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 1.0614692653673163, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 0.00028511371648188785, |
|
"loss": 0.5045, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 1.0674662668665666, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0002849360914015106, |
|
"loss": 0.486, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 1.0734632683658172, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.0002847574688790118, |
|
"loss": 0.5105, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 1.0794602698650675, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.00028457785023476193, |
|
"loss": 0.5176, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.0854572713643178, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00028439723679649467, |
|
"loss": 0.4982, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 1.0914542728635683, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.00028421562989929726, |
|
"loss": 0.5004, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 1.0974512743628186, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.0002840330308856006, |
|
"loss": 0.5341, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 1.103448275862069, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.0002838494411051692, |
|
"loss": 0.5225, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 1.1094452773613193, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00028366486191509115, |
|
"loss": 0.5249, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.1154422788605698, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00028347929467976843, |
|
"loss": 0.4945, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 1.12143928035982, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00028329274077090657, |
|
"loss": 0.4733, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 1.1274362818590704, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.0002831052015675044, |
|
"loss": 0.5443, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 1.133433283358321, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.0002829166784558442, |
|
"loss": 0.5287, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 1.1394302848575713, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0002827271728294812, |
|
"loss": 0.4699, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.1454272863568216, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00028253668608923323, |
|
"loss": 0.5091, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 1.1514242878560719, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0002823452196431706, |
|
"loss": 0.4919, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 1.1574212893553224, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.0002821527749066055, |
|
"loss": 0.5538, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 1.1634182908545727, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.00028195935330208163, |
|
"loss": 0.5304, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 1.169415292353823, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.0002817649562593637, |
|
"loss": 0.5099, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.1754122938530736, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.0002815695852154267, |
|
"loss": 0.5286, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 1.1814092953523239, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.00028137324161444554, |
|
"loss": 0.5302, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 1.1874062968515742, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00028117592690778413, |
|
"loss": 0.489, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 1.1934032983508245, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0002809776425539848, |
|
"loss": 0.4831, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 1.199400299850075, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.00028077839001875744, |
|
"loss": 0.5265, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.2053973013493253, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0002805781707749688, |
|
"loss": 0.4821, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 1.2113943028485756, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0002803769863026313, |
|
"loss": 0.4793, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 1.2173913043478262, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00028017483808889245, |
|
"loss": 0.5088, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 1.2233883058470765, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0002799717276280237, |
|
"loss": 0.5152, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 1.2293853073463268, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.00027976765642140935, |
|
"loss": 0.595, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.235382308845577, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.00027956262597753545, |
|
"loss": 0.536, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 1.2413793103448276, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.0002793566378119787, |
|
"loss": 0.5102, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 1.247376311844078, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00027914969344739545, |
|
"loss": 0.5385, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 1.2533733133433285, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.0002789417944135098, |
|
"loss": 0.5201, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 1.2593703148425788, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0002787329422471032, |
|
"loss": 0.5126, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.265367316341829, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0002785231384920023, |
|
"loss": 0.4304, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 1.2713643178410794, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.000278312384699068, |
|
"loss": 0.5052, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 1.2773613193403297, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0002781006824261838, |
|
"loss": 0.5248, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 1.2833583208395802, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.0002778880332382443, |
|
"loss": 0.5219, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 1.2893553223388305, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.0002776744387071437, |
|
"loss": 0.5177, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.295352323838081, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00027745990041176406, |
|
"loss": 0.5015, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 1.3013493253373314, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00027724441993796386, |
|
"loss": 0.5045, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 1.3073463268365817, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.000277027998878566, |
|
"loss": 0.5399, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 1.313343328335832, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0002768106388333462, |
|
"loss": 0.4533, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 1.3193403298350825, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.0002765923414090211, |
|
"loss": 0.4942, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.3253373313343328, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00027637310821923637, |
|
"loss": 0.4559, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 1.3313343328335832, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00027615294088455494, |
|
"loss": 0.4603, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 1.3373313343328337, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.00027593184103244474, |
|
"loss": 0.5045, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 1.343328335832084, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.000275709810297267, |
|
"loss": 0.5183, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 1.3493253373313343, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00027548685032026393, |
|
"loss": 0.5529, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.3553223388305846, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0002752629627495466, |
|
"loss": 0.5169, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 1.3613193403298351, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0002750381492400829, |
|
"loss": 0.5303, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 1.3673163418290855, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0002748124114536852, |
|
"loss": 0.5258, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 1.3733133433283358, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0002745857510589979, |
|
"loss": 0.5352, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 1.3793103448275863, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00027435816973148564, |
|
"loss": 0.5202, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.3853073463268366, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.0002741296691534204, |
|
"loss": 0.4443, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 1.391304347826087, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0002739002510138691, |
|
"loss": 0.4865, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 1.3973013493253372, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.00027366991700868127, |
|
"loss": 0.5044, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 1.4032983508245878, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00027343866884047674, |
|
"loss": 0.4876, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 1.409295352323838, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0002732065082186324, |
|
"loss": 0.5361, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.4152923538230884, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00027297343685927036, |
|
"loss": 0.4938, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 1.421289355322339, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0002727394564852445, |
|
"loss": 0.5098, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 1.4272863568215892, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0002725045688261283, |
|
"loss": 0.5342, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 1.4332833583208395, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00027226877561820187, |
|
"loss": 0.48, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 1.4392803598200898, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0002720320786044391, |
|
"loss": 0.4997, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.4452773613193404, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.0002717944795344946, |
|
"loss": 0.5382, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 1.4512743628185907, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00027155598016469115, |
|
"loss": 0.5305, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 1.4572713643178412, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00027131658225800637, |
|
"loss": 0.5172, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 1.4632683658170915, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.00027107628758405995, |
|
"loss": 0.5318, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 1.4692653673163418, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0002708350979191004, |
|
"loss": 0.5143, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.4752623688155921, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00027059301504599187, |
|
"loss": 0.4811, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 1.4812593703148424, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0002703500407542012, |
|
"loss": 0.4862, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 1.487256371814093, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00027010617683978456, |
|
"loss": 0.5058, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 1.4932533733133433, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.00026986142510537406, |
|
"loss": 0.4691, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 1.4992503748125938, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0002696157873601646, |
|
"loss": 0.5224, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.5052473763118441, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00026936926541990046, |
|
"loss": 0.5588, |
|
"step": 1004 |
|
}, |
|
{ |
|
"epoch": 1.5112443778110944, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00026912186110686186, |
|
"loss": 0.486, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 1.5172413793103448, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0002688735762498515, |
|
"loss": 0.5366, |
|
"step": 1012 |
|
}, |
|
{ |
|
"epoch": 1.523238380809595, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00026862441268418085, |
|
"loss": 0.5101, |
|
"step": 1016 |
|
}, |
|
{ |
|
"epoch": 1.5292353823088456, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.000268374372251657, |
|
"loss": 0.5154, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.535232383808096, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00026812345680056867, |
|
"loss": 0.5155, |
|
"step": 1024 |
|
}, |
|
{ |
|
"epoch": 1.5412293853073464, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00026787166818567263, |
|
"loss": 0.5368, |
|
"step": 1028 |
|
}, |
|
{ |
|
"epoch": 1.5472263868065967, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00026761900826818033, |
|
"loss": 0.537, |
|
"step": 1032 |
|
}, |
|
{ |
|
"epoch": 1.553223388305847, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.0002673654789157435, |
|
"loss": 0.5323, |
|
"step": 1036 |
|
}, |
|
{ |
|
"epoch": 1.5592203898050974, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0002671110820024408, |
|
"loss": 0.5142, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.5652173913043477, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00026685581940876396, |
|
"loss": 0.5343, |
|
"step": 1044 |
|
}, |
|
{ |
|
"epoch": 1.5712143928035982, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00026659969302160377, |
|
"loss": 0.5076, |
|
"step": 1048 |
|
}, |
|
{ |
|
"epoch": 1.5772113943028487, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00026634270473423606, |
|
"loss": 0.499, |
|
"step": 1052 |
|
}, |
|
{ |
|
"epoch": 1.583208395802099, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0002660848564463079, |
|
"loss": 0.485, |
|
"step": 1056 |
|
}, |
|
{ |
|
"epoch": 1.5892053973013494, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00026582615006382333, |
|
"loss": 0.5186, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.5952023988005997, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00026556658749912944, |
|
"loss": 0.5256, |
|
"step": 1064 |
|
}, |
|
{ |
|
"epoch": 1.60119940029985, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00026530617067090225, |
|
"loss": 0.5223, |
|
"step": 1068 |
|
}, |
|
{ |
|
"epoch": 1.6071964017991005, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0002650449015041324, |
|
"loss": 0.509, |
|
"step": 1072 |
|
}, |
|
{ |
|
"epoch": 1.6131934032983508, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.0002647827819301109, |
|
"loss": 0.5089, |
|
"step": 1076 |
|
}, |
|
{ |
|
"epoch": 1.6191904047976013, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0002645198138864151, |
|
"loss": 0.4925, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.6251874062968517, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0002642559993168942, |
|
"loss": 0.5303, |
|
"step": 1084 |
|
}, |
|
{ |
|
"epoch": 1.631184407796102, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.0002639913401716546, |
|
"loss": 0.5077, |
|
"step": 1088 |
|
}, |
|
{ |
|
"epoch": 1.6371814092953523, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0002637258384070461, |
|
"loss": 0.5554, |
|
"step": 1092 |
|
}, |
|
{ |
|
"epoch": 1.6431784107946026, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.0002634594959856471, |
|
"loss": 0.4447, |
|
"step": 1096 |
|
}, |
|
{ |
|
"epoch": 1.6491754122938531, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00026319231487624984, |
|
"loss": 0.4951, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.6551724137931034, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0002629242970538463, |
|
"loss": 0.5053, |
|
"step": 1104 |
|
}, |
|
{ |
|
"epoch": 1.661169415292354, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.0002626554444996133, |
|
"loss": 0.4702, |
|
"step": 1108 |
|
}, |
|
{ |
|
"epoch": 1.6671664167916043, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0002623857592008982, |
|
"loss": 0.477, |
|
"step": 1112 |
|
}, |
|
{ |
|
"epoch": 1.6731634182908546, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.00026211524315120365, |
|
"loss": 0.4858, |
|
"step": 1116 |
|
}, |
|
{ |
|
"epoch": 1.6791604197901049, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0002618438983501734, |
|
"loss": 0.4938, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.6851574212893552, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00026157172680357717, |
|
"loss": 0.4687, |
|
"step": 1124 |
|
}, |
|
{ |
|
"epoch": 1.6911544227886057, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0002612987305232961, |
|
"loss": 0.4976, |
|
"step": 1128 |
|
}, |
|
{ |
|
"epoch": 1.697151424287856, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0002610249115273075, |
|
"loss": 0.5319, |
|
"step": 1132 |
|
}, |
|
{ |
|
"epoch": 1.7031484257871066, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.0002607502718396705, |
|
"loss": 0.5139, |
|
"step": 1136 |
|
}, |
|
{ |
|
"epoch": 1.7091454272863569, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0002604748134905103, |
|
"loss": 0.4979, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.7151424287856072, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00026019853851600404, |
|
"loss": 0.5016, |
|
"step": 1144 |
|
}, |
|
{ |
|
"epoch": 1.7211394302848575, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.00025992144895836504, |
|
"loss": 0.4872, |
|
"step": 1148 |
|
}, |
|
{ |
|
"epoch": 1.7271364317841078, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.0002596435468658282, |
|
"loss": 0.5164, |
|
"step": 1152 |
|
}, |
|
{ |
|
"epoch": 1.7331334332833583, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00025936483429263437, |
|
"loss": 0.4904, |
|
"step": 1156 |
|
}, |
|
{ |
|
"epoch": 1.7391304347826086, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00025908531329901574, |
|
"loss": 0.5198, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.7451274362818592, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0002588049859511801, |
|
"loss": 0.5574, |
|
"step": 1164 |
|
}, |
|
{ |
|
"epoch": 1.7511244377811095, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00025852385432129587, |
|
"loss": 0.5086, |
|
"step": 1168 |
|
}, |
|
{ |
|
"epoch": 1.7571214392803598, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0002582419204874767, |
|
"loss": 0.5387, |
|
"step": 1172 |
|
}, |
|
{ |
|
"epoch": 1.76311844077961, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.000257959186533766, |
|
"loss": 0.5478, |
|
"step": 1176 |
|
}, |
|
{ |
|
"epoch": 1.7691154422788604, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0002576756545501218, |
|
"loss": 0.4899, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.775112443778111, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0002573913266324009, |
|
"loss": 0.4824, |
|
"step": 1184 |
|
}, |
|
{ |
|
"epoch": 1.7811094452773615, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00025710620488234384, |
|
"loss": 0.5113, |
|
"step": 1188 |
|
}, |
|
{ |
|
"epoch": 1.7871064467766118, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0002568202914075591, |
|
"loss": 0.5235, |
|
"step": 1192 |
|
}, |
|
{ |
|
"epoch": 1.793103448275862, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0002565335883215074, |
|
"loss": 0.5289, |
|
"step": 1196 |
|
}, |
|
{ |
|
"epoch": 1.7991004497751124, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.00025624609774348633, |
|
"loss": 0.5018, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.8050974512743627, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.0002559578217986147, |
|
"loss": 0.5799, |
|
"step": 1204 |
|
}, |
|
{ |
|
"epoch": 1.811094452773613, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00025566876261781657, |
|
"loss": 0.5077, |
|
"step": 1208 |
|
}, |
|
{ |
|
"epoch": 1.8170914542728636, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00025537892233780564, |
|
"loss": 0.561, |
|
"step": 1212 |
|
}, |
|
{ |
|
"epoch": 1.823088455772114, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0002550883031010696, |
|
"loss": 0.4929, |
|
"step": 1216 |
|
}, |
|
{ |
|
"epoch": 1.8290854572713644, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00025479690705585393, |
|
"loss": 0.5342, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.8350824587706147, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.0002545047363561466, |
|
"loss": 0.5061, |
|
"step": 1224 |
|
}, |
|
{ |
|
"epoch": 1.841079460269865, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00025421179316166147, |
|
"loss": 0.5237, |
|
"step": 1228 |
|
}, |
|
{ |
|
"epoch": 1.8470764617691153, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00025391807963782276, |
|
"loss": 0.4967, |
|
"step": 1232 |
|
}, |
|
{ |
|
"epoch": 1.8530734632683659, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.000253623597955749, |
|
"loss": 0.5285, |
|
"step": 1236 |
|
}, |
|
{ |
|
"epoch": 1.8590704647676162, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.0002533283502922368, |
|
"loss": 0.4559, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.8650674662668667, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.000253032338829745, |
|
"loss": 0.4359, |
|
"step": 1244 |
|
}, |
|
{ |
|
"epoch": 1.871064467766117, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00025273556575637824, |
|
"loss": 0.4478, |
|
"step": 1248 |
|
}, |
|
{ |
|
"epoch": 1.8770614692653673, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00025243803326587113, |
|
"loss": 0.4902, |
|
"step": 1252 |
|
}, |
|
{ |
|
"epoch": 1.8830584707646176, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0002521397435575717, |
|
"loss": 0.4718, |
|
"step": 1256 |
|
}, |
|
{ |
|
"epoch": 1.889055472263868, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0002518406988364255, |
|
"loss": 0.4678, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.8950524737631185, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0002515409013129589, |
|
"loss": 0.4982, |
|
"step": 1264 |
|
}, |
|
{ |
|
"epoch": 1.9010494752623688, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0002512403532032632, |
|
"loss": 0.5777, |
|
"step": 1268 |
|
}, |
|
{ |
|
"epoch": 1.9070464767616193, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0002509390567289776, |
|
"loss": 0.4771, |
|
"step": 1272 |
|
}, |
|
{ |
|
"epoch": 1.9130434782608696, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0002506370141172737, |
|
"loss": 0.4811, |
|
"step": 1276 |
|
}, |
|
{ |
|
"epoch": 1.91904047976012, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00025033422760083814, |
|
"loss": 0.4656, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.9250374812593702, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00025003069941785647, |
|
"loss": 0.5288, |
|
"step": 1284 |
|
}, |
|
{ |
|
"epoch": 1.9310344827586206, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00024972643181199694, |
|
"loss": 0.4915, |
|
"step": 1288 |
|
}, |
|
{ |
|
"epoch": 1.937031484257871, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00024942142703239317, |
|
"loss": 0.4914, |
|
"step": 1292 |
|
}, |
|
{ |
|
"epoch": 1.9430284857571214, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0002491156873336282, |
|
"loss": 0.5417, |
|
"step": 1296 |
|
}, |
|
{ |
|
"epoch": 1.949025487256372, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0002488092149757176, |
|
"loss": 0.5118, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.9550224887556222, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.00024850201222409245, |
|
"loss": 0.4948, |
|
"step": 1304 |
|
}, |
|
{ |
|
"epoch": 1.9610194902548725, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00024819408134958324, |
|
"loss": 0.5132, |
|
"step": 1308 |
|
}, |
|
{ |
|
"epoch": 1.9670164917541229, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00024788542462840236, |
|
"loss": 0.4743, |
|
"step": 1312 |
|
}, |
|
{ |
|
"epoch": 1.9730134932533732, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00024757604434212785, |
|
"loss": 0.5555, |
|
"step": 1316 |
|
}, |
|
{ |
|
"epoch": 1.9790104947526237, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00024726594277768625, |
|
"loss": 0.496, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.9850074962518742, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.0002469551222273358, |
|
"loss": 0.4981, |
|
"step": 1324 |
|
}, |
|
{ |
|
"epoch": 1.9910044977511245, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0002466435849886494, |
|
"loss": 0.5064, |
|
"step": 1328 |
|
}, |
|
{ |
|
"epoch": 1.9970014992503748, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.0002463313333644976, |
|
"loss": 0.4856, |
|
"step": 1332 |
|
}, |
|
{ |
|
"epoch": 2.002998500749625, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 0.0002460183696630319, |
|
"loss": 0.4316, |
|
"step": 1336 |
|
}, |
|
{ |
|
"epoch": 2.0089955022488755, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0002457046961976672, |
|
"loss": 0.4442, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.0149925037481258, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0002453903152870651, |
|
"loss": 0.3908, |
|
"step": 1344 |
|
}, |
|
{ |
|
"epoch": 2.0209895052473765, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00024507522925511655, |
|
"loss": 0.3686, |
|
"step": 1348 |
|
}, |
|
{ |
|
"epoch": 2.026986506746627, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00024475944043092474, |
|
"loss": 0.3864, |
|
"step": 1352 |
|
}, |
|
{ |
|
"epoch": 2.032983508245877, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00024444295114878787, |
|
"loss": 0.3697, |
|
"step": 1356 |
|
}, |
|
{ |
|
"epoch": 2.0389805097451275, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00024412576374818184, |
|
"loss": 0.3737, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.0449775112443778, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00024380788057374315, |
|
"loss": 0.4196, |
|
"step": 1364 |
|
}, |
|
{ |
|
"epoch": 2.050974512743628, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.00024348930397525125, |
|
"loss": 0.3743, |
|
"step": 1368 |
|
}, |
|
{ |
|
"epoch": 2.0569715142428784, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00024317003630761156, |
|
"loss": 0.3874, |
|
"step": 1372 |
|
}, |
|
{ |
|
"epoch": 2.062968515742129, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00024285007993083763, |
|
"loss": 0.3758, |
|
"step": 1376 |
|
}, |
|
{ |
|
"epoch": 2.0689655172413794, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00024252943721003416, |
|
"loss": 0.4214, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.0749625187406298, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00024220811051537902, |
|
"loss": 0.4145, |
|
"step": 1384 |
|
}, |
|
{ |
|
"epoch": 2.08095952023988, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00024188610222210624, |
|
"loss": 0.3586, |
|
"step": 1388 |
|
}, |
|
{ |
|
"epoch": 2.0869565217391304, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00024156341471048801, |
|
"loss": 0.4311, |
|
"step": 1392 |
|
}, |
|
{ |
|
"epoch": 2.0929535232383807, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00024124005036581738, |
|
"loss": 0.3881, |
|
"step": 1396 |
|
}, |
|
{ |
|
"epoch": 2.098950524737631, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.0002409160115783905, |
|
"loss": 0.4552, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.1049475262368817, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00024059130074348888, |
|
"loss": 0.4048, |
|
"step": 1404 |
|
}, |
|
{ |
|
"epoch": 2.110944527736132, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0002402659202613619, |
|
"loss": 0.3692, |
|
"step": 1408 |
|
}, |
|
{ |
|
"epoch": 2.1169415292353824, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00023993987253720896, |
|
"loss": 0.418, |
|
"step": 1412 |
|
}, |
|
{ |
|
"epoch": 2.1229385307346327, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00023961315998116158, |
|
"loss": 0.4435, |
|
"step": 1416 |
|
}, |
|
{ |
|
"epoch": 2.128935532233883, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.0002392857850082657, |
|
"loss": 0.4075, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.1349325337331333, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00023895775003846388, |
|
"loss": 0.4119, |
|
"step": 1424 |
|
}, |
|
{ |
|
"epoch": 2.1409295352323836, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00023862905749657743, |
|
"loss": 0.3709, |
|
"step": 1428 |
|
}, |
|
{ |
|
"epoch": 2.1469265367316344, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0002382997098122882, |
|
"loss": 0.379, |
|
"step": 1432 |
|
}, |
|
{ |
|
"epoch": 2.1529235382308847, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0002379697094201209, |
|
"loss": 0.3731, |
|
"step": 1436 |
|
}, |
|
{ |
|
"epoch": 2.158920539730135, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00023763905875942516, |
|
"loss": 0.3762, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.1649175412293853, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0002373077602743572, |
|
"loss": 0.4093, |
|
"step": 1444 |
|
}, |
|
{ |
|
"epoch": 2.1709145427286356, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00023697581641386208, |
|
"loss": 0.3765, |
|
"step": 1448 |
|
}, |
|
{ |
|
"epoch": 2.176911544227886, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00023664322963165527, |
|
"loss": 0.4056, |
|
"step": 1452 |
|
}, |
|
{ |
|
"epoch": 2.1829085457271367, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00023631000238620483, |
|
"loss": 0.4, |
|
"step": 1456 |
|
}, |
|
{ |
|
"epoch": 2.188905547226387, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00023597613714071308, |
|
"loss": 0.4249, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.1949025487256373, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00023564163636309837, |
|
"loss": 0.3966, |
|
"step": 1464 |
|
}, |
|
{ |
|
"epoch": 2.2008995502248876, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00023530650252597693, |
|
"loss": 0.3794, |
|
"step": 1468 |
|
}, |
|
{ |
|
"epoch": 2.206896551724138, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00023497073810664442, |
|
"loss": 0.4001, |
|
"step": 1472 |
|
}, |
|
{ |
|
"epoch": 2.212893553223388, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00023463434558705792, |
|
"loss": 0.4304, |
|
"step": 1476 |
|
}, |
|
{ |
|
"epoch": 2.2188905547226385, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00023429732745381733, |
|
"loss": 0.3824, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.224887556221889, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00023395968619814692, |
|
"loss": 0.3911, |
|
"step": 1484 |
|
}, |
|
{ |
|
"epoch": 2.2308845577211396, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00023362142431587727, |
|
"loss": 0.3931, |
|
"step": 1488 |
|
}, |
|
{ |
|
"epoch": 2.23688155922039, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0002332825443074265, |
|
"loss": 0.4401, |
|
"step": 1492 |
|
}, |
|
{ |
|
"epoch": 2.24287856071964, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00023294304867778183, |
|
"loss": 0.3967, |
|
"step": 1496 |
|
}, |
|
{ |
|
"epoch": 2.2488755622188905, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00023260293993648126, |
|
"loss": 0.4004, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.254872563718141, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00023226222059759486, |
|
"loss": 0.3928, |
|
"step": 1504 |
|
}, |
|
{ |
|
"epoch": 2.260869565217391, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00023192089317970616, |
|
"loss": 0.3957, |
|
"step": 1508 |
|
}, |
|
{ |
|
"epoch": 2.266866566716642, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00023157896020589353, |
|
"loss": 0.4173, |
|
"step": 1512 |
|
}, |
|
{ |
|
"epoch": 2.272863568215892, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00023123642420371177, |
|
"loss": 0.4401, |
|
"step": 1516 |
|
}, |
|
{ |
|
"epoch": 2.2788605697151425, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0002308932877051731, |
|
"loss": 0.4012, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.284857571214393, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.0002305495532467286, |
|
"loss": 0.4244, |
|
"step": 1524 |
|
}, |
|
{ |
|
"epoch": 2.290854572713643, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00023020522336924943, |
|
"loss": 0.4158, |
|
"step": 1528 |
|
}, |
|
{ |
|
"epoch": 2.2968515742128934, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00022986030061800816, |
|
"loss": 0.4394, |
|
"step": 1532 |
|
}, |
|
{ |
|
"epoch": 2.3028485757121437, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00022951478754265977, |
|
"loss": 0.3715, |
|
"step": 1536 |
|
}, |
|
{ |
|
"epoch": 2.3088455772113945, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00022916868669722293, |
|
"loss": 0.3814, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.314842578710645, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00022882200064006097, |
|
"loss": 0.3815, |
|
"step": 1544 |
|
}, |
|
{ |
|
"epoch": 2.320839580209895, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00022847473193386334, |
|
"loss": 0.3833, |
|
"step": 1548 |
|
}, |
|
{ |
|
"epoch": 2.3268365817091454, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00022812688314562615, |
|
"loss": 0.3981, |
|
"step": 1552 |
|
}, |
|
{ |
|
"epoch": 2.3328335832083957, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0002277784568466336, |
|
"loss": 0.4014, |
|
"step": 1556 |
|
}, |
|
{ |
|
"epoch": 2.338830584707646, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0002274294556124387, |
|
"loss": 0.413, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.344827586206897, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00022707988202284453, |
|
"loss": 0.4232, |
|
"step": 1564 |
|
}, |
|
{ |
|
"epoch": 2.350824587706147, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00022672973866188484, |
|
"loss": 0.4016, |
|
"step": 1568 |
|
}, |
|
{ |
|
"epoch": 2.3568215892053974, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0002263790281178052, |
|
"loss": 0.4247, |
|
"step": 1572 |
|
}, |
|
{ |
|
"epoch": 2.3628185907046477, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00022602775298304374, |
|
"loss": 0.393, |
|
"step": 1576 |
|
}, |
|
{ |
|
"epoch": 2.368815592203898, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00022567591585421202, |
|
"loss": 0.3931, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.3748125937031483, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00022532351933207584, |
|
"loss": 0.3926, |
|
"step": 1584 |
|
}, |
|
{ |
|
"epoch": 2.3808095952023987, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00022497056602153602, |
|
"loss": 0.3971, |
|
"step": 1588 |
|
}, |
|
{ |
|
"epoch": 2.386806596701649, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00022461705853160912, |
|
"loss": 0.4126, |
|
"step": 1592 |
|
}, |
|
{ |
|
"epoch": 2.3928035982008997, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00022426299947540825, |
|
"loss": 0.3858, |
|
"step": 1596 |
|
}, |
|
{ |
|
"epoch": 2.39880059970015, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00022390839147012353, |
|
"loss": 0.4325, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.4047976011994003, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00022355323713700302, |
|
"loss": 0.3314, |
|
"step": 1604 |
|
}, |
|
{ |
|
"epoch": 2.4107946026986506, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00022319753910133314, |
|
"loss": 0.4244, |
|
"step": 1608 |
|
}, |
|
{ |
|
"epoch": 2.416791604197901, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0002228412999924194, |
|
"loss": 0.4494, |
|
"step": 1612 |
|
}, |
|
{ |
|
"epoch": 2.4227886056971513, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00022248452244356677, |
|
"loss": 0.4027, |
|
"step": 1616 |
|
}, |
|
{ |
|
"epoch": 2.428785607196402, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00022212720909206056, |
|
"loss": 0.4296, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 2.4347826086956523, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00022176936257914647, |
|
"loss": 0.377, |
|
"step": 1624 |
|
}, |
|
{ |
|
"epoch": 2.4407796101949026, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.0002214109855500115, |
|
"loss": 0.4368, |
|
"step": 1628 |
|
}, |
|
{ |
|
"epoch": 2.446776611694153, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00022105208065376417, |
|
"loss": 0.4073, |
|
"step": 1632 |
|
}, |
|
{ |
|
"epoch": 2.4527736131934033, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0002206926505434148, |
|
"loss": 0.4051, |
|
"step": 1636 |
|
}, |
|
{ |
|
"epoch": 2.4587706146926536, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00022033269787585634, |
|
"loss": 0.4175, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 2.464767616191904, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00021997222531184427, |
|
"loss": 0.4093, |
|
"step": 1644 |
|
}, |
|
{ |
|
"epoch": 2.470764617691154, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0002196112355159772, |
|
"loss": 0.4557, |
|
"step": 1648 |
|
}, |
|
{ |
|
"epoch": 2.476761619190405, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.000219249731156677, |
|
"loss": 0.3951, |
|
"step": 1652 |
|
}, |
|
{ |
|
"epoch": 2.4827586206896552, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00021888771490616936, |
|
"loss": 0.4413, |
|
"step": 1656 |
|
}, |
|
{ |
|
"epoch": 2.4887556221889056, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.0002185251894404637, |
|
"loss": 0.3882, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 2.494752623688156, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00021816215743933359, |
|
"loss": 0.4303, |
|
"step": 1664 |
|
}, |
|
{ |
|
"epoch": 2.500749625187406, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0002177986215862968, |
|
"loss": 0.3868, |
|
"step": 1668 |
|
}, |
|
{ |
|
"epoch": 2.506746626686657, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0002174345845685957, |
|
"loss": 0.4185, |
|
"step": 1672 |
|
}, |
|
{ |
|
"epoch": 2.5127436281859072, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00021707004907717717, |
|
"loss": 0.4411, |
|
"step": 1676 |
|
}, |
|
{ |
|
"epoch": 2.5187406296851576, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00021670501780667284, |
|
"loss": 0.449, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 2.524737631184408, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00021633949345537895, |
|
"loss": 0.4258, |
|
"step": 1684 |
|
}, |
|
{ |
|
"epoch": 2.530734632683658, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0002159734787252368, |
|
"loss": 0.4221, |
|
"step": 1688 |
|
}, |
|
{ |
|
"epoch": 2.5367316341829085, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00021560697632181243, |
|
"loss": 0.3824, |
|
"step": 1692 |
|
}, |
|
{ |
|
"epoch": 2.542728635682159, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00021523998895427675, |
|
"loss": 0.4164, |
|
"step": 1696 |
|
}, |
|
{ |
|
"epoch": 2.548725637181409, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00021487251933538547, |
|
"loss": 0.3595, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.5547226386806594, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00021450457018145925, |
|
"loss": 0.3977, |
|
"step": 1704 |
|
}, |
|
{ |
|
"epoch": 2.56071964017991, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00021413614421236313, |
|
"loss": 0.4427, |
|
"step": 1708 |
|
}, |
|
{ |
|
"epoch": 2.5667166416791605, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.00021376724415148718, |
|
"loss": 0.3741, |
|
"step": 1712 |
|
}, |
|
{ |
|
"epoch": 2.572713643178411, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00021339787272572555, |
|
"loss": 0.3822, |
|
"step": 1716 |
|
}, |
|
{ |
|
"epoch": 2.578710644677661, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00021302803266545696, |
|
"loss": 0.4308, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 2.5847076461769114, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00021265772670452402, |
|
"loss": 0.3995, |
|
"step": 1724 |
|
}, |
|
{ |
|
"epoch": 2.590704647676162, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.0002122869575802135, |
|
"loss": 0.3994, |
|
"step": 1728 |
|
}, |
|
{ |
|
"epoch": 2.5967016491754125, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.00021191572803323571, |
|
"loss": 0.3803, |
|
"step": 1732 |
|
}, |
|
{ |
|
"epoch": 2.6026986506746628, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00021154404080770447, |
|
"loss": 0.4211, |
|
"step": 1736 |
|
}, |
|
{ |
|
"epoch": 2.608695652173913, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00021117189865111664, |
|
"loss": 0.4121, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 2.6146926536731634, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00021079930431433197, |
|
"loss": 0.3982, |
|
"step": 1744 |
|
}, |
|
{ |
|
"epoch": 2.6206896551724137, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00021042626055155266, |
|
"loss": 0.4339, |
|
"step": 1748 |
|
}, |
|
{ |
|
"epoch": 2.626686656671664, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00021005277012030324, |
|
"loss": 0.4151, |
|
"step": 1752 |
|
}, |
|
{ |
|
"epoch": 2.6326836581709143, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00020967883578140966, |
|
"loss": 0.3805, |
|
"step": 1756 |
|
}, |
|
{ |
|
"epoch": 2.638680659670165, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0002093044602989796, |
|
"loss": 0.4125, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 2.6446776611694154, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0002089296464403813, |
|
"loss": 0.4266, |
|
"step": 1764 |
|
}, |
|
{ |
|
"epoch": 2.6506746626686657, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00020855439697622374, |
|
"loss": 0.4417, |
|
"step": 1768 |
|
}, |
|
{ |
|
"epoch": 2.656671664167916, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00020817871468033566, |
|
"loss": 0.4165, |
|
"step": 1772 |
|
}, |
|
{ |
|
"epoch": 2.6626686656671663, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00020780260232974545, |
|
"loss": 0.4082, |
|
"step": 1776 |
|
}, |
|
{ |
|
"epoch": 2.668665667166417, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00020742606270466026, |
|
"loss": 0.4115, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 2.6746626686656674, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0002070490985884459, |
|
"loss": 0.3905, |
|
"step": 1784 |
|
}, |
|
{ |
|
"epoch": 2.6806596701649177, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00020667171276760567, |
|
"loss": 0.3935, |
|
"step": 1788 |
|
}, |
|
{ |
|
"epoch": 2.686656671664168, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00020629390803176046, |
|
"loss": 0.4366, |
|
"step": 1792 |
|
}, |
|
{ |
|
"epoch": 2.6926536731634183, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.0002059156871736274, |
|
"loss": 0.4184, |
|
"step": 1796 |
|
}, |
|
{ |
|
"epoch": 2.6986506746626686, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0002055370529889999, |
|
"loss": 0.395, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.704647676161919, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00020515800827672638, |
|
"loss": 0.3656, |
|
"step": 1804 |
|
}, |
|
{ |
|
"epoch": 2.7106446776611692, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00020477855583869015, |
|
"loss": 0.4209, |
|
"step": 1808 |
|
}, |
|
{ |
|
"epoch": 2.7166416791604195, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.0002043986984797881, |
|
"loss": 0.4143, |
|
"step": 1812 |
|
}, |
|
{ |
|
"epoch": 2.7226386806596703, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00020401843900791055, |
|
"loss": 0.4105, |
|
"step": 1816 |
|
}, |
|
{ |
|
"epoch": 2.7286356821589206, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00020363778023392, |
|
"loss": 0.4174, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 2.734632683658171, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00020325672497163087, |
|
"loss": 0.4063, |
|
"step": 1824 |
|
}, |
|
{ |
|
"epoch": 2.7406296851574212, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00020287527603778804, |
|
"loss": 0.4233, |
|
"step": 1828 |
|
}, |
|
{ |
|
"epoch": 2.7466266866566715, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0002024934362520467, |
|
"loss": 0.4659, |
|
"step": 1832 |
|
}, |
|
{ |
|
"epoch": 2.7526236881559223, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.000202111208436951, |
|
"loss": 0.4075, |
|
"step": 1836 |
|
}, |
|
{ |
|
"epoch": 2.7586206896551726, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00020172859541791352, |
|
"loss": 0.4011, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 2.764617691154423, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00020134560002319418, |
|
"loss": 0.4006, |
|
"step": 1844 |
|
}, |
|
{ |
|
"epoch": 2.770614692653673, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00020096222508387938, |
|
"loss": 0.4012, |
|
"step": 1848 |
|
}, |
|
{ |
|
"epoch": 2.7766116941529235, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00020057847343386124, |
|
"loss": 0.4657, |
|
"step": 1852 |
|
}, |
|
{ |
|
"epoch": 2.782608695652174, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0002001943479098163, |
|
"loss": 0.3579, |
|
"step": 1856 |
|
}, |
|
{ |
|
"epoch": 2.788605697151424, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.0001998098513511849, |
|
"loss": 0.4232, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 2.7946026986506745, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0001994249866001501, |
|
"loss": 0.4228, |
|
"step": 1864 |
|
}, |
|
{ |
|
"epoch": 2.8005997001499248, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00019903975650161648, |
|
"loss": 0.4214, |
|
"step": 1868 |
|
}, |
|
{ |
|
"epoch": 2.8065967016491755, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00019865416390318935, |
|
"loss": 0.4308, |
|
"step": 1872 |
|
}, |
|
{ |
|
"epoch": 2.812593703148426, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0001982682116551536, |
|
"loss": 0.3585, |
|
"step": 1876 |
|
}, |
|
{ |
|
"epoch": 2.818590704647676, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00019788190261045248, |
|
"loss": 0.4224, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 2.8245877061469264, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.000197495239624667, |
|
"loss": 0.4206, |
|
"step": 1884 |
|
}, |
|
{ |
|
"epoch": 2.8305847076461768, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00019710822555599417, |
|
"loss": 0.4052, |
|
"step": 1888 |
|
}, |
|
{ |
|
"epoch": 2.8365817091454275, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00019672086326522634, |
|
"loss": 0.399, |
|
"step": 1892 |
|
}, |
|
{ |
|
"epoch": 2.842578710644678, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0001963331556157298, |
|
"loss": 0.387, |
|
"step": 1896 |
|
}, |
|
{ |
|
"epoch": 2.848575712143928, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0001959451054734239, |
|
"loss": 0.3893, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.8545727136431784, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00019555671570675953, |
|
"loss": 0.3967, |
|
"step": 1904 |
|
}, |
|
{ |
|
"epoch": 2.8605697151424287, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00019516798918669807, |
|
"loss": 0.4241, |
|
"step": 1908 |
|
}, |
|
{ |
|
"epoch": 2.866566716641679, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00019477892878669021, |
|
"loss": 0.4166, |
|
"step": 1912 |
|
}, |
|
{ |
|
"epoch": 2.8725637181409294, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00019438953738265479, |
|
"loss": 0.3727, |
|
"step": 1916 |
|
}, |
|
{ |
|
"epoch": 2.8785607196401797, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.0001939998178529571, |
|
"loss": 0.3908, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 2.8845577211394304, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00019360977307838833, |
|
"loss": 0.3843, |
|
"step": 1924 |
|
}, |
|
{ |
|
"epoch": 2.8905547226386807, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.0001932194059421435, |
|
"loss": 0.4424, |
|
"step": 1928 |
|
}, |
|
{ |
|
"epoch": 2.896551724137931, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.0001928287193298007, |
|
"loss": 0.3926, |
|
"step": 1932 |
|
}, |
|
{ |
|
"epoch": 2.9025487256371814, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00019243771612929955, |
|
"loss": 0.4391, |
|
"step": 1936 |
|
}, |
|
{ |
|
"epoch": 2.9085457271364317, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0001920463992309199, |
|
"loss": 0.4248, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 2.9145427286356824, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00019165477152726035, |
|
"loss": 0.4236, |
|
"step": 1944 |
|
}, |
|
{ |
|
"epoch": 2.9205397301349327, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0001912628359132171, |
|
"loss": 0.4503, |
|
"step": 1948 |
|
}, |
|
{ |
|
"epoch": 2.926536731634183, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00019087059528596223, |
|
"loss": 0.4249, |
|
"step": 1952 |
|
}, |
|
{ |
|
"epoch": 2.9325337331334334, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00019047805254492265, |
|
"loss": 0.4145, |
|
"step": 1956 |
|
}, |
|
{ |
|
"epoch": 2.9385307346326837, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.0001900852105917584, |
|
"loss": 0.3811, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 2.944527736131934, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00018969207233034127, |
|
"loss": 0.3733, |
|
"step": 1964 |
|
}, |
|
{ |
|
"epoch": 2.9505247376311843, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0001892986406667333, |
|
"loss": 0.4685, |
|
"step": 1968 |
|
}, |
|
{ |
|
"epoch": 2.9565217391304346, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0001889049185091655, |
|
"loss": 0.4259, |
|
"step": 1972 |
|
}, |
|
{ |
|
"epoch": 2.962518740629685, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00018851090876801605, |
|
"loss": 0.4425, |
|
"step": 1976 |
|
}, |
|
{ |
|
"epoch": 2.9685157421289357, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00018811661435578903, |
|
"loss": 0.3932, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 2.974512743628186, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00018772203818709273, |
|
"loss": 0.4028, |
|
"step": 1984 |
|
}, |
|
{ |
|
"epoch": 2.9805097451274363, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0001873271831786183, |
|
"loss": 0.4215, |
|
"step": 1988 |
|
}, |
|
{ |
|
"epoch": 2.9865067466266866, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00018693205224911777, |
|
"loss": 0.4076, |
|
"step": 1992 |
|
}, |
|
{ |
|
"epoch": 2.992503748125937, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00018653664831938318, |
|
"loss": 0.4261, |
|
"step": 1996 |
|
}, |
|
{ |
|
"epoch": 2.9985007496251876, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00018614097431222425, |
|
"loss": 0.4096, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.004497751124438, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00018574503315244722, |
|
"loss": 0.3218, |
|
"step": 2004 |
|
}, |
|
{ |
|
"epoch": 3.0104947526236883, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.0001853488277668331, |
|
"loss": 0.2858, |
|
"step": 2008 |
|
}, |
|
{ |
|
"epoch": 3.0164917541229386, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.0001849523610841161, |
|
"loss": 0.33, |
|
"step": 2012 |
|
}, |
|
{ |
|
"epoch": 3.022488755622189, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00018455563603496185, |
|
"loss": 0.2721, |
|
"step": 2016 |
|
}, |
|
{ |
|
"epoch": 3.028485757121439, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0001841586555519458, |
|
"loss": 0.3042, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 3.0344827586206895, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00018376142256953167, |
|
"loss": 0.3035, |
|
"step": 2024 |
|
}, |
|
{ |
|
"epoch": 3.04047976011994, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00018336394002404954, |
|
"loss": 0.2887, |
|
"step": 2028 |
|
}, |
|
{ |
|
"epoch": 3.0464767616191906, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00018296621085367424, |
|
"loss": 0.2429, |
|
"step": 2032 |
|
}, |
|
{ |
|
"epoch": 3.052473763118441, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00018256823799840376, |
|
"loss": 0.295, |
|
"step": 2036 |
|
}, |
|
{ |
|
"epoch": 3.058470764617691, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00018217002440003733, |
|
"loss": 0.2938, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 3.0644677661169415, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00018177157300215365, |
|
"loss": 0.2914, |
|
"step": 2044 |
|
}, |
|
{ |
|
"epoch": 3.070464767616192, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00018137288675008938, |
|
"loss": 0.33, |
|
"step": 2048 |
|
}, |
|
{ |
|
"epoch": 3.076461769115442, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00018097396859091715, |
|
"loss": 0.2802, |
|
"step": 2052 |
|
}, |
|
{ |
|
"epoch": 3.082458770614693, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00018057482147342379, |
|
"loss": 0.2736, |
|
"step": 2056 |
|
}, |
|
{ |
|
"epoch": 3.088455772113943, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0001801754483480887, |
|
"loss": 0.3102, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 3.0944527736131935, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0001797758521670617, |
|
"loss": 0.3081, |
|
"step": 2064 |
|
}, |
|
{ |
|
"epoch": 3.100449775112444, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00017937603588414177, |
|
"loss": 0.3164, |
|
"step": 2068 |
|
}, |
|
{ |
|
"epoch": 3.106446776611694, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00017897600245475454, |
|
"loss": 0.3019, |
|
"step": 2072 |
|
}, |
|
{ |
|
"epoch": 3.1124437781109444, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0001785757548359309, |
|
"loss": 0.2853, |
|
"step": 2076 |
|
}, |
|
{ |
|
"epoch": 3.1184407796101947, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00017817529598628513, |
|
"loss": 0.2779, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 3.1244377811094455, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00017777462886599276, |
|
"loss": 0.3017, |
|
"step": 2084 |
|
}, |
|
{ |
|
"epoch": 3.130434782608696, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00017737375643676895, |
|
"loss": 0.3012, |
|
"step": 2088 |
|
}, |
|
{ |
|
"epoch": 3.136431784107946, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0001769726816618464, |
|
"loss": 0.2831, |
|
"step": 2092 |
|
}, |
|
{ |
|
"epoch": 3.1424287856071964, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00017657140750595366, |
|
"loss": 0.2922, |
|
"step": 2096 |
|
}, |
|
{ |
|
"epoch": 3.1484257871064467, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00017616993693529302, |
|
"loss": 0.3342, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.154422788605697, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00017576827291751864, |
|
"loss": 0.2842, |
|
"step": 2104 |
|
}, |
|
{ |
|
"epoch": 3.1604197901049473, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00017536641842171472, |
|
"loss": 0.3514, |
|
"step": 2108 |
|
}, |
|
{ |
|
"epoch": 3.166416791604198, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.0001749643764183734, |
|
"loss": 0.3121, |
|
"step": 2112 |
|
}, |
|
{ |
|
"epoch": 3.1724137931034484, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00017456214987937282, |
|
"loss": 0.3121, |
|
"step": 2116 |
|
}, |
|
{ |
|
"epoch": 3.1784107946026987, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00017415974177795534, |
|
"loss": 0.3049, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 3.184407796101949, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0001737571550887053, |
|
"loss": 0.293, |
|
"step": 2124 |
|
}, |
|
{ |
|
"epoch": 3.1904047976011993, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00017335439278752727, |
|
"loss": 0.3108, |
|
"step": 2128 |
|
}, |
|
{ |
|
"epoch": 3.1964017991004496, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00017295145785162377, |
|
"loss": 0.2983, |
|
"step": 2132 |
|
}, |
|
{ |
|
"epoch": 3.2023988005997, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00017254835325947364, |
|
"loss": 0.3318, |
|
"step": 2136 |
|
}, |
|
{ |
|
"epoch": 3.2083958020989507, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00017214508199080953, |
|
"loss": 0.3164, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 3.214392803598201, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00017174164702659647, |
|
"loss": 0.3074, |
|
"step": 2144 |
|
}, |
|
{ |
|
"epoch": 3.2203898050974513, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00017133805134900926, |
|
"loss": 0.2884, |
|
"step": 2148 |
|
}, |
|
{ |
|
"epoch": 3.2263868065967016, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00017093429794141094, |
|
"loss": 0.3038, |
|
"step": 2152 |
|
}, |
|
{ |
|
"epoch": 3.232383808095952, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.00017053038978833018, |
|
"loss": 0.3217, |
|
"step": 2156 |
|
}, |
|
{ |
|
"epoch": 3.2383808095952022, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.0001701263298754398, |
|
"loss": 0.3117, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 3.244377811094453, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00016972212118953426, |
|
"loss": 0.2811, |
|
"step": 2164 |
|
}, |
|
{ |
|
"epoch": 3.2503748125937033, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00016931776671850785, |
|
"loss": 0.2991, |
|
"step": 2168 |
|
}, |
|
{ |
|
"epoch": 3.2563718140929536, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00016891326945133237, |
|
"loss": 0.3019, |
|
"step": 2172 |
|
}, |
|
{ |
|
"epoch": 3.262368815592204, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00016850863237803527, |
|
"loss": 0.3305, |
|
"step": 2176 |
|
}, |
|
{ |
|
"epoch": 3.2683658170914542, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.0001681038584896774, |
|
"loss": 0.3355, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 3.2743628185907045, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0001676989507783309, |
|
"loss": 0.3139, |
|
"step": 2184 |
|
}, |
|
{ |
|
"epoch": 3.280359820089955, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00016729391223705727, |
|
"loss": 0.2821, |
|
"step": 2188 |
|
}, |
|
{ |
|
"epoch": 3.286356821589205, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0001668887458598849, |
|
"loss": 0.2992, |
|
"step": 2192 |
|
}, |
|
{ |
|
"epoch": 3.292353823088456, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00016648345464178723, |
|
"loss": 0.3048, |
|
"step": 2196 |
|
}, |
|
{ |
|
"epoch": 3.2983508245877062, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00016607804157866066, |
|
"loss": 0.3044, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 3.3043478260869565, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00016567250966730197, |
|
"loss": 0.298, |
|
"step": 2204 |
|
}, |
|
{ |
|
"epoch": 3.310344827586207, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00016526686190538678, |
|
"loss": 0.2494, |
|
"step": 2208 |
|
}, |
|
{ |
|
"epoch": 3.316341829085457, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00016486110129144675, |
|
"loss": 0.2682, |
|
"step": 2212 |
|
}, |
|
{ |
|
"epoch": 3.3223388305847075, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00016445523082484802, |
|
"loss": 0.3378, |
|
"step": 2216 |
|
}, |
|
{ |
|
"epoch": 3.3283358320839582, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00016404925350576858, |
|
"loss": 0.271, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 3.3343328335832085, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.00016364317233517637, |
|
"loss": 0.326, |
|
"step": 2224 |
|
}, |
|
{ |
|
"epoch": 3.340329835082459, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00016323699031480686, |
|
"loss": 0.3056, |
|
"step": 2228 |
|
}, |
|
{ |
|
"epoch": 3.346326836581709, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.00016283071044714123, |
|
"loss": 0.3266, |
|
"step": 2232 |
|
}, |
|
{ |
|
"epoch": 3.3523238380809595, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0001624243357353837, |
|
"loss": 0.3001, |
|
"step": 2236 |
|
}, |
|
{ |
|
"epoch": 3.3583208395802098, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.0001620178691834397, |
|
"loss": 0.3, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 3.36431784107946, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00016161131379589355, |
|
"loss": 0.3292, |
|
"step": 2244 |
|
}, |
|
{ |
|
"epoch": 3.370314842578711, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00016120467257798614, |
|
"loss": 0.3232, |
|
"step": 2248 |
|
}, |
|
{ |
|
"epoch": 3.376311844077961, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.000160797948535593, |
|
"loss": 0.3401, |
|
"step": 2252 |
|
}, |
|
{ |
|
"epoch": 3.3823088455772115, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.00016039114467520163, |
|
"loss": 0.2963, |
|
"step": 2256 |
|
}, |
|
{ |
|
"epoch": 3.3883058470764618, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00015998426400388977, |
|
"loss": 0.3083, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 3.394302848575712, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00015957730952930284, |
|
"loss": 0.3113, |
|
"step": 2264 |
|
}, |
|
{ |
|
"epoch": 3.4002998500749624, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00015917028425963185, |
|
"loss": 0.3149, |
|
"step": 2268 |
|
}, |
|
{ |
|
"epoch": 3.406296851574213, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0001587631912035911, |
|
"loss": 0.315, |
|
"step": 2272 |
|
}, |
|
{ |
|
"epoch": 3.4122938530734634, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00015835603337039592, |
|
"loss": 0.2763, |
|
"step": 2276 |
|
}, |
|
{ |
|
"epoch": 3.4182908545727138, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00015794881376974054, |
|
"loss": 0.3223, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 3.424287856071964, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00015754153541177584, |
|
"loss": 0.2963, |
|
"step": 2284 |
|
}, |
|
{ |
|
"epoch": 3.4302848575712144, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00015713420130708682, |
|
"loss": 0.3092, |
|
"step": 2288 |
|
}, |
|
{ |
|
"epoch": 3.4362818590704647, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0001567268144666708, |
|
"loss": 0.2752, |
|
"step": 2292 |
|
}, |
|
{ |
|
"epoch": 3.442278860569715, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00015631937790191468, |
|
"loss": 0.2993, |
|
"step": 2296 |
|
}, |
|
{ |
|
"epoch": 3.4482758620689653, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00015591189462457313, |
|
"loss": 0.3338, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 3.454272863568216, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.000155504367646746, |
|
"loss": 0.322, |
|
"step": 2304 |
|
}, |
|
{ |
|
"epoch": 3.4602698650674664, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00015509679998085618, |
|
"loss": 0.3167, |
|
"step": 2308 |
|
}, |
|
{ |
|
"epoch": 3.4662668665667167, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00015468919463962737, |
|
"loss": 0.3199, |
|
"step": 2312 |
|
}, |
|
{ |
|
"epoch": 3.472263868065967, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00015428155463606178, |
|
"loss": 0.312, |
|
"step": 2316 |
|
}, |
|
{ |
|
"epoch": 3.4782608695652173, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00015387388298341767, |
|
"loss": 0.3105, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 3.4842578710644676, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00015346618269518753, |
|
"loss": 0.3061, |
|
"step": 2324 |
|
}, |
|
{ |
|
"epoch": 3.4902548725637184, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0001530584567850753, |
|
"loss": 0.3315, |
|
"step": 2328 |
|
}, |
|
{ |
|
"epoch": 3.4962518740629687, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00015265070826697442, |
|
"loss": 0.2991, |
|
"step": 2332 |
|
}, |
|
{ |
|
"epoch": 3.502248875562219, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0001522429401549454, |
|
"loss": 0.3368, |
|
"step": 2336 |
|
}, |
|
{ |
|
"epoch": 3.5082458770614693, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.00015183515546319368, |
|
"loss": 0.3422, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 3.5142428785607196, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.000151427357206047, |
|
"loss": 0.3261, |
|
"step": 2344 |
|
}, |
|
{ |
|
"epoch": 3.52023988005997, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00015101954839793377, |
|
"loss": 0.3051, |
|
"step": 2348 |
|
}, |
|
{ |
|
"epoch": 3.52623688155922, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00015061173205336003, |
|
"loss": 0.3019, |
|
"step": 2352 |
|
}, |
|
{ |
|
"epoch": 3.5322338830584705, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00015020391118688778, |
|
"loss": 0.3085, |
|
"step": 2356 |
|
}, |
|
{ |
|
"epoch": 3.5382308845577213, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00014979608881311222, |
|
"loss": 0.323, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 3.5442278860569716, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00014938826794663997, |
|
"loss": 0.3158, |
|
"step": 2364 |
|
}, |
|
{ |
|
"epoch": 3.550224887556222, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.0001489804516020662, |
|
"loss": 0.3029, |
|
"step": 2368 |
|
}, |
|
{ |
|
"epoch": 3.556221889055472, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.000148572642793953, |
|
"loss": 0.3353, |
|
"step": 2372 |
|
}, |
|
{ |
|
"epoch": 3.5622188905547225, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00014816484453680635, |
|
"loss": 0.3086, |
|
"step": 2376 |
|
}, |
|
{ |
|
"epoch": 3.5682158920539733, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.00014775705984505455, |
|
"loss": 0.3599, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 3.5742128935532236, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.00014734929173302556, |
|
"loss": 0.2845, |
|
"step": 2384 |
|
}, |
|
{ |
|
"epoch": 3.580209895052474, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00014694154321492466, |
|
"loss": 0.3228, |
|
"step": 2388 |
|
}, |
|
{ |
|
"epoch": 3.586206896551724, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00014653381730481247, |
|
"loss": 0.347, |
|
"step": 2392 |
|
}, |
|
{ |
|
"epoch": 3.5922038980509745, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.0001461261170165823, |
|
"loss": 0.3353, |
|
"step": 2396 |
|
}, |
|
{ |
|
"epoch": 3.598200899550225, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.00014571844536393828, |
|
"loss": 0.3423, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 3.604197901049475, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00014531080536037263, |
|
"loss": 0.3268, |
|
"step": 2404 |
|
}, |
|
{ |
|
"epoch": 3.6101949025487254, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00014490320001914384, |
|
"loss": 0.3282, |
|
"step": 2408 |
|
}, |
|
{ |
|
"epoch": 3.6161919040479757, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00014449563235325403, |
|
"loss": 0.3233, |
|
"step": 2412 |
|
}, |
|
{ |
|
"epoch": 3.6221889055472265, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.0001440881053754269, |
|
"loss": 0.277, |
|
"step": 2416 |
|
}, |
|
{ |
|
"epoch": 3.628185907046477, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00014368062209808532, |
|
"loss": 0.34, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 3.634182908545727, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.0001432731855333292, |
|
"loss": 0.3308, |
|
"step": 2424 |
|
}, |
|
{ |
|
"epoch": 3.6401799100449774, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00014286579869291315, |
|
"loss": 0.3361, |
|
"step": 2428 |
|
}, |
|
{ |
|
"epoch": 3.6461769115442277, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00014245846458822416, |
|
"loss": 0.2908, |
|
"step": 2432 |
|
}, |
|
{ |
|
"epoch": 3.6521739130434785, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00014205118623025943, |
|
"loss": 0.3122, |
|
"step": 2436 |
|
}, |
|
{ |
|
"epoch": 3.658170914542729, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00014164396662960408, |
|
"loss": 0.2552, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 3.664167916041979, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00014123680879640893, |
|
"loss": 0.3299, |
|
"step": 2444 |
|
}, |
|
{ |
|
"epoch": 3.6701649175412294, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00014082971574036815, |
|
"loss": 0.3271, |
|
"step": 2448 |
|
}, |
|
{ |
|
"epoch": 3.6761619190404797, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00014042269047069718, |
|
"loss": 0.2984, |
|
"step": 2452 |
|
}, |
|
{ |
|
"epoch": 3.68215892053973, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00014001573599611026, |
|
"loss": 0.2954, |
|
"step": 2456 |
|
}, |
|
{ |
|
"epoch": 3.6881559220389803, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00013960885532479834, |
|
"loss": 0.3048, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 3.6941529235382307, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00013920205146440698, |
|
"loss": 0.3506, |
|
"step": 2464 |
|
}, |
|
{ |
|
"epoch": 3.7001499250374814, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00013879532742201378, |
|
"loss": 0.3517, |
|
"step": 2468 |
|
}, |
|
{ |
|
"epoch": 3.7061469265367317, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.00013838868620410645, |
|
"loss": 0.288, |
|
"step": 2472 |
|
}, |
|
{ |
|
"epoch": 3.712143928035982, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00013798213081656026, |
|
"loss": 0.2907, |
|
"step": 2476 |
|
}, |
|
{ |
|
"epoch": 3.7181409295352323, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.0001375756642646163, |
|
"loss": 0.329, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 3.7241379310344827, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00013716928955285874, |
|
"loss": 0.3179, |
|
"step": 2484 |
|
}, |
|
{ |
|
"epoch": 3.7301349325337334, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0001367630096851931, |
|
"loss": 0.287, |
|
"step": 2488 |
|
}, |
|
{ |
|
"epoch": 3.7361319340329837, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00013635682766482363, |
|
"loss": 0.2958, |
|
"step": 2492 |
|
}, |
|
{ |
|
"epoch": 3.742128935532234, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00013595074649423144, |
|
"loss": 0.3526, |
|
"step": 2496 |
|
}, |
|
{ |
|
"epoch": 3.7481259370314843, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00013554476917515199, |
|
"loss": 0.2866, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 3.7541229385307346, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00013513889870855322, |
|
"loss": 0.335, |
|
"step": 2504 |
|
}, |
|
{ |
|
"epoch": 3.760119940029985, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00013473313809461324, |
|
"loss": 0.3568, |
|
"step": 2508 |
|
}, |
|
{ |
|
"epoch": 3.7661169415292353, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 0.00013432749033269798, |
|
"loss": 0.3101, |
|
"step": 2512 |
|
}, |
|
{ |
|
"epoch": 3.7721139430284856, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00013392195842133934, |
|
"loss": 0.3066, |
|
"step": 2516 |
|
}, |
|
{ |
|
"epoch": 3.778110944527736, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00013351654535821275, |
|
"loss": 0.3164, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 3.7841079460269866, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00013311125414011511, |
|
"loss": 0.3246, |
|
"step": 2524 |
|
}, |
|
{ |
|
"epoch": 3.790104947526237, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00013270608776294276, |
|
"loss": 0.3198, |
|
"step": 2528 |
|
}, |
|
{ |
|
"epoch": 3.7961019490254873, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.0001323010492216691, |
|
"loss": 0.3005, |
|
"step": 2532 |
|
}, |
|
{ |
|
"epoch": 3.8020989505247376, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.0001318961415103226, |
|
"loss": 0.305, |
|
"step": 2536 |
|
}, |
|
{ |
|
"epoch": 3.808095952023988, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00013149136762196474, |
|
"loss": 0.326, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 3.8140929535232386, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00013108673054866763, |
|
"loss": 0.3226, |
|
"step": 2544 |
|
}, |
|
{ |
|
"epoch": 3.820089955022489, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.0001306822332814921, |
|
"loss": 0.3224, |
|
"step": 2548 |
|
}, |
|
{ |
|
"epoch": 3.8260869565217392, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.0001302778788104657, |
|
"loss": 0.295, |
|
"step": 2552 |
|
}, |
|
{ |
|
"epoch": 3.8320839580209896, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.00012987367012456014, |
|
"loss": 0.3086, |
|
"step": 2556 |
|
}, |
|
{ |
|
"epoch": 3.83808095952024, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00012946961021166983, |
|
"loss": 0.3273, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 3.84407796101949, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00012906570205858906, |
|
"loss": 0.308, |
|
"step": 2564 |
|
}, |
|
{ |
|
"epoch": 3.8500749625187405, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00012866194865099074, |
|
"loss": 0.2829, |
|
"step": 2568 |
|
}, |
|
{ |
|
"epoch": 3.856071964017991, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00012825835297340353, |
|
"loss": 0.3349, |
|
"step": 2572 |
|
}, |
|
{ |
|
"epoch": 3.862068965517241, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.0001278549180091905, |
|
"loss": 0.3356, |
|
"step": 2576 |
|
}, |
|
{ |
|
"epoch": 3.868065967016492, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.0001274516467405264, |
|
"loss": 0.3379, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 3.874062968515742, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00012704854214837618, |
|
"loss": 0.3108, |
|
"step": 2584 |
|
}, |
|
{ |
|
"epoch": 3.8800599700149925, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.0001266456072124727, |
|
"loss": 0.3004, |
|
"step": 2588 |
|
}, |
|
{ |
|
"epoch": 3.886056971514243, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00012624284491129464, |
|
"loss": 0.304, |
|
"step": 2592 |
|
}, |
|
{ |
|
"epoch": 3.892053973013493, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00012584025822204466, |
|
"loss": 0.2709, |
|
"step": 2596 |
|
}, |
|
{ |
|
"epoch": 3.898050974512744, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00012543785012062716, |
|
"loss": 0.2899, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 3.904047976011994, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00012503562358162664, |
|
"loss": 0.2571, |
|
"step": 2604 |
|
}, |
|
{ |
|
"epoch": 3.9100449775112445, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00012463358157828528, |
|
"loss": 0.3106, |
|
"step": 2608 |
|
}, |
|
{ |
|
"epoch": 3.9160419790104948, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00012423172708248136, |
|
"loss": 0.2812, |
|
"step": 2612 |
|
}, |
|
{ |
|
"epoch": 3.922038980509745, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.000123830063064707, |
|
"loss": 0.3079, |
|
"step": 2616 |
|
}, |
|
{ |
|
"epoch": 3.9280359820089954, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00012342859249404636, |
|
"loss": 0.3603, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 3.9340329835082457, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.0001230273183381536, |
|
"loss": 0.3429, |
|
"step": 2624 |
|
}, |
|
{ |
|
"epoch": 3.940029985007496, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00012262624356323105, |
|
"loss": 0.3389, |
|
"step": 2628 |
|
}, |
|
{ |
|
"epoch": 3.9460269865067468, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00012222537113400724, |
|
"loss": 0.3027, |
|
"step": 2632 |
|
}, |
|
{ |
|
"epoch": 3.952023988005997, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.00012182470401371487, |
|
"loss": 0.3059, |
|
"step": 2636 |
|
}, |
|
{ |
|
"epoch": 3.9580209895052474, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.0001214242451640691, |
|
"loss": 0.3146, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 3.9640179910044977, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00012102399754524547, |
|
"loss": 0.3037, |
|
"step": 2644 |
|
}, |
|
{ |
|
"epoch": 3.970014992503748, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00012062396411585825, |
|
"loss": 0.354, |
|
"step": 2648 |
|
}, |
|
{ |
|
"epoch": 3.9760119940029988, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00012022414783293825, |
|
"loss": 0.2754, |
|
"step": 2652 |
|
}, |
|
{ |
|
"epoch": 3.982008995502249, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 0.00011982455165191132, |
|
"loss": 0.3144, |
|
"step": 2656 |
|
}, |
|
{ |
|
"epoch": 3.9880059970014994, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.00011942517852657619, |
|
"loss": 0.3208, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 3.9940029985007497, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00011902603140908281, |
|
"loss": 0.3026, |
|
"step": 2664 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.00011862711324991058, |
|
"loss": 0.2802, |
|
"step": 2668 |
|
}, |
|
{ |
|
"epoch": 4.00599700149925, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00011822842699784631, |
|
"loss": 0.2288, |
|
"step": 2672 |
|
}, |
|
{ |
|
"epoch": 4.011994002998501, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00011782997559996267, |
|
"loss": 0.2148, |
|
"step": 2676 |
|
}, |
|
{ |
|
"epoch": 4.017991004497751, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00011743176200159619, |
|
"loss": 0.2308, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 4.023988005997001, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.00011703378914632574, |
|
"loss": 0.2583, |
|
"step": 2684 |
|
}, |
|
{ |
|
"epoch": 4.0299850074962515, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00011663605997595045, |
|
"loss": 0.2436, |
|
"step": 2688 |
|
}, |
|
{ |
|
"epoch": 4.035982008995502, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00011623857743046834, |
|
"loss": 0.2802, |
|
"step": 2692 |
|
}, |
|
{ |
|
"epoch": 4.041979010494753, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00011584134444805418, |
|
"loss": 0.2094, |
|
"step": 2696 |
|
}, |
|
{ |
|
"epoch": 4.047976011994003, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00011544436396503816, |
|
"loss": 0.1985, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 4.053973013493254, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00011504763891588389, |
|
"loss": 0.2294, |
|
"step": 2704 |
|
}, |
|
{ |
|
"epoch": 4.059970014992504, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00011465117223316685, |
|
"loss": 0.2212, |
|
"step": 2708 |
|
}, |
|
{ |
|
"epoch": 4.065967016491754, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00011425496684755278, |
|
"loss": 0.2316, |
|
"step": 2712 |
|
}, |
|
{ |
|
"epoch": 4.071964017991005, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00011385902568777574, |
|
"loss": 0.2127, |
|
"step": 2716 |
|
}, |
|
{ |
|
"epoch": 4.077961019490255, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00011346335168061682, |
|
"loss": 0.2041, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 4.083958020989505, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00011306794775088218, |
|
"loss": 0.2162, |
|
"step": 2724 |
|
}, |
|
{ |
|
"epoch": 4.0899550224887555, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00011267281682138175, |
|
"loss": 0.223, |
|
"step": 2728 |
|
}, |
|
{ |
|
"epoch": 4.095952023988006, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00011227796181290724, |
|
"loss": 0.2364, |
|
"step": 2732 |
|
}, |
|
{ |
|
"epoch": 4.101949025487256, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00011188338564421098, |
|
"loss": 0.2462, |
|
"step": 2736 |
|
}, |
|
{ |
|
"epoch": 4.1079460269865065, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00011148909123198395, |
|
"loss": 0.2335, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 4.113943028485757, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00011109508149083453, |
|
"loss": 0.2305, |
|
"step": 2744 |
|
}, |
|
{ |
|
"epoch": 4.119940029985007, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00011070135933326671, |
|
"loss": 0.2231, |
|
"step": 2748 |
|
}, |
|
{ |
|
"epoch": 4.125937031484258, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.0001103079276696587, |
|
"loss": 0.2242, |
|
"step": 2752 |
|
}, |
|
{ |
|
"epoch": 4.131934032983509, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.0001099147894082416, |
|
"loss": 0.2473, |
|
"step": 2756 |
|
}, |
|
{ |
|
"epoch": 4.137931034482759, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00010952194745507728, |
|
"loss": 0.2219, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 4.143928035982009, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.00010912940471403777, |
|
"loss": 0.1971, |
|
"step": 2764 |
|
}, |
|
{ |
|
"epoch": 4.1499250374812595, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00010873716408678288, |
|
"loss": 0.2007, |
|
"step": 2768 |
|
}, |
|
{ |
|
"epoch": 4.15592203898051, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00010834522847273966, |
|
"loss": 0.23, |
|
"step": 2772 |
|
}, |
|
{ |
|
"epoch": 4.16191904047976, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0001079536007690801, |
|
"loss": 0.2149, |
|
"step": 2776 |
|
}, |
|
{ |
|
"epoch": 4.1679160419790104, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00010756228387070046, |
|
"loss": 0.2343, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 4.173913043478261, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00010717128067019929, |
|
"loss": 0.2125, |
|
"step": 2784 |
|
}, |
|
{ |
|
"epoch": 4.179910044977511, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00010678059405785647, |
|
"loss": 0.194, |
|
"step": 2788 |
|
}, |
|
{ |
|
"epoch": 4.185907046476761, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00010639022692161167, |
|
"loss": 0.2039, |
|
"step": 2792 |
|
}, |
|
{ |
|
"epoch": 4.191904047976012, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00010600018214704283, |
|
"loss": 0.2133, |
|
"step": 2796 |
|
}, |
|
{ |
|
"epoch": 4.197901049475262, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00010561046261734522, |
|
"loss": 0.2073, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 4.203898050974512, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00010522107121330975, |
|
"loss": 0.2046, |
|
"step": 2804 |
|
}, |
|
{ |
|
"epoch": 4.2098950524737635, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00010483201081330194, |
|
"loss": 0.2083, |
|
"step": 2808 |
|
}, |
|
{ |
|
"epoch": 4.215892053973014, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00010444328429324048, |
|
"loss": 0.2455, |
|
"step": 2812 |
|
}, |
|
{ |
|
"epoch": 4.221889055472264, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.0001040548945265761, |
|
"loss": 0.2274, |
|
"step": 2816 |
|
}, |
|
{ |
|
"epoch": 4.227886056971514, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00010366684438427018, |
|
"loss": 0.2318, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 4.233883058470765, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.0001032791367347737, |
|
"loss": 0.2193, |
|
"step": 2824 |
|
}, |
|
{ |
|
"epoch": 4.239880059970015, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00010289177444400583, |
|
"loss": 0.2116, |
|
"step": 2828 |
|
}, |
|
{ |
|
"epoch": 4.245877061469265, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00010250476037533299, |
|
"loss": 0.222, |
|
"step": 2832 |
|
}, |
|
{ |
|
"epoch": 4.251874062968516, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 0.00010211809738954748, |
|
"loss": 0.1968, |
|
"step": 2836 |
|
}, |
|
{ |
|
"epoch": 4.257871064467766, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00010173178834484643, |
|
"loss": 0.235, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 4.263868065967016, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00010134583609681065, |
|
"loss": 0.2511, |
|
"step": 2844 |
|
}, |
|
{ |
|
"epoch": 4.269865067466267, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00010096024349838352, |
|
"loss": 0.2757, |
|
"step": 2848 |
|
}, |
|
{ |
|
"epoch": 4.275862068965517, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.0001005750133998499, |
|
"loss": 0.2311, |
|
"step": 2852 |
|
}, |
|
{ |
|
"epoch": 4.281859070464767, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00010019014864881507, |
|
"loss": 0.2427, |
|
"step": 2856 |
|
}, |
|
{ |
|
"epoch": 4.287856071964018, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 9.980565209018374e-05, |
|
"loss": 0.2064, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 4.293853073463269, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 9.942152656613876e-05, |
|
"loss": 0.2334, |
|
"step": 2864 |
|
}, |
|
{ |
|
"epoch": 4.299850074962519, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 9.903777491612056e-05, |
|
"loss": 0.1884, |
|
"step": 2868 |
|
}, |
|
{ |
|
"epoch": 4.305847076461769, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 9.865439997680582e-05, |
|
"loss": 0.2225, |
|
"step": 2872 |
|
}, |
|
{ |
|
"epoch": 4.31184407796102, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 9.827140458208643e-05, |
|
"loss": 0.225, |
|
"step": 2876 |
|
}, |
|
{ |
|
"epoch": 4.31784107946027, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 9.788879156304896e-05, |
|
"loss": 0.2365, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 4.32383808095952, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 9.750656374795327e-05, |
|
"loss": 0.2335, |
|
"step": 2884 |
|
}, |
|
{ |
|
"epoch": 4.329835082458771, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 9.712472396221193e-05, |
|
"loss": 0.2408, |
|
"step": 2888 |
|
}, |
|
{ |
|
"epoch": 4.335832083958021, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 9.674327502836913e-05, |
|
"loss": 0.257, |
|
"step": 2892 |
|
}, |
|
{ |
|
"epoch": 4.341829085457271, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 9.636221976607995e-05, |
|
"loss": 0.1954, |
|
"step": 2896 |
|
}, |
|
{ |
|
"epoch": 4.3478260869565215, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 9.598156099208947e-05, |
|
"loss": 0.2215, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 4.353823088455772, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 9.560130152021191e-05, |
|
"loss": 0.2466, |
|
"step": 2904 |
|
}, |
|
{ |
|
"epoch": 4.359820089955022, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 9.522144416130987e-05, |
|
"loss": 0.2279, |
|
"step": 2908 |
|
}, |
|
{ |
|
"epoch": 4.365817091454273, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 9.484199172327358e-05, |
|
"loss": 0.205, |
|
"step": 2912 |
|
}, |
|
{ |
|
"epoch": 4.371814092953524, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 9.446294701100011e-05, |
|
"loss": 0.2063, |
|
"step": 2916 |
|
}, |
|
{ |
|
"epoch": 4.377811094452774, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 9.408431282637256e-05, |
|
"loss": 0.2412, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 4.383808095952024, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 9.370609196823956e-05, |
|
"loss": 0.254, |
|
"step": 2924 |
|
}, |
|
{ |
|
"epoch": 4.389805097451275, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 9.33282872323943e-05, |
|
"loss": 0.2525, |
|
"step": 2928 |
|
}, |
|
{ |
|
"epoch": 4.395802098950525, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 9.295090141155415e-05, |
|
"loss": 0.2227, |
|
"step": 2932 |
|
}, |
|
{ |
|
"epoch": 4.401799100449775, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 9.257393729533971e-05, |
|
"loss": 0.2173, |
|
"step": 2936 |
|
}, |
|
{ |
|
"epoch": 4.4077961019490255, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 9.219739767025461e-05, |
|
"loss": 0.2299, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 4.413793103448276, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 9.182128531966434e-05, |
|
"loss": 0.2214, |
|
"step": 2944 |
|
}, |
|
{ |
|
"epoch": 4.419790104947526, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 9.144560302377629e-05, |
|
"loss": 0.2443, |
|
"step": 2948 |
|
}, |
|
{ |
|
"epoch": 4.425787106446776, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 9.107035355961867e-05, |
|
"loss": 0.205, |
|
"step": 2952 |
|
}, |
|
{ |
|
"epoch": 4.431784107946027, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 9.069553970102035e-05, |
|
"loss": 0.2666, |
|
"step": 2956 |
|
}, |
|
{ |
|
"epoch": 4.437781109445277, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 9.03211642185903e-05, |
|
"loss": 0.1858, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 4.443778110944527, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 8.994722987969674e-05, |
|
"loss": 0.2402, |
|
"step": 2964 |
|
}, |
|
{ |
|
"epoch": 4.449775112443778, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 8.957373944844733e-05, |
|
"loss": 0.2283, |
|
"step": 2968 |
|
}, |
|
{ |
|
"epoch": 4.455772113943029, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 8.920069568566804e-05, |
|
"loss": 0.2357, |
|
"step": 2972 |
|
}, |
|
{ |
|
"epoch": 4.461769115442279, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 8.882810134888341e-05, |
|
"loss": 0.2099, |
|
"step": 2976 |
|
}, |
|
{ |
|
"epoch": 4.4677661169415295, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 8.845595919229552e-05, |
|
"loss": 0.2315, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 4.47376311844078, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 8.808427196676429e-05, |
|
"loss": 0.2123, |
|
"step": 2984 |
|
}, |
|
{ |
|
"epoch": 4.47976011994003, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 8.771304241978647e-05, |
|
"loss": 0.2223, |
|
"step": 2988 |
|
}, |
|
{ |
|
"epoch": 4.48575712143928, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 8.734227329547592e-05, |
|
"loss": 0.1933, |
|
"step": 2992 |
|
}, |
|
{ |
|
"epoch": 4.491754122938531, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 8.697196733454305e-05, |
|
"loss": 0.2669, |
|
"step": 2996 |
|
}, |
|
{ |
|
"epoch": 4.497751124437781, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 8.660212727427438e-05, |
|
"loss": 0.2182, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 4.503748125937031, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 8.623275584851283e-05, |
|
"loss": 0.2159, |
|
"step": 3004 |
|
}, |
|
{ |
|
"epoch": 4.509745127436282, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 8.58638557876368e-05, |
|
"loss": 0.233, |
|
"step": 3008 |
|
}, |
|
{ |
|
"epoch": 4.515742128935532, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 8.549542981854078e-05, |
|
"loss": 0.2061, |
|
"step": 3012 |
|
}, |
|
{ |
|
"epoch": 4.521739130434782, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 8.512748066461446e-05, |
|
"loss": 0.2279, |
|
"step": 3016 |
|
}, |
|
{ |
|
"epoch": 4.527736131934033, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 8.47600110457233e-05, |
|
"loss": 0.2274, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 4.533733133433284, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 8.439302367818756e-05, |
|
"loss": 0.2008, |
|
"step": 3024 |
|
}, |
|
{ |
|
"epoch": 4.539730134932534, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 8.40265212747632e-05, |
|
"loss": 0.2722, |
|
"step": 3028 |
|
}, |
|
{ |
|
"epoch": 4.545727136431784, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 8.366050654462102e-05, |
|
"loss": 0.2094, |
|
"step": 3032 |
|
}, |
|
{ |
|
"epoch": 4.551724137931035, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 8.329498219332716e-05, |
|
"loss": 0.204, |
|
"step": 3036 |
|
}, |
|
{ |
|
"epoch": 4.557721139430285, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 8.29299509228228e-05, |
|
"loss": 0.2176, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 4.563718140929535, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 8.256541543140424e-05, |
|
"loss": 0.2103, |
|
"step": 3044 |
|
}, |
|
{ |
|
"epoch": 4.569715142428786, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 8.220137841370316e-05, |
|
"loss": 0.2291, |
|
"step": 3048 |
|
}, |
|
{ |
|
"epoch": 4.575712143928036, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 8.183784256066643e-05, |
|
"loss": 0.2572, |
|
"step": 3052 |
|
}, |
|
{ |
|
"epoch": 4.581709145427286, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 8.147481055953629e-05, |
|
"loss": 0.1984, |
|
"step": 3056 |
|
}, |
|
{ |
|
"epoch": 4.5877061469265366, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 8.111228509383057e-05, |
|
"loss": 0.2025, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 4.593703148425787, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 8.075026884332297e-05, |
|
"loss": 0.228, |
|
"step": 3064 |
|
}, |
|
{ |
|
"epoch": 4.599700149925037, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 8.038876448402282e-05, |
|
"loss": 0.2427, |
|
"step": 3068 |
|
}, |
|
{ |
|
"epoch": 4.6056971514242875, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 8.002777468815569e-05, |
|
"loss": 0.2203, |
|
"step": 3072 |
|
}, |
|
{ |
|
"epoch": 4.611694152923539, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 7.966730212414362e-05, |
|
"loss": 0.2291, |
|
"step": 3076 |
|
}, |
|
{ |
|
"epoch": 4.617691154422789, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 7.930734945658519e-05, |
|
"loss": 0.2482, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 4.623688155922039, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 7.894791934623587e-05, |
|
"loss": 0.2045, |
|
"step": 3084 |
|
}, |
|
{ |
|
"epoch": 4.62968515742129, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 7.858901444998846e-05, |
|
"loss": 0.2065, |
|
"step": 3088 |
|
}, |
|
{ |
|
"epoch": 4.63568215892054, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 7.82306374208535e-05, |
|
"loss": 0.2197, |
|
"step": 3092 |
|
}, |
|
{ |
|
"epoch": 4.64167916041979, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 7.787279090793946e-05, |
|
"loss": 0.2139, |
|
"step": 3096 |
|
}, |
|
{ |
|
"epoch": 4.6476761619190405, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 7.751547755643325e-05, |
|
"loss": 0.2555, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 4.653673163418291, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 7.715870000758061e-05, |
|
"loss": 0.2481, |
|
"step": 3104 |
|
}, |
|
{ |
|
"epoch": 4.659670164917541, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 7.680246089866683e-05, |
|
"loss": 0.22, |
|
"step": 3108 |
|
}, |
|
{ |
|
"epoch": 4.6656671664167915, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 7.644676286299698e-05, |
|
"loss": 0.2324, |
|
"step": 3112 |
|
}, |
|
{ |
|
"epoch": 4.671664167916042, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 7.609160852987643e-05, |
|
"loss": 0.2062, |
|
"step": 3116 |
|
}, |
|
{ |
|
"epoch": 4.677661169415292, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 7.573700052459173e-05, |
|
"loss": 0.2048, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 4.683658170914542, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 7.53829414683908e-05, |
|
"loss": 0.2774, |
|
"step": 3124 |
|
}, |
|
{ |
|
"epoch": 4.689655172413794, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 7.5029433978464e-05, |
|
"loss": 0.2455, |
|
"step": 3128 |
|
}, |
|
{ |
|
"epoch": 4.695652173913043, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 7.467648066792415e-05, |
|
"loss": 0.2411, |
|
"step": 3132 |
|
}, |
|
{ |
|
"epoch": 4.701649175412294, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 7.432408414578798e-05, |
|
"loss": 0.2107, |
|
"step": 3136 |
|
}, |
|
{ |
|
"epoch": 4.7076461769115445, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 7.397224701695622e-05, |
|
"loss": 0.2526, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 4.713643178410795, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 7.362097188219476e-05, |
|
"loss": 0.2119, |
|
"step": 3144 |
|
}, |
|
{ |
|
"epoch": 4.719640179910045, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 7.327026133811515e-05, |
|
"loss": 0.2031, |
|
"step": 3148 |
|
}, |
|
{ |
|
"epoch": 4.7256371814092955, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 7.292011797715548e-05, |
|
"loss": 0.2313, |
|
"step": 3152 |
|
}, |
|
{ |
|
"epoch": 4.731634182908546, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 7.257054438756125e-05, |
|
"loss": 0.2361, |
|
"step": 3156 |
|
}, |
|
{ |
|
"epoch": 4.737631184407796, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 7.222154315336641e-05, |
|
"loss": 0.2032, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 4.743628185907046, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 7.187311685437385e-05, |
|
"loss": 0.249, |
|
"step": 3164 |
|
}, |
|
{ |
|
"epoch": 4.749625187406297, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 7.152526806613663e-05, |
|
"loss": 0.2215, |
|
"step": 3168 |
|
}, |
|
{ |
|
"epoch": 4.755622188905547, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 7.1177999359939e-05, |
|
"loss": 0.229, |
|
"step": 3172 |
|
}, |
|
{ |
|
"epoch": 4.761619190404797, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 7.083131330277711e-05, |
|
"loss": 0.2435, |
|
"step": 3176 |
|
}, |
|
{ |
|
"epoch": 4.767616191904048, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 7.048521245734027e-05, |
|
"loss": 0.217, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 4.773613193403298, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 7.013969938199183e-05, |
|
"loss": 0.2311, |
|
"step": 3184 |
|
}, |
|
{ |
|
"epoch": 4.779610194902549, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 6.979477663075056e-05, |
|
"loss": 0.2059, |
|
"step": 3188 |
|
}, |
|
{ |
|
"epoch": 4.785607196401799, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 6.945044675327143e-05, |
|
"loss": 0.2165, |
|
"step": 3192 |
|
}, |
|
{ |
|
"epoch": 4.79160419790105, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 6.910671229482687e-05, |
|
"loss": 0.2198, |
|
"step": 3196 |
|
}, |
|
{ |
|
"epoch": 4.7976011994003, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 6.87635757962882e-05, |
|
"loss": 0.2366, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 4.80359820089955, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 6.842103979410638e-05, |
|
"loss": 0.1956, |
|
"step": 3204 |
|
}, |
|
{ |
|
"epoch": 4.809595202398801, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 6.807910682029387e-05, |
|
"loss": 0.2408, |
|
"step": 3208 |
|
}, |
|
{ |
|
"epoch": 4.815592203898051, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 6.77377794024051e-05, |
|
"loss": 0.2167, |
|
"step": 3212 |
|
}, |
|
{ |
|
"epoch": 4.821589205397301, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 6.739706006351873e-05, |
|
"loss": 0.2393, |
|
"step": 3216 |
|
}, |
|
{ |
|
"epoch": 4.827586206896552, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 6.705695132221815e-05, |
|
"loss": 0.2243, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 4.833583208395802, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 6.671745569257357e-05, |
|
"loss": 0.1713, |
|
"step": 3224 |
|
}, |
|
{ |
|
"epoch": 4.839580209895052, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 6.637857568412272e-05, |
|
"loss": 0.2091, |
|
"step": 3228 |
|
}, |
|
{ |
|
"epoch": 4.8455772113943025, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 6.604031380185308e-05, |
|
"loss": 0.2094, |
|
"step": 3232 |
|
}, |
|
{ |
|
"epoch": 4.851574212893553, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 6.570267254618266e-05, |
|
"loss": 0.2276, |
|
"step": 3236 |
|
}, |
|
{ |
|
"epoch": 4.857571214392804, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 6.536565441294204e-05, |
|
"loss": 0.2228, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 4.863568215892054, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 6.502926189335556e-05, |
|
"loss": 0.2045, |
|
"step": 3244 |
|
}, |
|
{ |
|
"epoch": 4.869565217391305, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 6.469349747402306e-05, |
|
"loss": 0.2182, |
|
"step": 3248 |
|
}, |
|
{ |
|
"epoch": 4.875562218890555, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 6.43583636369016e-05, |
|
"loss": 0.2321, |
|
"step": 3252 |
|
}, |
|
{ |
|
"epoch": 4.881559220389805, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 6.402386285928692e-05, |
|
"loss": 0.2101, |
|
"step": 3256 |
|
}, |
|
{ |
|
"epoch": 4.887556221889056, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 6.368999761379517e-05, |
|
"loss": 0.2074, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 4.893553223388306, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 6.33567703683447e-05, |
|
"loss": 0.1884, |
|
"step": 3264 |
|
}, |
|
{ |
|
"epoch": 4.899550224887556, |
|
"grad_norm": 0.375, |
|
"learning_rate": 6.302418358613792e-05, |
|
"loss": 0.2224, |
|
"step": 3268 |
|
}, |
|
{ |
|
"epoch": 4.9055472263868065, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 6.269223972564277e-05, |
|
"loss": 0.2385, |
|
"step": 3272 |
|
}, |
|
{ |
|
"epoch": 4.911544227886057, |
|
"grad_norm": 0.375, |
|
"learning_rate": 6.236094124057479e-05, |
|
"loss": 0.2544, |
|
"step": 3276 |
|
}, |
|
{ |
|
"epoch": 4.917541229385307, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 6.203029057987905e-05, |
|
"loss": 0.2074, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 4.923538230884557, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 6.17002901877118e-05, |
|
"loss": 0.1999, |
|
"step": 3284 |
|
}, |
|
{ |
|
"epoch": 4.929535232383808, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 6.137094250342257e-05, |
|
"loss": 0.2107, |
|
"step": 3288 |
|
}, |
|
{ |
|
"epoch": 4.935532233883059, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 6.104224996153605e-05, |
|
"loss": 0.2358, |
|
"step": 3292 |
|
}, |
|
{ |
|
"epoch": 4.941529235382308, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 6.0714214991734276e-05, |
|
"loss": 0.2614, |
|
"step": 3296 |
|
}, |
|
{ |
|
"epoch": 4.94752623688156, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 6.038684001883845e-05, |
|
"loss": 0.2255, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 4.95352323838081, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 6.0060127462791065e-05, |
|
"loss": 0.2323, |
|
"step": 3304 |
|
}, |
|
{ |
|
"epoch": 4.95952023988006, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 5.9734079738638064e-05, |
|
"loss": 0.2579, |
|
"step": 3308 |
|
}, |
|
{ |
|
"epoch": 4.9655172413793105, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 5.9408699256511124e-05, |
|
"loss": 0.2105, |
|
"step": 3312 |
|
}, |
|
{ |
|
"epoch": 4.971514242878561, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 5.9083988421609544e-05, |
|
"loss": 0.222, |
|
"step": 3316 |
|
}, |
|
{ |
|
"epoch": 4.977511244377811, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 5.875994963418259e-05, |
|
"loss": 0.2258, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 4.983508245877061, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 5.8436585289511966e-05, |
|
"loss": 0.2295, |
|
"step": 3324 |
|
}, |
|
{ |
|
"epoch": 4.989505247376312, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 5.811389777789372e-05, |
|
"loss": 0.2235, |
|
"step": 3328 |
|
}, |
|
{ |
|
"epoch": 4.995502248875562, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 5.779188948462099e-05, |
|
"loss": 0.2327, |
|
"step": 3332 |
|
}, |
|
{ |
|
"epoch": 5.001499250374812, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 5.747056278996586e-05, |
|
"loss": 0.2092, |
|
"step": 3336 |
|
}, |
|
{ |
|
"epoch": 5.007496251874063, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 5.714992006916236e-05, |
|
"loss": 0.207, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 5.013493253373313, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 5.682996369238843e-05, |
|
"loss": 0.1806, |
|
"step": 3344 |
|
}, |
|
{ |
|
"epoch": 5.019490254872563, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 5.6510696024748734e-05, |
|
"loss": 0.188, |
|
"step": 3348 |
|
}, |
|
{ |
|
"epoch": 5.0254872563718145, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 5.619211942625687e-05, |
|
"loss": 0.1713, |
|
"step": 3352 |
|
}, |
|
{ |
|
"epoch": 5.031484257871065, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 5.5874236251818124e-05, |
|
"loss": 0.1948, |
|
"step": 3356 |
|
}, |
|
{ |
|
"epoch": 5.037481259370315, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 5.555704885121213e-05, |
|
"loss": 0.1687, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 5.043478260869565, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 5.5240559569075246e-05, |
|
"loss": 0.1593, |
|
"step": 3364 |
|
}, |
|
{ |
|
"epoch": 5.049475262368816, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 5.4924770744883434e-05, |
|
"loss": 0.1625, |
|
"step": 3368 |
|
}, |
|
{ |
|
"epoch": 5.055472263868066, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 5.4609684712934855e-05, |
|
"loss": 0.2044, |
|
"step": 3372 |
|
}, |
|
{ |
|
"epoch": 5.061469265367316, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 5.4295303802332786e-05, |
|
"loss": 0.201, |
|
"step": 3376 |
|
}, |
|
{ |
|
"epoch": 5.067466266866567, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 5.3981630336968104e-05, |
|
"loss": 0.1713, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 5.073463268365817, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 5.3668666635502397e-05, |
|
"loss": 0.1783, |
|
"step": 3384 |
|
}, |
|
{ |
|
"epoch": 5.079460269865067, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 5.3356415011350605e-05, |
|
"loss": 0.2147, |
|
"step": 3388 |
|
}, |
|
{ |
|
"epoch": 5.085457271364318, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 5.304487777266418e-05, |
|
"loss": 0.1921, |
|
"step": 3392 |
|
}, |
|
{ |
|
"epoch": 5.091454272863568, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 5.2734057222313714e-05, |
|
"loss": 0.1801, |
|
"step": 3396 |
|
}, |
|
{ |
|
"epoch": 5.097451274362818, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 5.242395565787209e-05, |
|
"loss": 0.2036, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 5.103448275862069, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 5.211457537159761e-05, |
|
"loss": 0.1686, |
|
"step": 3404 |
|
}, |
|
{ |
|
"epoch": 5.10944527736132, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 5.1805918650416706e-05, |
|
"loss": 0.2032, |
|
"step": 3408 |
|
}, |
|
{ |
|
"epoch": 5.11544227886057, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 5.1497987775907514e-05, |
|
"loss": 0.1512, |
|
"step": 3412 |
|
}, |
|
{ |
|
"epoch": 5.12143928035982, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 5.1190785024282385e-05, |
|
"loss": 0.1644, |
|
"step": 3416 |
|
}, |
|
{ |
|
"epoch": 5.127436281859071, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 5.088431266637177e-05, |
|
"loss": 0.1709, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 5.133433283358321, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 5.05785729676068e-05, |
|
"loss": 0.1417, |
|
"step": 3424 |
|
}, |
|
{ |
|
"epoch": 5.139430284857571, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 5.027356818800312e-05, |
|
"loss": 0.1518, |
|
"step": 3428 |
|
}, |
|
{ |
|
"epoch": 5.145427286356822, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 4.996930058214351e-05, |
|
"loss": 0.1861, |
|
"step": 3432 |
|
}, |
|
{ |
|
"epoch": 5.151424287856072, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 4.96657723991619e-05, |
|
"loss": 0.1766, |
|
"step": 3436 |
|
}, |
|
{ |
|
"epoch": 5.157421289355322, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 4.936298588272626e-05, |
|
"loss": 0.1931, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 5.1634182908545725, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 4.906094327102233e-05, |
|
"loss": 0.1589, |
|
"step": 3444 |
|
}, |
|
{ |
|
"epoch": 5.169415292353823, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 4.8759646796736814e-05, |
|
"loss": 0.1664, |
|
"step": 3448 |
|
}, |
|
{ |
|
"epoch": 5.175412293853073, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 4.845909868704102e-05, |
|
"loss": 0.1806, |
|
"step": 3452 |
|
}, |
|
{ |
|
"epoch": 5.181409295352323, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 4.815930116357448e-05, |
|
"loss": 0.1722, |
|
"step": 3456 |
|
}, |
|
{ |
|
"epoch": 5.187406296851575, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 4.786025644242828e-05, |
|
"loss": 0.1689, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 5.193403298350825, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 4.756196673412891e-05, |
|
"loss": 0.1683, |
|
"step": 3464 |
|
}, |
|
{ |
|
"epoch": 5.199400299850075, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 4.726443424362174e-05, |
|
"loss": 0.1673, |
|
"step": 3468 |
|
}, |
|
{ |
|
"epoch": 5.2053973013493255, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 4.696766117025499e-05, |
|
"loss": 0.1806, |
|
"step": 3472 |
|
}, |
|
{ |
|
"epoch": 5.211394302848576, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 4.667164970776316e-05, |
|
"loss": 0.1878, |
|
"step": 3476 |
|
}, |
|
{ |
|
"epoch": 5.217391304347826, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 4.637640204425095e-05, |
|
"loss": 0.1947, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 5.2233883058470765, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 4.608192036217719e-05, |
|
"loss": 0.1852, |
|
"step": 3484 |
|
}, |
|
{ |
|
"epoch": 5.229385307346327, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 4.5788206838338526e-05, |
|
"loss": 0.1878, |
|
"step": 3488 |
|
}, |
|
{ |
|
"epoch": 5.235382308845577, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 4.5495263643853396e-05, |
|
"loss": 0.1675, |
|
"step": 3492 |
|
}, |
|
{ |
|
"epoch": 5.241379310344827, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 4.520309294414603e-05, |
|
"loss": 0.1613, |
|
"step": 3496 |
|
}, |
|
{ |
|
"epoch": 5.247376311844078, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 4.491169689893045e-05, |
|
"loss": 0.1876, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 5.253373313343328, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 4.462107766219441e-05, |
|
"loss": 0.1874, |
|
"step": 3504 |
|
}, |
|
{ |
|
"epoch": 5.259370314842578, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 4.4331237382183496e-05, |
|
"loss": 0.1597, |
|
"step": 3508 |
|
}, |
|
{ |
|
"epoch": 5.265367316341829, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 4.4042178201385305e-05, |
|
"loss": 0.2056, |
|
"step": 3512 |
|
}, |
|
{ |
|
"epoch": 5.27136431784108, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 4.375390225651366e-05, |
|
"loss": 0.1552, |
|
"step": 3516 |
|
}, |
|
{ |
|
"epoch": 5.27736131934033, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 4.346641167849264e-05, |
|
"loss": 0.1765, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 5.2833583208395805, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 4.31797085924409e-05, |
|
"loss": 0.1917, |
|
"step": 3524 |
|
}, |
|
{ |
|
"epoch": 5.289355322338831, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 4.2893795117656135e-05, |
|
"loss": 0.1761, |
|
"step": 3528 |
|
}, |
|
{ |
|
"epoch": 5.295352323838081, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 4.260867336759905e-05, |
|
"loss": 0.1688, |
|
"step": 3532 |
|
}, |
|
{ |
|
"epoch": 5.301349325337331, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 4.232434544987825e-05, |
|
"loss": 0.1692, |
|
"step": 3536 |
|
}, |
|
{ |
|
"epoch": 5.307346326836582, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 4.2040813466233966e-05, |
|
"loss": 0.1563, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 5.313343328335832, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 4.17580795125233e-05, |
|
"loss": 0.1555, |
|
"step": 3544 |
|
}, |
|
{ |
|
"epoch": 5.319340329835082, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 4.1476145678704066e-05, |
|
"loss": 0.1778, |
|
"step": 3548 |
|
}, |
|
{ |
|
"epoch": 5.325337331334333, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 4.119501404881986e-05, |
|
"loss": 0.1586, |
|
"step": 3552 |
|
}, |
|
{ |
|
"epoch": 5.331334332833583, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 4.091468670098424e-05, |
|
"loss": 0.1762, |
|
"step": 3556 |
|
}, |
|
{ |
|
"epoch": 5.337331334332833, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 4.063516570736558e-05, |
|
"loss": 0.1775, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 5.3433283358320836, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 4.0356453134171805e-05, |
|
"loss": 0.1796, |
|
"step": 3564 |
|
}, |
|
{ |
|
"epoch": 5.349325337331335, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 4.007855104163492e-05, |
|
"loss": 0.1778, |
|
"step": 3568 |
|
}, |
|
{ |
|
"epoch": 5.355322338830585, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 3.980146148399597e-05, |
|
"loss": 0.164, |
|
"step": 3572 |
|
}, |
|
{ |
|
"epoch": 5.361319340329835, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 3.952518650948966e-05, |
|
"loss": 0.1757, |
|
"step": 3576 |
|
}, |
|
{ |
|
"epoch": 5.367316341829086, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 3.924972816032953e-05, |
|
"loss": 0.1457, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 5.373313343328336, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 3.8975088472692475e-05, |
|
"loss": 0.1562, |
|
"step": 3584 |
|
}, |
|
{ |
|
"epoch": 5.379310344827586, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 3.870126947670392e-05, |
|
"loss": 0.199, |
|
"step": 3588 |
|
}, |
|
{ |
|
"epoch": 5.385307346326837, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 3.84282731964228e-05, |
|
"loss": 0.1527, |
|
"step": 3592 |
|
}, |
|
{ |
|
"epoch": 5.391304347826087, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 3.81561016498266e-05, |
|
"loss": 0.1487, |
|
"step": 3596 |
|
}, |
|
{ |
|
"epoch": 5.397301349325337, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 3.788475684879635e-05, |
|
"loss": 0.1573, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 5.4032983508245875, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 3.761424079910177e-05, |
|
"loss": 0.1872, |
|
"step": 3604 |
|
}, |
|
{ |
|
"epoch": 5.409295352323838, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 3.734455550038665e-05, |
|
"loss": 0.1693, |
|
"step": 3608 |
|
}, |
|
{ |
|
"epoch": 5.415292353823088, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 3.7075702946153665e-05, |
|
"loss": 0.216, |
|
"step": 3612 |
|
}, |
|
{ |
|
"epoch": 5.4212893553223385, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 3.680768512375017e-05, |
|
"loss": 0.1452, |
|
"step": 3616 |
|
}, |
|
{ |
|
"epoch": 5.42728635682159, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 3.654050401435287e-05, |
|
"loss": 0.1779, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 5.43328335832084, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 3.627416159295384e-05, |
|
"loss": 0.1939, |
|
"step": 3624 |
|
}, |
|
{ |
|
"epoch": 5.43928035982009, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 3.600865982834536e-05, |
|
"loss": 0.1767, |
|
"step": 3628 |
|
}, |
|
{ |
|
"epoch": 5.445277361319341, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 3.574400068310587e-05, |
|
"loss": 0.1565, |
|
"step": 3632 |
|
}, |
|
{ |
|
"epoch": 5.451274362818591, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 3.548018611358486e-05, |
|
"loss": 0.2105, |
|
"step": 3636 |
|
}, |
|
{ |
|
"epoch": 5.457271364317841, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 3.521721806988911e-05, |
|
"loss": 0.1788, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 5.4632683658170915, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 3.4955098495867603e-05, |
|
"loss": 0.168, |
|
"step": 3644 |
|
}, |
|
{ |
|
"epoch": 5.469265367316342, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 3.469382932909774e-05, |
|
"loss": 0.1773, |
|
"step": 3648 |
|
}, |
|
{ |
|
"epoch": 5.475262368815592, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 3.443341250087055e-05, |
|
"loss": 0.1772, |
|
"step": 3652 |
|
}, |
|
{ |
|
"epoch": 5.4812593703148424, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 3.417384993617664e-05, |
|
"loss": 0.182, |
|
"step": 3656 |
|
}, |
|
{ |
|
"epoch": 5.487256371814093, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 3.3915143553692076e-05, |
|
"loss": 0.1597, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 5.493253373313343, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 3.3657295265763906e-05, |
|
"loss": 0.1546, |
|
"step": 3664 |
|
}, |
|
{ |
|
"epoch": 5.499250374812593, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 3.3400306978396233e-05, |
|
"loss": 0.193, |
|
"step": 3668 |
|
}, |
|
{ |
|
"epoch": 5.505247376311845, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 3.3144180591236016e-05, |
|
"loss": 0.1674, |
|
"step": 3672 |
|
}, |
|
{ |
|
"epoch": 5.511244377811094, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 3.288891799755921e-05, |
|
"loss": 0.2008, |
|
"step": 3676 |
|
}, |
|
{ |
|
"epoch": 5.517241379310345, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 3.2634521084256554e-05, |
|
"loss": 0.1927, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 5.5232383808095955, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 3.2380991731819644e-05, |
|
"loss": 0.2101, |
|
"step": 3684 |
|
}, |
|
{ |
|
"epoch": 5.529235382308846, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 3.2128331814327304e-05, |
|
"loss": 0.174, |
|
"step": 3688 |
|
}, |
|
{ |
|
"epoch": 5.535232383808096, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 3.187654319943134e-05, |
|
"loss": 0.2115, |
|
"step": 3692 |
|
}, |
|
{ |
|
"epoch": 5.541229385307346, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 3.1625627748343016e-05, |
|
"loss": 0.1934, |
|
"step": 3696 |
|
}, |
|
{ |
|
"epoch": 5.547226386806597, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 3.137558731581914e-05, |
|
"loss": 0.1807, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 5.553223388305847, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 3.112642375014853e-05, |
|
"loss": 0.2024, |
|
"step": 3704 |
|
}, |
|
{ |
|
"epoch": 5.559220389805097, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 3.087813889313812e-05, |
|
"loss": 0.182, |
|
"step": 3708 |
|
}, |
|
{ |
|
"epoch": 5.565217391304348, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 3.063073458009952e-05, |
|
"loss": 0.1723, |
|
"step": 3712 |
|
}, |
|
{ |
|
"epoch": 5.571214392803598, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 3.0384212639835382e-05, |
|
"loss": 0.169, |
|
"step": 3716 |
|
}, |
|
{ |
|
"epoch": 5.577211394302848, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 3.013857489462595e-05, |
|
"loss": 0.1952, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 5.583208395802099, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 2.9893823160215446e-05, |
|
"loss": 0.1863, |
|
"step": 3724 |
|
}, |
|
{ |
|
"epoch": 5.589205397301349, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 2.964995924579875e-05, |
|
"loss": 0.1927, |
|
"step": 3728 |
|
}, |
|
{ |
|
"epoch": 5.5952023988006, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 2.94069849540081e-05, |
|
"loss": 0.1806, |
|
"step": 3732 |
|
}, |
|
{ |
|
"epoch": 5.60119940029985, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 2.9164902080899573e-05, |
|
"loss": 0.1705, |
|
"step": 3736 |
|
}, |
|
{ |
|
"epoch": 5.607196401799101, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 2.8923712415940037e-05, |
|
"loss": 0.177, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 5.613193403298351, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 2.86834177419936e-05, |
|
"loss": 0.1964, |
|
"step": 3744 |
|
}, |
|
{ |
|
"epoch": 5.619190404797601, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 2.844401983530887e-05, |
|
"loss": 0.1936, |
|
"step": 3748 |
|
}, |
|
{ |
|
"epoch": 5.625187406296852, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 2.8205520465505365e-05, |
|
"loss": 0.1755, |
|
"step": 3752 |
|
}, |
|
{ |
|
"epoch": 5.631184407796102, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 2.7967921395560894e-05, |
|
"loss": 0.177, |
|
"step": 3756 |
|
}, |
|
{ |
|
"epoch": 5.637181409295352, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 2.773122438179809e-05, |
|
"loss": 0.1952, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 5.643178410794603, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 2.749543117387164e-05, |
|
"loss": 0.1965, |
|
"step": 3764 |
|
}, |
|
{ |
|
"epoch": 5.649175412293853, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 2.7260543514755493e-05, |
|
"loss": 0.2069, |
|
"step": 3768 |
|
}, |
|
{ |
|
"epoch": 5.655172413793103, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 2.7026563140729657e-05, |
|
"loss": 0.2158, |
|
"step": 3772 |
|
}, |
|
{ |
|
"epoch": 5.6611694152923535, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 2.6793491781367578e-05, |
|
"loss": 0.1859, |
|
"step": 3776 |
|
}, |
|
{ |
|
"epoch": 5.667166416791604, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 2.6561331159523247e-05, |
|
"loss": 0.1472, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 5.673163418290855, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 2.633008299131868e-05, |
|
"loss": 0.1894, |
|
"step": 3784 |
|
}, |
|
{ |
|
"epoch": 5.679160419790105, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 2.609974898613093e-05, |
|
"loss": 0.2038, |
|
"step": 3788 |
|
}, |
|
{ |
|
"epoch": 5.685157421289356, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 2.5870330846579613e-05, |
|
"loss": 0.1641, |
|
"step": 3792 |
|
}, |
|
{ |
|
"epoch": 5.691154422788606, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 2.56418302685143e-05, |
|
"loss": 0.1894, |
|
"step": 3796 |
|
}, |
|
{ |
|
"epoch": 5.697151424287856, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 2.541424894100207e-05, |
|
"loss": 0.1738, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 5.703148425787107, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 2.5187588546314868e-05, |
|
"loss": 0.1835, |
|
"step": 3804 |
|
}, |
|
{ |
|
"epoch": 5.709145427286357, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 2.4961850759917068e-05, |
|
"loss": 0.1637, |
|
"step": 3808 |
|
}, |
|
{ |
|
"epoch": 5.715142428785607, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 2.4737037250453356e-05, |
|
"loss": 0.1893, |
|
"step": 3812 |
|
}, |
|
{ |
|
"epoch": 5.7211394302848575, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 2.4513149679736003e-05, |
|
"loss": 0.1852, |
|
"step": 3816 |
|
}, |
|
{ |
|
"epoch": 5.727136431784108, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 2.429018970273296e-05, |
|
"loss": 0.1963, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 5.733133433283358, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 2.406815896755522e-05, |
|
"loss": 0.1498, |
|
"step": 3824 |
|
}, |
|
{ |
|
"epoch": 5.739130434782608, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 2.3847059115445073e-05, |
|
"loss": 0.1895, |
|
"step": 3828 |
|
}, |
|
{ |
|
"epoch": 5.745127436281859, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 2.3626891780763584e-05, |
|
"loss": 0.1848, |
|
"step": 3832 |
|
}, |
|
{ |
|
"epoch": 5.75112443778111, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 2.3407658590978917e-05, |
|
"loss": 0.187, |
|
"step": 3836 |
|
}, |
|
{ |
|
"epoch": 5.757121439280359, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 2.3189361166653768e-05, |
|
"loss": 0.1572, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 5.7631184407796106, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 2.2972001121433976e-05, |
|
"loss": 0.1693, |
|
"step": 3844 |
|
}, |
|
{ |
|
"epoch": 5.769115442278861, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 2.2755580062036095e-05, |
|
"loss": 0.1786, |
|
"step": 3848 |
|
}, |
|
{ |
|
"epoch": 5.775112443778111, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 2.2540099588235903e-05, |
|
"loss": 0.1919, |
|
"step": 3852 |
|
}, |
|
{ |
|
"epoch": 5.7811094452773615, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 2.2325561292856314e-05, |
|
"loss": 0.1889, |
|
"step": 3856 |
|
}, |
|
{ |
|
"epoch": 5.787106446776612, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 2.2111966761755684e-05, |
|
"loss": 0.166, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 5.793103448275862, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 2.1899317573816187e-05, |
|
"loss": 0.182, |
|
"step": 3864 |
|
}, |
|
{ |
|
"epoch": 5.799100449775112, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 2.1687615300931975e-05, |
|
"loss": 0.188, |
|
"step": 3868 |
|
}, |
|
{ |
|
"epoch": 5.805097451274363, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 2.1476861507997677e-05, |
|
"loss": 0.1971, |
|
"step": 3872 |
|
}, |
|
{ |
|
"epoch": 5.811094452773613, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 2.1267057752896766e-05, |
|
"loss": 0.1775, |
|
"step": 3876 |
|
}, |
|
{ |
|
"epoch": 5.817091454272863, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 2.105820558649016e-05, |
|
"loss": 0.2004, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 5.823088455772114, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 2.0850306552604568e-05, |
|
"loss": 0.1598, |
|
"step": 3884 |
|
}, |
|
{ |
|
"epoch": 5.829085457271364, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 2.0643362188021218e-05, |
|
"loss": 0.1838, |
|
"step": 3888 |
|
}, |
|
{ |
|
"epoch": 5.835082458770614, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 2.0437374022464524e-05, |
|
"loss": 0.1578, |
|
"step": 3892 |
|
}, |
|
{ |
|
"epoch": 5.8410794602698655, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 2.0232343578590626e-05, |
|
"loss": 0.154, |
|
"step": 3896 |
|
}, |
|
{ |
|
"epoch": 5.847076461769116, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 2.0028272371976266e-05, |
|
"loss": 0.1684, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 5.853073463268366, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 1.98251619111075e-05, |
|
"loss": 0.1873, |
|
"step": 3904 |
|
}, |
|
{ |
|
"epoch": 5.859070464767616, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 1.9623013697368694e-05, |
|
"loss": 0.1873, |
|
"step": 3908 |
|
}, |
|
{ |
|
"epoch": 5.865067466266867, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 1.942182922503122e-05, |
|
"loss": 0.2, |
|
"step": 3912 |
|
}, |
|
{ |
|
"epoch": 5.871064467766117, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 1.9221609981242553e-05, |
|
"loss": 0.1689, |
|
"step": 3916 |
|
}, |
|
{ |
|
"epoch": 5.877061469265367, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 1.9022357446015185e-05, |
|
"loss": 0.1852, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 5.883058470764618, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 1.8824073092215865e-05, |
|
"loss": 0.1719, |
|
"step": 3924 |
|
}, |
|
{ |
|
"epoch": 5.889055472263868, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 1.8626758385554474e-05, |
|
"loss": 0.1839, |
|
"step": 3928 |
|
}, |
|
{ |
|
"epoch": 5.895052473763118, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 1.8430414784573287e-05, |
|
"loss": 0.1578, |
|
"step": 3932 |
|
}, |
|
{ |
|
"epoch": 5.901049475262369, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 1.8235043740636317e-05, |
|
"loss": 0.1848, |
|
"step": 3936 |
|
}, |
|
{ |
|
"epoch": 5.907046476761619, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 1.8040646697918344e-05, |
|
"loss": 0.197, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 5.913043478260869, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 1.784722509339452e-05, |
|
"loss": 0.1977, |
|
"step": 3944 |
|
}, |
|
{ |
|
"epoch": 5.91904047976012, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 1.76547803568294e-05, |
|
"loss": 0.1732, |
|
"step": 3948 |
|
}, |
|
{ |
|
"epoch": 5.925037481259371, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 1.7463313910766774e-05, |
|
"loss": 0.1901, |
|
"step": 3952 |
|
}, |
|
{ |
|
"epoch": 5.931034482758621, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 1.7272827170518773e-05, |
|
"loss": 0.1851, |
|
"step": 3956 |
|
}, |
|
{ |
|
"epoch": 5.937031484257871, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 1.7083321544155738e-05, |
|
"loss": 0.1888, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 5.943028485757122, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 1.6894798432495566e-05, |
|
"loss": 0.2085, |
|
"step": 3964 |
|
}, |
|
{ |
|
"epoch": 5.949025487256372, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 1.6707259229093413e-05, |
|
"loss": 0.169, |
|
"step": 3968 |
|
}, |
|
{ |
|
"epoch": 5.955022488755622, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 1.6520705320231532e-05, |
|
"loss": 0.1875, |
|
"step": 3972 |
|
}, |
|
{ |
|
"epoch": 5.9610194902548725, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 1.633513808490884e-05, |
|
"loss": 0.1768, |
|
"step": 3976 |
|
}, |
|
{ |
|
"epoch": 5.967016491754123, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 1.6150558894830816e-05, |
|
"loss": 0.1643, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 5.973013493253373, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 1.596696911439934e-05, |
|
"loss": 0.1737, |
|
"step": 3984 |
|
}, |
|
{ |
|
"epoch": 5.9790104947526235, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 1.5784370100702685e-05, |
|
"loss": 0.1728, |
|
"step": 3988 |
|
}, |
|
{ |
|
"epoch": 5.985007496251874, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 1.5602763203505318e-05, |
|
"loss": 0.1788, |
|
"step": 3992 |
|
}, |
|
{ |
|
"epoch": 5.991004497751124, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 1.542214976523809e-05, |
|
"loss": 0.1671, |
|
"step": 3996 |
|
}, |
|
{ |
|
"epoch": 5.997001499250375, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 1.5242531120988189e-05, |
|
"loss": 0.2023, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 6.002998500749626, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 1.5063908598489388e-05, |
|
"loss": 0.1644, |
|
"step": 4004 |
|
}, |
|
{ |
|
"epoch": 6.008995502248876, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 1.4886283518112136e-05, |
|
"loss": 0.1648, |
|
"step": 4008 |
|
}, |
|
{ |
|
"epoch": 6.014992503748126, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 1.4709657192853791e-05, |
|
"loss": 0.1742, |
|
"step": 4012 |
|
}, |
|
{ |
|
"epoch": 6.0209895052473765, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 1.4534030928329054e-05, |
|
"loss": 0.1818, |
|
"step": 4016 |
|
}, |
|
{ |
|
"epoch": 6.026986506746627, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 1.4359406022760105e-05, |
|
"loss": 0.1813, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 6.032983508245877, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 1.4185783766967262e-05, |
|
"loss": 0.1611, |
|
"step": 4024 |
|
}, |
|
{ |
|
"epoch": 6.0389805097451275, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 1.401316544435907e-05, |
|
"loss": 0.1616, |
|
"step": 4028 |
|
}, |
|
{ |
|
"epoch": 6.044977511244378, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 1.3841552330923277e-05, |
|
"loss": 0.1549, |
|
"step": 4032 |
|
}, |
|
{ |
|
"epoch": 6.050974512743628, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 1.3670945695217028e-05, |
|
"loss": 0.1715, |
|
"step": 4036 |
|
}, |
|
{ |
|
"epoch": 6.056971514242878, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 1.3501346798357714e-05, |
|
"loss": 0.1811, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 6.062968515742129, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 1.3332756894013425e-05, |
|
"loss": 0.1829, |
|
"step": 4044 |
|
}, |
|
{ |
|
"epoch": 6.068965517241379, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 1.3165177228393941e-05, |
|
"loss": 0.159, |
|
"step": 4048 |
|
}, |
|
{ |
|
"epoch": 6.074962518740629, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 1.2998609040241393e-05, |
|
"loss": 0.1612, |
|
"step": 4052 |
|
}, |
|
{ |
|
"epoch": 6.08095952023988, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 1.2833053560821066e-05, |
|
"loss": 0.1986, |
|
"step": 4056 |
|
}, |
|
{ |
|
"epoch": 6.086956521739131, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 1.266851201391234e-05, |
|
"loss": 0.174, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 6.092953523238381, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 1.250498561579964e-05, |
|
"loss": 0.1619, |
|
"step": 4064 |
|
}, |
|
{ |
|
"epoch": 6.098950524737631, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 1.2342475575263555e-05, |
|
"loss": 0.1733, |
|
"step": 4068 |
|
}, |
|
{ |
|
"epoch": 6.104947526236882, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 1.2180983093571656e-05, |
|
"loss": 0.1707, |
|
"step": 4072 |
|
}, |
|
{ |
|
"epoch": 6.110944527736132, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 1.202050936446986e-05, |
|
"loss": 0.1543, |
|
"step": 4076 |
|
}, |
|
{ |
|
"epoch": 6.116941529235382, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 1.1861055574173427e-05, |
|
"loss": 0.1436, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 6.122938530734633, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 1.1702622901358383e-05, |
|
"loss": 0.1772, |
|
"step": 4084 |
|
}, |
|
{ |
|
"epoch": 6.128935532233883, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 1.154521251715257e-05, |
|
"loss": 0.1667, |
|
"step": 4088 |
|
}, |
|
{ |
|
"epoch": 6.134932533733133, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 1.1388825585127175e-05, |
|
"loss": 0.1919, |
|
"step": 4092 |
|
}, |
|
{ |
|
"epoch": 6.140929535232384, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 1.1233463261288111e-05, |
|
"loss": 0.1616, |
|
"step": 4096 |
|
}, |
|
{ |
|
"epoch": 6.146926536731634, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 1.1079126694067359e-05, |
|
"loss": 0.1386, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 6.152923538230884, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 1.0925817024314548e-05, |
|
"loss": 0.1799, |
|
"step": 4104 |
|
}, |
|
{ |
|
"epoch": 6.1589205397301345, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 1.077353538528855e-05, |
|
"loss": 0.1693, |
|
"step": 4108 |
|
}, |
|
{ |
|
"epoch": 6.164917541229386, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 1.0622282902649116e-05, |
|
"loss": 0.1523, |
|
"step": 4112 |
|
}, |
|
{ |
|
"epoch": 6.170914542728636, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 1.0472060694448442e-05, |
|
"loss": 0.1635, |
|
"step": 4116 |
|
}, |
|
{ |
|
"epoch": 6.176911544227886, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 1.032286987112299e-05, |
|
"loss": 0.1727, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 6.182908545727137, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 1.0174711535485286e-05, |
|
"loss": 0.1638, |
|
"step": 4124 |
|
}, |
|
{ |
|
"epoch": 6.188905547226387, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 1.0027586782715774e-05, |
|
"loss": 0.1769, |
|
"step": 4128 |
|
}, |
|
{ |
|
"epoch": 6.194902548725637, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 9.881496700354646e-06, |
|
"loss": 0.1582, |
|
"step": 4132 |
|
}, |
|
{ |
|
"epoch": 6.200899550224888, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 9.736442368293861e-06, |
|
"loss": 0.1645, |
|
"step": 4136 |
|
}, |
|
{ |
|
"epoch": 6.206896551724138, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 9.592424858769204e-06, |
|
"loss": 0.1661, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 6.212893553223388, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 9.44944523635222e-06, |
|
"loss": 0.1379, |
|
"step": 4144 |
|
}, |
|
{ |
|
"epoch": 6.2188905547226385, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 9.307504557942564e-06, |
|
"loss": 0.1912, |
|
"step": 4148 |
|
}, |
|
{ |
|
"epoch": 6.224887556221889, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 9.166603872759875e-06, |
|
"loss": 0.1775, |
|
"step": 4152 |
|
}, |
|
{ |
|
"epoch": 6.230884557721139, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 9.026744222336403e-06, |
|
"loss": 0.1539, |
|
"step": 4156 |
|
}, |
|
{ |
|
"epoch": 6.2368815592203894, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 8.887926640508942e-06, |
|
"loss": 0.1524, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 6.24287856071964, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 8.750152153411506e-06, |
|
"loss": 0.1624, |
|
"step": 4164 |
|
}, |
|
{ |
|
"epoch": 6.248875562218891, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 8.61342177946749e-06, |
|
"loss": 0.1424, |
|
"step": 4168 |
|
}, |
|
{ |
|
"epoch": 6.254872563718141, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 8.477736529382262e-06, |
|
"loss": 0.1799, |
|
"step": 4172 |
|
}, |
|
{ |
|
"epoch": 6.260869565217392, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 8.343097406135723e-06, |
|
"loss": 0.1645, |
|
"step": 4176 |
|
}, |
|
{ |
|
"epoch": 6.266866566716642, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 8.20950540497481e-06, |
|
"loss": 0.1806, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 6.272863568215892, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 8.076961513406177e-06, |
|
"loss": 0.1766, |
|
"step": 4184 |
|
}, |
|
{ |
|
"epoch": 6.2788605697151425, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 7.945466711188885e-06, |
|
"loss": 0.1951, |
|
"step": 4188 |
|
}, |
|
{ |
|
"epoch": 6.284857571214393, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 7.815021970327229e-06, |
|
"loss": 0.1617, |
|
"step": 4192 |
|
}, |
|
{ |
|
"epoch": 6.290854572713643, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 7.68562825506341e-06, |
|
"loss": 0.1674, |
|
"step": 4196 |
|
}, |
|
{ |
|
"epoch": 6.296851574212893, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 7.5572865218705595e-06, |
|
"loss": 0.166, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 6.302848575712144, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 7.429997719445535e-06, |
|
"loss": 0.147, |
|
"step": 4204 |
|
}, |
|
{ |
|
"epoch": 6.308845577211394, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 7.30376278870205e-06, |
|
"loss": 0.1955, |
|
"step": 4208 |
|
}, |
|
{ |
|
"epoch": 6.314842578710644, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 7.178582662763566e-06, |
|
"loss": 0.1965, |
|
"step": 4212 |
|
}, |
|
{ |
|
"epoch": 6.320839580209895, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 7.0544582669564975e-06, |
|
"loss": 0.1743, |
|
"step": 4216 |
|
}, |
|
{ |
|
"epoch": 6.326836581709145, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 6.931390518803387e-06, |
|
"loss": 0.1767, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 6.332833583208396, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 6.8093803280160066e-06, |
|
"loss": 0.1607, |
|
"step": 4224 |
|
}, |
|
{ |
|
"epoch": 6.3388305847076465, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 6.688428596488798e-06, |
|
"loss": 0.1645, |
|
"step": 4228 |
|
}, |
|
{ |
|
"epoch": 6.344827586206897, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 6.568536218291981e-06, |
|
"loss": 0.1841, |
|
"step": 4232 |
|
}, |
|
{ |
|
"epoch": 6.350824587706147, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 6.4497040796652355e-06, |
|
"loss": 0.171, |
|
"step": 4236 |
|
}, |
|
{ |
|
"epoch": 6.356821589205397, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 6.331933059010846e-06, |
|
"loss": 0.179, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 6.362818590704648, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 6.215224026887505e-06, |
|
"loss": 0.1605, |
|
"step": 4244 |
|
}, |
|
{ |
|
"epoch": 6.368815592203898, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 6.099577846003567e-06, |
|
"loss": 0.1864, |
|
"step": 4248 |
|
}, |
|
{ |
|
"epoch": 6.374812593703148, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 5.984995371210971e-06, |
|
"loss": 0.166, |
|
"step": 4252 |
|
}, |
|
{ |
|
"epoch": 6.380809595202399, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 5.871477449498729e-06, |
|
"loss": 0.1881, |
|
"step": 4256 |
|
}, |
|
{ |
|
"epoch": 6.386806596701649, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 5.759024919986699e-06, |
|
"loss": 0.2102, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 6.392803598200899, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 5.647638613919437e-06, |
|
"loss": 0.1468, |
|
"step": 4264 |
|
}, |
|
{ |
|
"epoch": 6.39880059970015, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 5.537319354659969e-06, |
|
"loss": 0.203, |
|
"step": 4268 |
|
}, |
|
{ |
|
"epoch": 6.4047976011994, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 5.4280679576838515e-06, |
|
"loss": 0.1715, |
|
"step": 4272 |
|
}, |
|
{ |
|
"epoch": 6.410794602698651, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 5.319885230572951e-06, |
|
"loss": 0.1833, |
|
"step": 4276 |
|
}, |
|
{ |
|
"epoch": 6.416791604197901, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 5.2127719730096055e-06, |
|
"loss": 0.1797, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 6.422788605697152, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 5.1067289767706575e-06, |
|
"loss": 0.1667, |
|
"step": 4284 |
|
}, |
|
{ |
|
"epoch": 6.428785607196402, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 5.001757025721698e-06, |
|
"loss": 0.1717, |
|
"step": 4288 |
|
}, |
|
{ |
|
"epoch": 6.434782608695652, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 4.897856895811081e-06, |
|
"loss": 0.1724, |
|
"step": 4292 |
|
}, |
|
{ |
|
"epoch": 6.440779610194903, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 4.7950293550643505e-06, |
|
"loss": 0.1764, |
|
"step": 4296 |
|
}, |
|
{ |
|
"epoch": 6.446776611694153, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 4.6932751635785746e-06, |
|
"loss": 0.205, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 6.452773613193403, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 4.592595073516603e-06, |
|
"loss": 0.184, |
|
"step": 4304 |
|
}, |
|
{ |
|
"epoch": 6.458770614692654, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 4.492989829101551e-06, |
|
"loss": 0.1755, |
|
"step": 4308 |
|
}, |
|
{ |
|
"epoch": 6.464767616191904, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 4.394460166611341e-06, |
|
"loss": 0.1813, |
|
"step": 4312 |
|
}, |
|
{ |
|
"epoch": 6.470764617691154, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 4.297006814373305e-06, |
|
"loss": 0.1683, |
|
"step": 4316 |
|
}, |
|
{ |
|
"epoch": 6.4767616191904045, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 4.200630492758638e-06, |
|
"loss": 0.1257, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 6.482758620689655, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 4.105331914177224e-06, |
|
"loss": 0.1559, |
|
"step": 4324 |
|
}, |
|
{ |
|
"epoch": 6.488755622188906, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 4.0111117830722465e-06, |
|
"loss": 0.1228, |
|
"step": 4328 |
|
}, |
|
{ |
|
"epoch": 6.494752623688156, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 3.917970795915154e-06, |
|
"loss": 0.1717, |
|
"step": 4332 |
|
}, |
|
{ |
|
"epoch": 6.500749625187407, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 3.825909641200326e-06, |
|
"loss": 0.1809, |
|
"step": 4336 |
|
}, |
|
{ |
|
"epoch": 6.506746626686657, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 3.73492899944009e-06, |
|
"loss": 0.1642, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 6.512743628185907, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 3.645029543159683e-06, |
|
"loss": 0.1672, |
|
"step": 4344 |
|
}, |
|
{ |
|
"epoch": 6.5187406296851576, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 3.5562119368922006e-06, |
|
"loss": 0.1804, |
|
"step": 4348 |
|
}, |
|
{ |
|
"epoch": 6.524737631184408, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 3.46847683717385e-06, |
|
"loss": 0.1683, |
|
"step": 4352 |
|
}, |
|
{ |
|
"epoch": 6.530734632683658, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 3.3818248925388756e-06, |
|
"loss": 0.1622, |
|
"step": 4356 |
|
}, |
|
{ |
|
"epoch": 6.5367316341829085, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 3.2962567435149744e-06, |
|
"loss": 0.1687, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 6.542728635682159, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 3.2117730226184358e-06, |
|
"loss": 0.1695, |
|
"step": 4364 |
|
}, |
|
{ |
|
"epoch": 6.548725637181409, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 3.128374354349494e-06, |
|
"loss": 0.1884, |
|
"step": 4368 |
|
}, |
|
{ |
|
"epoch": 6.554722638680659, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 3.0460613551877513e-06, |
|
"loss": 0.1671, |
|
"step": 4372 |
|
}, |
|
{ |
|
"epoch": 6.56071964017991, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 2.9648346335875094e-06, |
|
"loss": 0.177, |
|
"step": 4376 |
|
}, |
|
{ |
|
"epoch": 6.566716641679161, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 2.884694789973463e-06, |
|
"loss": 0.1746, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 6.57271364317841, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 2.805642416736048e-06, |
|
"loss": 0.1662, |
|
"step": 4384 |
|
}, |
|
{ |
|
"epoch": 6.5787106446776615, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 2.7276780982272485e-06, |
|
"loss": 0.1771, |
|
"step": 4388 |
|
}, |
|
{ |
|
"epoch": 6.584707646176912, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 2.650802410756081e-06, |
|
"loss": 0.1639, |
|
"step": 4392 |
|
}, |
|
{ |
|
"epoch": 6.590704647676162, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 2.5750159225845835e-06, |
|
"loss": 0.16, |
|
"step": 4396 |
|
}, |
|
{ |
|
"epoch": 6.5967016491754125, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 2.5003191939233668e-06, |
|
"loss": 0.1625, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 6.602698650674663, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 2.4267127769276364e-06, |
|
"loss": 0.1752, |
|
"step": 4404 |
|
}, |
|
{ |
|
"epoch": 6.608695652173913, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 2.3541972156930267e-06, |
|
"loss": 0.1614, |
|
"step": 4408 |
|
}, |
|
{ |
|
"epoch": 6.614692653673163, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 2.2827730462516567e-06, |
|
"loss": 0.1577, |
|
"step": 4412 |
|
}, |
|
{ |
|
"epoch": 6.620689655172414, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 2.2124407965680825e-06, |
|
"loss": 0.1518, |
|
"step": 4416 |
|
}, |
|
{ |
|
"epoch": 6.626686656671664, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 2.1432009865354316e-06, |
|
"loss": 0.1781, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 6.632683658170914, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 2.0750541279715925e-06, |
|
"loss": 0.1576, |
|
"step": 4424 |
|
}, |
|
{ |
|
"epoch": 6.638680659670165, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 2.0080007246153662e-06, |
|
"loss": 0.1574, |
|
"step": 4428 |
|
}, |
|
{ |
|
"epoch": 6.644677661169415, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 1.942041272122835e-06, |
|
"loss": 0.1695, |
|
"step": 4432 |
|
}, |
|
{ |
|
"epoch": 6.650674662668665, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 1.8771762580635508e-06, |
|
"loss": 0.1483, |
|
"step": 4436 |
|
}, |
|
{ |
|
"epoch": 6.6566716641679164, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 1.8134061619170858e-06, |
|
"loss": 0.151, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 6.662668665667167, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 1.750731455069404e-06, |
|
"loss": 0.1499, |
|
"step": 4444 |
|
}, |
|
{ |
|
"epoch": 6.668665667166417, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 1.6891526008094292e-06, |
|
"loss": 0.1633, |
|
"step": 4448 |
|
}, |
|
{ |
|
"epoch": 6.674662668665667, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 1.628670054325515e-06, |
|
"loss": 0.1664, |
|
"step": 4452 |
|
}, |
|
{ |
|
"epoch": 6.680659670164918, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 1.5692842627021973e-06, |
|
"loss": 0.1632, |
|
"step": 4456 |
|
}, |
|
{ |
|
"epoch": 6.686656671664168, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 1.510995664916881e-06, |
|
"loss": 0.1701, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 6.692653673163418, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 1.4538046918365076e-06, |
|
"loss": 0.1586, |
|
"step": 4464 |
|
}, |
|
{ |
|
"epoch": 6.698650674662669, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 1.39771176621441e-06, |
|
"loss": 0.2057, |
|
"step": 4468 |
|
}, |
|
{ |
|
"epoch": 6.704647676161919, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 1.3427173026872295e-06, |
|
"loss": 0.1734, |
|
"step": 4472 |
|
}, |
|
{ |
|
"epoch": 6.710644677661169, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 1.2888217077718367e-06, |
|
"loss": 0.1619, |
|
"step": 4476 |
|
}, |
|
{ |
|
"epoch": 6.7166416791604195, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 1.2360253798622488e-06, |
|
"loss": 0.1809, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 6.72263868065967, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 1.1843287092268173e-06, |
|
"loss": 0.1672, |
|
"step": 4484 |
|
}, |
|
{ |
|
"epoch": 6.72863568215892, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 1.1337320780052117e-06, |
|
"loss": 0.2092, |
|
"step": 4488 |
|
}, |
|
{ |
|
"epoch": 6.734632683658171, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 1.0842358602056899e-06, |
|
"loss": 0.1593, |
|
"step": 4492 |
|
}, |
|
{ |
|
"epoch": 6.740629685157422, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 1.0358404217022997e-06, |
|
"loss": 0.1937, |
|
"step": 4496 |
|
}, |
|
{ |
|
"epoch": 6.746626686656672, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 9.885461202321475e-07, |
|
"loss": 0.1879, |
|
"step": 4500 |
|
} |
|
], |
|
"logging_steps": 4, |
|
"max_steps": 4669, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 7, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.965429329913184e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|