|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 7.0, |
|
"eval_steps": 500, |
|
"global_step": 4347, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00644122383252818, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 2.727272727272727e-05, |
|
"loss": 1.3628, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.01288244766505636, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 5.454545454545454e-05, |
|
"loss": 1.3272, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.01932367149758454, |
|
"grad_norm": 0.375, |
|
"learning_rate": 8.18181818181818e-05, |
|
"loss": 1.2626, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.02576489533011272, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 0.00010909090909090908, |
|
"loss": 1.2028, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.0322061191626409, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 0.00013636363636363634, |
|
"loss": 1.1822, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03864734299516908, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 0.0001636363636363636, |
|
"loss": 1.2029, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.04508856682769726, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 0.0001909090909090909, |
|
"loss": 1.1609, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.05152979066022544, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.00021818181818181816, |
|
"loss": 1.1137, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.057971014492753624, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 0.00024545454545454545, |
|
"loss": 1.085, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.0644122383252818, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 0.0002727272727272727, |
|
"loss": 1.1052, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.07085346215780998, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 0.0003, |
|
"loss": 1.0712, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.07729468599033816, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 0.00029999936035650057, |
|
"loss": 1.0588, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.08373590982286634, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 0.0002999974414314574, |
|
"loss": 1.0531, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.09017713365539452, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 0.00029999424324123633, |
|
"loss": 0.9953, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.0966183574879227, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.0002999897658131134, |
|
"loss": 0.9887, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.10305958132045089, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 0.0002999840091852746, |
|
"loss": 0.9945, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.10950080515297907, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.00029997697340681585, |
|
"loss": 0.9306, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.11594202898550725, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.00029996865853774236, |
|
"loss": 0.9458, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.12238325281803543, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.00029995906464896807, |
|
"loss": 0.9487, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.1288244766505636, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.0002999481918223153, |
|
"loss": 0.9144, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.13526570048309178, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 0.0002999360401505139, |
|
"loss": 0.9289, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.14170692431561996, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00029992260973720023, |
|
"loss": 0.882, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.14814814814814814, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.00029990790069691665, |
|
"loss": 0.9031, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.15458937198067632, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.00029989191315511055, |
|
"loss": 0.9127, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.1610305958132045, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0002998746472481328, |
|
"loss": 0.8803, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.16747181964573268, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.0002998561031232371, |
|
"loss": 0.8764, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.17391304347826086, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.00029983628093857855, |
|
"loss": 0.9189, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.18035426731078905, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00029981518086321225, |
|
"loss": 0.8169, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.18679549114331723, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00029979280307709176, |
|
"loss": 0.8672, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.1932367149758454, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0002997691477710679, |
|
"loss": 0.8387, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.1996779388083736, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0002997442151468869, |
|
"loss": 0.8039, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.20611916264090177, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00029971800541718854, |
|
"loss": 0.8294, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.21256038647342995, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0002996905188055046, |
|
"loss": 0.8228, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.21900161030595813, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00029966175554625696, |
|
"loss": 0.8341, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.22544283413848631, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.00029963171588475525, |
|
"loss": 0.8095, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2318840579710145, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0002996004000771952, |
|
"loss": 0.8285, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.23832528180354268, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00029956780839065616, |
|
"loss": 0.8123, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.24476650563607086, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 0.00029953394110309887, |
|
"loss": 0.7612, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.25120772946859904, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0002994987985033633, |
|
"loss": 0.7723, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.2576489533011272, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0002994623808911659, |
|
"loss": 0.8202, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.2640901771336554, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00029942468857709715, |
|
"loss": 0.7324, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.27053140096618356, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.000299385721882619, |
|
"loss": 0.7818, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.27697262479871176, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.000299345481140062, |
|
"loss": 0.7693, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.2834138486312399, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00029930396669262255, |
|
"loss": 0.7481, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.2898550724637681, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00029926117889435993, |
|
"loss": 0.7478, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.2962962962962963, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00029921711811019334, |
|
"loss": 0.7581, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.3027375201288245, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00029917178471589864, |
|
"loss": 0.7131, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.30917874396135264, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0002991251790981053, |
|
"loss": 0.7121, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.31561996779388085, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0002990773016542932, |
|
"loss": 0.7385, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.322061191626409, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00029902815279278874, |
|
"loss": 0.743, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3285024154589372, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00029897773293276214, |
|
"loss": 0.6984, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.33494363929146537, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.000298926042504223, |
|
"loss": 0.7278, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.3413848631239936, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00029887308194801745, |
|
"loss": 0.7043, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.34782608695652173, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.00029881885171582364, |
|
"loss": 0.7455, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.35426731078904994, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0002987633522701486, |
|
"loss": 0.7314, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.3607085346215781, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00029870658408432375, |
|
"loss": 0.7344, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.3671497584541063, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.0002986485476425011, |
|
"loss": 0.7324, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.37359098228663445, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0002985892434396491, |
|
"loss": 0.7197, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.38003220611916266, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00029852867198154837, |
|
"loss": 0.6616, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.3864734299516908, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.0002984668337847874, |
|
"loss": 0.6325, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.392914653784219, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0002984037293767583, |
|
"loss": 0.6445, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.3993558776167472, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.00029833935929565194, |
|
"loss": 0.6846, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.4057971014492754, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.00029827372409045377, |
|
"loss": 0.6976, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.41223832528180354, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0002982068243209389, |
|
"loss": 0.7165, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.41867954911433175, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00029813866055766736, |
|
"loss": 0.6647, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.4251207729468599, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00029806923338197925, |
|
"loss": 0.6809, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.43156199677938806, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00029799854338598974, |
|
"loss": 0.7285, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.43800322061191627, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.0002979265911725842, |
|
"loss": 0.6978, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00029785337735541276, |
|
"loss": 0.6598, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.45088566827697263, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0002977789025588854, |
|
"loss": 0.6534, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.4573268921095008, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.0002977031674181663, |
|
"loss": 0.7261, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.463768115942029, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00029762617257916873, |
|
"loss": 0.6762, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.47020933977455714, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0002975479186985493, |
|
"loss": 0.6625, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.47665056360708535, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0002974684064437025, |
|
"loss": 0.6617, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.4830917874396135, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00029738763649275496, |
|
"loss": 0.6886, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.4895330112721417, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.0002973056095345596, |
|
"loss": 0.6623, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.49597423510466987, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00029722232626869, |
|
"loss": 0.6568, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.5024154589371981, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.0002971377874054341, |
|
"loss": 0.6281, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.5088566827697263, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.0002970519936657884, |
|
"loss": 0.6618, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.5152979066022544, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.00029696494578145157, |
|
"loss": 0.6797, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.5217391304347826, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0002968766444948185, |
|
"loss": 0.6756, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.5281803542673108, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0002967870905589739, |
|
"loss": 0.698, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.534621578099839, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0002966962847376855, |
|
"loss": 0.6431, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.5410628019323671, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00029660422780539814, |
|
"loss": 0.6713, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.5475040257648953, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00029651092054722665, |
|
"loss": 0.615, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.5539452495974235, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.0002964163637589495, |
|
"loss": 0.7173, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.5603864734299517, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 0.0002963205582470017, |
|
"loss": 0.6808, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.5668276972624798, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00029622350482846844, |
|
"loss": 0.6684, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.573268921095008, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.00029612520433107734, |
|
"loss": 0.644, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.5797101449275363, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0002960256575931922, |
|
"loss": 0.6599, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.5861513687600645, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0002959248654638053, |
|
"loss": 0.7006, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.5925925925925926, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00029582282880253035, |
|
"loss": 0.625, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.5990338164251208, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.0002957195484795952, |
|
"loss": 0.7234, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.605475040257649, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0002956150253758344, |
|
"loss": 0.6556, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.6119162640901772, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00029550926038268146, |
|
"loss": 0.6402, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.6183574879227053, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.0002954022544021617, |
|
"loss": 0.6446, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.6247987117552335, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00029529400834688415, |
|
"loss": 0.6379, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.6312399355877617, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00029518452314003394, |
|
"loss": 0.644, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.6376811594202898, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0002950737997153645, |
|
"loss": 0.6413, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.644122383252818, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00029496183901718927, |
|
"loss": 0.6249, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6505636070853462, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00029484864200037415, |
|
"loss": 0.5929, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.6570048309178744, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.0002947342096303289, |
|
"loss": 0.6447, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.6634460547504025, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0002946185428829991, |
|
"loss": 0.641, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.6698872785829307, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0002945016427448579, |
|
"loss": 0.6878, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.6763285024154589, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0002943835102128975, |
|
"loss": 0.6646, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6827697262479872, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.0002942641462946206, |
|
"loss": 0.613, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.6892109500805152, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00029414355200803197, |
|
"loss": 0.6135, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.6956521739130435, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0002940217283816296, |
|
"loss": 0.6145, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.7020933977455717, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0002938986764543961, |
|
"loss": 0.6199, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.7085346215780999, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.0002937743972757895, |
|
"loss": 0.6566, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.714975845410628, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0002936488919057349, |
|
"loss": 0.6536, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.7214170692431562, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0002935221614146148, |
|
"loss": 0.6586, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.7278582930756844, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.0002933942068832604, |
|
"loss": 0.6234, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.7342995169082126, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00029326502940294207, |
|
"loss": 0.6115, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.7407407407407407, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00029313463007536034, |
|
"loss": 0.6205, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.7471819645732689, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0002930030100126363, |
|
"loss": 0.6185, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.7536231884057971, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.0002928701703373021, |
|
"loss": 0.6305, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.7600644122383253, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00029273611218229165, |
|
"loss": 0.6464, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.7665056360708534, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.0002926008366909307, |
|
"loss": 0.6488, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.7729468599033816, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00029246434501692685, |
|
"loss": 0.6148, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.7793880837359098, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00029232663832436047, |
|
"loss": 0.5946, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.785829307568438, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.0002921877177876741, |
|
"loss": 0.5898, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.7922705314009661, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.0002920475845916626, |
|
"loss": 0.6435, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.7987117552334944, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.00029190623993146313, |
|
"loss": 0.6605, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.8051529790660226, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0002917636850125449, |
|
"loss": 0.6297, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.8115942028985508, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.00029161992105069905, |
|
"loss": 0.6313, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.8180354267310789, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.0002914749492720279, |
|
"loss": 0.5953, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.8244766505636071, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00029132877091293493, |
|
"loss": 0.6615, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.8309178743961353, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.000291181387220114, |
|
"loss": 0.6771, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.8373590982286635, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0002910327994505387, |
|
"loss": 0.5889, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.8438003220611916, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.0002908830088714516, |
|
"loss": 0.5781, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.8502415458937198, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00029073201676035383, |
|
"loss": 0.6182, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.856682769726248, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.00029057982440499356, |
|
"loss": 0.6226, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.8631239935587761, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00029042643310335547, |
|
"loss": 0.6547, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.8695652173913043, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.00029027184416364956, |
|
"loss": 0.6114, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.8760064412238325, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0002901160589043, |
|
"loss": 0.6491, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.8824476650563607, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00028995907865393385, |
|
"loss": 0.6375, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.00028980090475136963, |
|
"loss": 0.6083, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.895330112721417, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.0002896415385456062, |
|
"loss": 0.5879, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.9017713365539453, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.000289480981395811, |
|
"loss": 0.6596, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.9082125603864735, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00028931923467130855, |
|
"loss": 0.5774, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.9146537842190016, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00028915629975156867, |
|
"loss": 0.6118, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.9210950080515298, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0002889921780261949, |
|
"loss": 0.615, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.927536231884058, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.00028882687089491234, |
|
"loss": 0.6225, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.9339774557165862, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0002886603797675563, |
|
"loss": 0.5626, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.9404186795491143, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.0002884927060640596, |
|
"loss": 0.5886, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.9468599033816425, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.0002883238512144409, |
|
"loss": 0.6251, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.9533011272141707, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0002881538166587921, |
|
"loss": 0.6326, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.9597423510466989, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0002879826038472667, |
|
"loss": 0.5666, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.966183574879227, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00028781021424006677, |
|
"loss": 0.5282, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.9726247987117552, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00028763664930743087, |
|
"loss": 0.6628, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.9790660225442834, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00028746191052962146, |
|
"loss": 0.5669, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.9855072463768116, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00028728599939691215, |
|
"loss": 0.5955, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.9919484702093397, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00028710891740957507, |
|
"loss": 0.5995, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.998389694041868, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00028693066607786823, |
|
"loss": 0.5813, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.0048309178743962, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.0002867512469220222, |
|
"loss": 0.5306, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 1.0112721417069244, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00028657066147222773, |
|
"loss": 0.4918, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 1.0177133655394526, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.00028638891126862224, |
|
"loss": 0.5198, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 1.0241545893719808, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.0002862059978612769, |
|
"loss": 0.5673, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 1.0305958132045088, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.00028602192281018327, |
|
"loss": 0.5127, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.037037037037037, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0002858366876852403, |
|
"loss": 0.5517, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 1.0434782608695652, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.0002856502940662403, |
|
"loss": 0.5209, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 1.0499194847020934, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00028546274354285646, |
|
"loss": 0.5362, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 1.0563607085346216, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.00028527403771462826, |
|
"loss": 0.5256, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 1.0628019323671498, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.00028508417819094844, |
|
"loss": 0.5257, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.069243156199678, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0002848931665910492, |
|
"loss": 0.4971, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 1.075684380032206, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.0002847010045439882, |
|
"loss": 0.5214, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 1.0821256038647342, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0002845076936886349, |
|
"loss": 0.5283, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 1.0885668276972624, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0002843132356736563, |
|
"loss": 0.5024, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 1.0950080515297906, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.0002841176321575032, |
|
"loss": 0.5515, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.1014492753623188, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0002839208848083958, |
|
"loss": 0.5493, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 1.107890499194847, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.0002837229953043096, |
|
"loss": 0.4908, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 1.1143317230273753, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0002835239653329611, |
|
"loss": 0.5136, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 1.1207729468599035, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0002833237965917934, |
|
"loss": 0.5379, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 1.1272141706924317, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0002831224907879614, |
|
"loss": 0.5059, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.1336553945249597, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00028292004963831796, |
|
"loss": 0.5231, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 1.1400966183574879, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00028271647486939855, |
|
"loss": 0.5223, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 1.146537842190016, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0002825117682174069, |
|
"loss": 0.4907, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 1.1529790660225443, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.0002823059314282, |
|
"loss": 0.4996, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 1.1594202898550725, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.0002820989662572734, |
|
"loss": 0.5084, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.1658615136876007, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0002818908744697461, |
|
"loss": 0.4909, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 1.1723027375201287, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00028168165784034566, |
|
"loss": 0.5245, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 1.178743961352657, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00028147131815339267, |
|
"loss": 0.5307, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 1.1851851851851851, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.00028125985720278614, |
|
"loss": 0.5213, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 1.1916264090177133, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0002810472767919876, |
|
"loss": 0.5257, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.1980676328502415, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0002808335787340061, |
|
"loss": 0.4913, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 1.2045088566827697, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.00028061876485138264, |
|
"loss": 0.5331, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 1.210950080515298, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00028040283697617464, |
|
"loss": 0.5055, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 1.2173913043478262, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.0002801857969499402, |
|
"loss": 0.5318, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 1.2238325281803544, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0002799676466237225, |
|
"loss": 0.4991, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.2302737520128824, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0002797483878580342, |
|
"loss": 0.5059, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 1.2367149758454106, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00027952802252284104, |
|
"loss": 0.5043, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 1.2431561996779388, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.0002793065524975465, |
|
"loss": 0.5747, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 1.249597423510467, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0002790839796709755, |
|
"loss": 0.5082, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 1.2560386473429952, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00027886030594135805, |
|
"loss": 0.5369, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.2624798711755234, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0002786355332163135, |
|
"loss": 0.5423, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 1.2689210950080514, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.000278409663412834, |
|
"loss": 0.4882, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 1.2753623188405796, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.0002781826984572683, |
|
"loss": 0.504, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 1.2818035426731078, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.0002779546402853051, |
|
"loss": 0.4872, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 1.288244766505636, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00027772549084195675, |
|
"loss": 0.5348, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.2946859903381642, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00027749525208154265, |
|
"loss": 0.5718, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 1.3011272141706924, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0002772639259676726, |
|
"loss": 0.5393, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 1.3075684380032206, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.00027703151447322965, |
|
"loss": 0.5421, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 1.3140096618357489, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.0002767980195803539, |
|
"loss": 0.5555, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 1.320450885668277, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0002765634432804253, |
|
"loss": 0.553, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.3268921095008053, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00027632778757404655, |
|
"loss": 0.5075, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 1.3333333333333333, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0002760910544710261, |
|
"loss": 0.4933, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 1.3397745571658615, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00027585324599036133, |
|
"loss": 0.5039, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 1.3462157809983897, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00027561436416022073, |
|
"loss": 0.5175, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 1.3526570048309179, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00027537441101792715, |
|
"loss": 0.5375, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.359098228663446, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0002751333886099402, |
|
"loss": 0.5235, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 1.3655394524959743, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0002748912989918387, |
|
"loss": 0.4882, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 1.3719806763285023, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0002746481442283034, |
|
"loss": 0.5032, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 1.3784219001610305, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0002744039263930991, |
|
"loss": 0.5052, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 1.3848631239935587, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.0002741586475690571, |
|
"loss": 0.5538, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.391304347826087, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.0002739123098480576, |
|
"loss": 0.5457, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 1.3977455716586151, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.00027366491533101147, |
|
"loss": 0.5111, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 1.4041867954911433, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.0002734164661278426, |
|
"loss": 0.4902, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 1.4106280193236715, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.00027316696435747, |
|
"loss": 0.5504, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 1.4170692431561998, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00027291641214778937, |
|
"loss": 0.5234, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.423510466988728, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0002726648116356554, |
|
"loss": 0.5052, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 1.4299516908212562, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.000272412164966863, |
|
"loss": 0.5189, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 1.4363929146537842, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00027215847429612965, |
|
"loss": 0.4982, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 1.4428341384863124, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.0002719037417870765, |
|
"loss": 0.4916, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 1.4492753623188406, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.00027164796961221015, |
|
"loss": 0.5149, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.4557165861513688, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0002713911599529039, |
|
"loss": 0.5636, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 1.462157809983897, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00027113331499937967, |
|
"loss": 0.5191, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 1.4685990338164252, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00027087443695068873, |
|
"loss": 0.4786, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 1.4750402576489532, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0002706145280146931, |
|
"loss": 0.5033, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 1.4814814814814814, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00027035359040804703, |
|
"loss": 0.4753, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.4879227053140096, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0002700916263561778, |
|
"loss": 0.5255, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 1.4943639291465378, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0002698286380932667, |
|
"loss": 0.5472, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 1.500805152979066, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.0002695646278622302, |
|
"loss": 0.4944, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 1.5072463768115942, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.0002692995979147007, |
|
"loss": 0.4677, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 1.5136876006441224, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00026903355051100734, |
|
"loss": 0.5152, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.5201288244766507, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0002687664879201565, |
|
"loss": 0.5287, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 1.5265700483091789, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.00026849841241981313, |
|
"loss": 0.5185, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 1.533011272141707, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00026822932629628034, |
|
"loss": 0.4925, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 1.539452495974235, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0002679592318444808, |
|
"loss": 0.4938, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 1.5458937198067633, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0002676881313679366, |
|
"loss": 0.4962, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.5523349436392915, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.0002674160271787498, |
|
"loss": 0.4962, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 1.5587761674718197, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.0002671429215975828, |
|
"loss": 0.5142, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 1.5652173913043477, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00026686881695363833, |
|
"loss": 0.5361, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 1.5716586151368759, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0002665937155846399, |
|
"loss": 0.519, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 1.578099838969404, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.0002663176198368114, |
|
"loss": 0.5055, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.5845410628019323, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.0002660405320648576, |
|
"loss": 0.5256, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 1.5909822866344605, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0002657624546319437, |
|
"loss": 0.5103, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 1.5974235104669887, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.0002654833899096753, |
|
"loss": 0.5249, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 1.603864734299517, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.00026520334027807827, |
|
"loss": 0.4895, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 1.6103059581320451, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0002649223081255782, |
|
"loss": 0.5061, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.6167471819645733, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00026464029584898036, |
|
"loss": 0.4781, |
|
"step": 1004 |
|
}, |
|
{ |
|
"epoch": 1.6231884057971016, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00026435730585344896, |
|
"loss": 0.4885, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 1.6296296296296298, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0002640733405524869, |
|
"loss": 0.5188, |
|
"step": 1012 |
|
}, |
|
{ |
|
"epoch": 1.636070853462158, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00026378840236791485, |
|
"loss": 0.5386, |
|
"step": 1016 |
|
}, |
|
{ |
|
"epoch": 1.642512077294686, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.000263502493729851, |
|
"loss": 0.5438, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.6489533011272142, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00026321561707668995, |
|
"loss": 0.5121, |
|
"step": 1024 |
|
}, |
|
{ |
|
"epoch": 1.6553945249597424, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.0002629277748550823, |
|
"loss": 0.4868, |
|
"step": 1028 |
|
}, |
|
{ |
|
"epoch": 1.6618357487922706, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0002626389695199134, |
|
"loss": 0.5199, |
|
"step": 1032 |
|
}, |
|
{ |
|
"epoch": 1.6682769726247986, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0002623492035342826, |
|
"loss": 0.5424, |
|
"step": 1036 |
|
}, |
|
{ |
|
"epoch": 1.6747181964573268, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00026205847936948244, |
|
"loss": 0.4983, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.681159420289855, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00026176679950497706, |
|
"loss": 0.5323, |
|
"step": 1044 |
|
}, |
|
{ |
|
"epoch": 1.6876006441223832, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0002614741664283816, |
|
"loss": 0.5964, |
|
"step": 1048 |
|
}, |
|
{ |
|
"epoch": 1.6940418679549114, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00026118058263544056, |
|
"loss": 0.5227, |
|
"step": 1052 |
|
}, |
|
{ |
|
"epoch": 1.7004830917874396, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00026088605063000696, |
|
"loss": 0.464, |
|
"step": 1056 |
|
}, |
|
{ |
|
"epoch": 1.7069243156199678, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.0002605905729240205, |
|
"loss": 0.4978, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.713365539452496, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00026029415203748633, |
|
"loss": 0.4983, |
|
"step": 1064 |
|
}, |
|
{ |
|
"epoch": 1.7198067632850242, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0002599967904984539, |
|
"loss": 0.5166, |
|
"step": 1068 |
|
}, |
|
{ |
|
"epoch": 1.7262479871175525, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00025969849084299466, |
|
"loss": 0.5683, |
|
"step": 1072 |
|
}, |
|
{ |
|
"epoch": 1.7326892109500807, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00025939925561518126, |
|
"loss": 0.486, |
|
"step": 1076 |
|
}, |
|
{ |
|
"epoch": 1.7391304347826086, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0002590990873670652, |
|
"loss": 0.4655, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.7455716586151369, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.00025879798865865533, |
|
"loss": 0.4689, |
|
"step": 1084 |
|
}, |
|
{ |
|
"epoch": 1.752012882447665, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0002584959620578962, |
|
"loss": 0.424, |
|
"step": 1088 |
|
}, |
|
{ |
|
"epoch": 1.7584541062801933, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00025819301014064574, |
|
"loss": 0.5134, |
|
"step": 1092 |
|
}, |
|
{ |
|
"epoch": 1.7648953301127213, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0002578891354906537, |
|
"loss": 0.4893, |
|
"step": 1096 |
|
}, |
|
{ |
|
"epoch": 1.7713365539452495, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00025758434069953927, |
|
"loss": 0.4887, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.7777777777777777, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0002572786283667692, |
|
"loss": 0.5153, |
|
"step": 1104 |
|
}, |
|
{ |
|
"epoch": 1.7842190016103059, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00025697200109963563, |
|
"loss": 0.5056, |
|
"step": 1108 |
|
}, |
|
{ |
|
"epoch": 1.790660225442834, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.0002566644615132337, |
|
"loss": 0.5319, |
|
"step": 1112 |
|
}, |
|
{ |
|
"epoch": 1.7971014492753623, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00025635601223043933, |
|
"loss": 0.5182, |
|
"step": 1116 |
|
}, |
|
{ |
|
"epoch": 1.8035426731078905, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.000256046655881887, |
|
"loss": 0.5028, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.8099838969404187, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.000255736395105947, |
|
"loss": 0.5006, |
|
"step": 1124 |
|
}, |
|
{ |
|
"epoch": 1.816425120772947, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0002554252325487032, |
|
"loss": 0.5234, |
|
"step": 1128 |
|
}, |
|
{ |
|
"epoch": 1.8228663446054751, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0002551131708639303, |
|
"loss": 0.5544, |
|
"step": 1132 |
|
}, |
|
{ |
|
"epoch": 1.8293075684380034, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00025480021271307156, |
|
"loss": 0.4766, |
|
"step": 1136 |
|
}, |
|
{ |
|
"epoch": 1.8357487922705316, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00025448636076521534, |
|
"loss": 0.4615, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.8421900161030595, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0002541716176970732, |
|
"loss": 0.504, |
|
"step": 1144 |
|
}, |
|
{ |
|
"epoch": 1.8486312399355878, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0002538559861929566, |
|
"loss": 0.5873, |
|
"step": 1148 |
|
}, |
|
{ |
|
"epoch": 1.855072463768116, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.000253539468944754, |
|
"loss": 0.5917, |
|
"step": 1152 |
|
}, |
|
{ |
|
"epoch": 1.8615136876006442, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.0002532220686519081, |
|
"loss": 0.4924, |
|
"step": 1156 |
|
}, |
|
{ |
|
"epoch": 1.8679549114331722, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00025290378802139273, |
|
"loss": 0.4582, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.8743961352657004, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0002525846297676896, |
|
"loss": 0.5639, |
|
"step": 1164 |
|
}, |
|
{ |
|
"epoch": 1.8808373590982286, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.0002522645966127655, |
|
"loss": 0.5198, |
|
"step": 1168 |
|
}, |
|
{ |
|
"epoch": 1.8872785829307568, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.0002519436912860488, |
|
"loss": 0.4766, |
|
"step": 1172 |
|
}, |
|
{ |
|
"epoch": 1.893719806763285, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0002516219165244062, |
|
"loss": 0.4583, |
|
"step": 1176 |
|
}, |
|
{ |
|
"epoch": 1.9001610305958132, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0002512992750721195, |
|
"loss": 0.549, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.9066022544283414, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0002509757696808622, |
|
"loss": 0.4792, |
|
"step": 1184 |
|
}, |
|
{ |
|
"epoch": 1.9130434782608696, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0002506514031096758, |
|
"loss": 0.4834, |
|
"step": 1188 |
|
}, |
|
{ |
|
"epoch": 1.9194847020933978, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00025032617812494664, |
|
"loss": 0.4969, |
|
"step": 1192 |
|
}, |
|
{ |
|
"epoch": 1.925925925925926, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00025000009750038196, |
|
"loss": 0.5553, |
|
"step": 1196 |
|
}, |
|
{ |
|
"epoch": 1.9323671497584543, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00024967316401698647, |
|
"loss": 0.536, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.9388083735909822, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00024934538046303856, |
|
"loss": 0.4848, |
|
"step": 1204 |
|
}, |
|
{ |
|
"epoch": 1.9452495974235104, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0002490167496340664, |
|
"loss": 0.4984, |
|
"step": 1208 |
|
}, |
|
{ |
|
"epoch": 1.9516908212560387, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.0002486872743328244, |
|
"loss": 0.4993, |
|
"step": 1212 |
|
}, |
|
{ |
|
"epoch": 1.9581320450885669, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.000248356957369269, |
|
"loss": 0.5265, |
|
"step": 1216 |
|
}, |
|
{ |
|
"epoch": 1.9645732689210949, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.0002480258015605349, |
|
"loss": 0.5287, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.971014492753623, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0002476938097309108, |
|
"loss": 0.5616, |
|
"step": 1224 |
|
}, |
|
{ |
|
"epoch": 1.9774557165861513, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0002473609847118156, |
|
"loss": 0.4542, |
|
"step": 1228 |
|
}, |
|
{ |
|
"epoch": 1.9838969404186795, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.0002470273293417741, |
|
"loss": 0.4813, |
|
"step": 1232 |
|
}, |
|
{ |
|
"epoch": 1.9903381642512077, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00024669284646639287, |
|
"loss": 0.5336, |
|
"step": 1236 |
|
}, |
|
{ |
|
"epoch": 1.996779388083736, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00024635753893833585, |
|
"loss": 0.5528, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.003220611916264, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 0.00024602140961730006, |
|
"loss": 0.4706, |
|
"step": 1244 |
|
}, |
|
{ |
|
"epoch": 2.0096618357487923, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00024568446136999134, |
|
"loss": 0.4093, |
|
"step": 1248 |
|
}, |
|
{ |
|
"epoch": 2.0161030595813205, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00024534669707009974, |
|
"loss": 0.3899, |
|
"step": 1252 |
|
}, |
|
{ |
|
"epoch": 2.0225442834138487, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.0002450081195982752, |
|
"loss": 0.361, |
|
"step": 1256 |
|
}, |
|
{ |
|
"epoch": 2.028985507246377, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00024466873184210273, |
|
"loss": 0.3999, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.035426731078905, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00024432853669607786, |
|
"loss": 0.3753, |
|
"step": 1264 |
|
}, |
|
{ |
|
"epoch": 2.0418679549114334, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.00024398753706158225, |
|
"loss": 0.3951, |
|
"step": 1268 |
|
}, |
|
{ |
|
"epoch": 2.0483091787439616, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00024364573584685848, |
|
"loss": 0.3791, |
|
"step": 1272 |
|
}, |
|
{ |
|
"epoch": 2.0547504025764893, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00024330313596698553, |
|
"loss": 0.4148, |
|
"step": 1276 |
|
}, |
|
{ |
|
"epoch": 2.0611916264090175, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00024295974034385396, |
|
"loss": 0.3767, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.0676328502415457, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00024261555190614072, |
|
"loss": 0.3743, |
|
"step": 1284 |
|
}, |
|
{ |
|
"epoch": 2.074074074074074, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00024227057358928452, |
|
"loss": 0.3847, |
|
"step": 1288 |
|
}, |
|
{ |
|
"epoch": 2.080515297906602, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00024192480833546044, |
|
"loss": 0.3627, |
|
"step": 1292 |
|
}, |
|
{ |
|
"epoch": 2.0869565217391304, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00024157825909355523, |
|
"loss": 0.4324, |
|
"step": 1296 |
|
}, |
|
{ |
|
"epoch": 2.0933977455716586, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0002412309288191417, |
|
"loss": 0.4302, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.099838969404187, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00024088282047445396, |
|
"loss": 0.3788, |
|
"step": 1304 |
|
}, |
|
{ |
|
"epoch": 2.106280193236715, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00024053393702836185, |
|
"loss": 0.399, |
|
"step": 1308 |
|
}, |
|
{ |
|
"epoch": 2.112721417069243, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0002401842814563457, |
|
"loss": 0.387, |
|
"step": 1312 |
|
}, |
|
{ |
|
"epoch": 2.1191626409017714, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00023983385674047113, |
|
"loss": 0.3905, |
|
"step": 1316 |
|
}, |
|
{ |
|
"epoch": 2.1256038647342996, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00023948266586936324, |
|
"loss": 0.3715, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.132045088566828, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00023913071183818155, |
|
"loss": 0.4474, |
|
"step": 1324 |
|
}, |
|
{ |
|
"epoch": 2.138486312399356, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00023877799764859416, |
|
"loss": 0.3759, |
|
"step": 1328 |
|
}, |
|
{ |
|
"epoch": 2.1449275362318843, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00023842452630875216, |
|
"loss": 0.373, |
|
"step": 1332 |
|
}, |
|
{ |
|
"epoch": 2.151368760064412, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.0002380703008332643, |
|
"loss": 0.4218, |
|
"step": 1336 |
|
}, |
|
{ |
|
"epoch": 2.1578099838969402, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0002377153242431708, |
|
"loss": 0.4234, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.1642512077294684, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00023735959956591786, |
|
"loss": 0.3971, |
|
"step": 1344 |
|
}, |
|
{ |
|
"epoch": 2.1706924315619966, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0002370031298353319, |
|
"loss": 0.4211, |
|
"step": 1348 |
|
}, |
|
{ |
|
"epoch": 2.177133655394525, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00023664591809159353, |
|
"loss": 0.3972, |
|
"step": 1352 |
|
}, |
|
{ |
|
"epoch": 2.183574879227053, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00023628796738121169, |
|
"loss": 0.4185, |
|
"step": 1356 |
|
}, |
|
{ |
|
"epoch": 2.1900161030595813, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00023592928075699763, |
|
"loss": 0.402, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.1964573268921095, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00023556986127803894, |
|
"loss": 0.4056, |
|
"step": 1364 |
|
}, |
|
{ |
|
"epoch": 2.2028985507246377, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00023520971200967334, |
|
"loss": 0.4506, |
|
"step": 1368 |
|
}, |
|
{ |
|
"epoch": 2.209339774557166, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00023484883602346274, |
|
"loss": 0.4093, |
|
"step": 1372 |
|
}, |
|
{ |
|
"epoch": 2.215780998389694, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0002344872363971668, |
|
"loss": 0.4717, |
|
"step": 1376 |
|
}, |
|
{ |
|
"epoch": 2.2222222222222223, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00023412491621471694, |
|
"loss": 0.3948, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.2286634460547505, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00023376187856618972, |
|
"loss": 0.3925, |
|
"step": 1384 |
|
}, |
|
{ |
|
"epoch": 2.2351046698872787, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00023339812654778083, |
|
"loss": 0.4324, |
|
"step": 1388 |
|
}, |
|
{ |
|
"epoch": 2.241545893719807, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0002330336632617784, |
|
"loss": 0.4557, |
|
"step": 1392 |
|
}, |
|
{ |
|
"epoch": 2.247987117552335, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00023266849181653683, |
|
"loss": 0.4301, |
|
"step": 1396 |
|
}, |
|
{ |
|
"epoch": 2.2544283413848634, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00023230261532644985, |
|
"loss": 0.3799, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.260869565217391, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0002319360369119245, |
|
"loss": 0.3826, |
|
"step": 1404 |
|
}, |
|
{ |
|
"epoch": 2.2673107890499193, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00023156875969935405, |
|
"loss": 0.3862, |
|
"step": 1408 |
|
}, |
|
{ |
|
"epoch": 2.2737520128824475, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00023120078682109158, |
|
"loss": 0.4269, |
|
"step": 1412 |
|
}, |
|
{ |
|
"epoch": 2.2801932367149758, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00023083212141542328, |
|
"loss": 0.4139, |
|
"step": 1416 |
|
}, |
|
{ |
|
"epoch": 2.286634460547504, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00023046276662654143, |
|
"loss": 0.3579, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.293075684380032, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00023009272560451803, |
|
"loss": 0.4, |
|
"step": 1424 |
|
}, |
|
{ |
|
"epoch": 2.2995169082125604, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00022972200150527745, |
|
"loss": 0.3937, |
|
"step": 1428 |
|
}, |
|
{ |
|
"epoch": 2.3059581320450886, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00022935059749056992, |
|
"loss": 0.4553, |
|
"step": 1432 |
|
}, |
|
{ |
|
"epoch": 2.312399355877617, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.00022897851672794417, |
|
"loss": 0.396, |
|
"step": 1436 |
|
}, |
|
{ |
|
"epoch": 2.318840579710145, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00022860576239072084, |
|
"loss": 0.5137, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.325281803542673, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00022823233765796502, |
|
"loss": 0.4085, |
|
"step": 1444 |
|
}, |
|
{ |
|
"epoch": 2.3317230273752014, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.0002278582457144595, |
|
"loss": 0.3963, |
|
"step": 1448 |
|
}, |
|
{ |
|
"epoch": 2.3381642512077296, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00022748348975067733, |
|
"loss": 0.4377, |
|
"step": 1452 |
|
}, |
|
{ |
|
"epoch": 2.3446054750402574, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00022710807296275472, |
|
"loss": 0.4275, |
|
"step": 1456 |
|
}, |
|
{ |
|
"epoch": 2.3510466988727856, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0002267319985524637, |
|
"loss": 0.4089, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.357487922705314, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00022635526972718508, |
|
"loss": 0.4386, |
|
"step": 1464 |
|
}, |
|
{ |
|
"epoch": 2.363929146537842, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0002259778896998807, |
|
"loss": 0.4172, |
|
"step": 1468 |
|
}, |
|
{ |
|
"epoch": 2.3703703703703702, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00022559986168906637, |
|
"loss": 0.4022, |
|
"step": 1472 |
|
}, |
|
{ |
|
"epoch": 2.3768115942028984, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00022522118891878418, |
|
"loss": 0.4665, |
|
"step": 1476 |
|
}, |
|
{ |
|
"epoch": 2.3832528180354267, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00022484187461857517, |
|
"loss": 0.3916, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.389694041867955, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00022446192202345156, |
|
"loss": 0.3918, |
|
"step": 1484 |
|
}, |
|
{ |
|
"epoch": 2.396135265700483, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00022408133437386968, |
|
"loss": 0.4198, |
|
"step": 1488 |
|
}, |
|
{ |
|
"epoch": 2.4025764895330113, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00022370011491570162, |
|
"loss": 0.3635, |
|
"step": 1492 |
|
}, |
|
{ |
|
"epoch": 2.4090177133655395, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.000223318266900208, |
|
"loss": 0.4297, |
|
"step": 1496 |
|
}, |
|
{ |
|
"epoch": 2.4154589371980677, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00022293579358401023, |
|
"loss": 0.3819, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.421900161030596, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0002225526982290625, |
|
"loss": 0.4068, |
|
"step": 1504 |
|
}, |
|
{ |
|
"epoch": 2.428341384863124, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00022216898410262428, |
|
"loss": 0.3808, |
|
"step": 1508 |
|
}, |
|
{ |
|
"epoch": 2.4347826086956523, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00022178465447723214, |
|
"loss": 0.4037, |
|
"step": 1512 |
|
}, |
|
{ |
|
"epoch": 2.4412238325281805, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.000221399712630672, |
|
"loss": 0.452, |
|
"step": 1516 |
|
}, |
|
{ |
|
"epoch": 2.4476650563607087, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.0002210141618459513, |
|
"loss": 0.4127, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.454106280193237, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00022062800541127064, |
|
"loss": 0.3894, |
|
"step": 1524 |
|
}, |
|
{ |
|
"epoch": 2.4605475040257647, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00022024124661999613, |
|
"loss": 0.4256, |
|
"step": 1528 |
|
}, |
|
{ |
|
"epoch": 2.466988727858293, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00021985388877063104, |
|
"loss": 0.4556, |
|
"step": 1532 |
|
}, |
|
{ |
|
"epoch": 2.473429951690821, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00021946593516678777, |
|
"loss": 0.4504, |
|
"step": 1536 |
|
}, |
|
{ |
|
"epoch": 2.4798711755233493, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00021907738911715964, |
|
"loss": 0.4062, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.4863123993558776, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00021868825393549275, |
|
"loss": 0.4386, |
|
"step": 1544 |
|
}, |
|
{ |
|
"epoch": 2.4927536231884058, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.0002182985329405576, |
|
"loss": 0.3559, |
|
"step": 1548 |
|
}, |
|
{ |
|
"epoch": 2.499194847020934, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00021790822945612088, |
|
"loss": 0.4244, |
|
"step": 1552 |
|
}, |
|
{ |
|
"epoch": 2.505636070853462, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0002175173468109171, |
|
"loss": 0.4028, |
|
"step": 1556 |
|
}, |
|
{ |
|
"epoch": 2.5120772946859904, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00021712588833862014, |
|
"loss": 0.3726, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.5185185185185186, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00021673385737781492, |
|
"loss": 0.481, |
|
"step": 1564 |
|
}, |
|
{ |
|
"epoch": 2.524959742351047, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00021634125727196883, |
|
"loss": 0.3778, |
|
"step": 1568 |
|
}, |
|
{ |
|
"epoch": 2.531400966183575, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00021594809136940327, |
|
"loss": 0.4438, |
|
"step": 1572 |
|
}, |
|
{ |
|
"epoch": 2.537842190016103, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00021555436302326514, |
|
"loss": 0.4399, |
|
"step": 1576 |
|
}, |
|
{ |
|
"epoch": 2.544283413848631, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00021516007559149803, |
|
"loss": 0.3979, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.550724637681159, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00021476523243681397, |
|
"loss": 0.4085, |
|
"step": 1584 |
|
}, |
|
{ |
|
"epoch": 2.5571658615136874, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0002143698369266643, |
|
"loss": 0.3875, |
|
"step": 1588 |
|
}, |
|
{ |
|
"epoch": 2.5636070853462156, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0002139738924332113, |
|
"loss": 0.4288, |
|
"step": 1592 |
|
}, |
|
{ |
|
"epoch": 2.570048309178744, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.0002135774023332992, |
|
"loss": 0.4155, |
|
"step": 1596 |
|
}, |
|
{ |
|
"epoch": 2.576489533011272, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00021318037000842558, |
|
"loss": 0.377, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.5829307568438002, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00021278279884471242, |
|
"loss": 0.4134, |
|
"step": 1604 |
|
}, |
|
{ |
|
"epoch": 2.5893719806763285, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0002123846922328771, |
|
"loss": 0.3668, |
|
"step": 1608 |
|
}, |
|
{ |
|
"epoch": 2.5958132045088567, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00021198605356820377, |
|
"loss": 0.4207, |
|
"step": 1612 |
|
}, |
|
{ |
|
"epoch": 2.602254428341385, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00021158688625051416, |
|
"loss": 0.434, |
|
"step": 1616 |
|
}, |
|
{ |
|
"epoch": 2.608695652173913, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00021118719368413866, |
|
"loss": 0.3963, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 2.6151368760064413, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0002107869792778873, |
|
"loss": 0.4366, |
|
"step": 1624 |
|
}, |
|
{ |
|
"epoch": 2.6215780998389695, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00021038624644502063, |
|
"loss": 0.3604, |
|
"step": 1628 |
|
}, |
|
{ |
|
"epoch": 2.6280193236714977, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00020998499860322073, |
|
"loss": 0.4029, |
|
"step": 1632 |
|
}, |
|
{ |
|
"epoch": 2.634460547504026, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00020958323917456186, |
|
"loss": 0.429, |
|
"step": 1636 |
|
}, |
|
{ |
|
"epoch": 2.640901771336554, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00020918097158548145, |
|
"loss": 0.426, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 2.6473429951690823, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0002087781992667509, |
|
"loss": 0.4129, |
|
"step": 1644 |
|
}, |
|
{ |
|
"epoch": 2.6537842190016105, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.000208374925653446, |
|
"loss": 0.3946, |
|
"step": 1648 |
|
}, |
|
{ |
|
"epoch": 2.6602254428341388, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00020797115418491816, |
|
"loss": 0.3564, |
|
"step": 1652 |
|
}, |
|
{ |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00020756688830476453, |
|
"loss": 0.4553, |
|
"step": 1656 |
|
}, |
|
{ |
|
"epoch": 2.6731078904991947, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0002071621314607991, |
|
"loss": 0.4497, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 2.679549114331723, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00020675688710502293, |
|
"loss": 0.3987, |
|
"step": 1664 |
|
}, |
|
{ |
|
"epoch": 2.685990338164251, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00020635115869359498, |
|
"loss": 0.4695, |
|
"step": 1668 |
|
}, |
|
{ |
|
"epoch": 2.6924315619967794, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0002059449496868024, |
|
"loss": 0.4566, |
|
"step": 1672 |
|
}, |
|
{ |
|
"epoch": 2.6988727858293076, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00020553826354903121, |
|
"loss": 0.4199, |
|
"step": 1676 |
|
}, |
|
{ |
|
"epoch": 2.7053140096618358, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00020513110374873676, |
|
"loss": 0.3612, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 2.711755233494364, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00020472347375841384, |
|
"loss": 0.383, |
|
"step": 1684 |
|
}, |
|
{ |
|
"epoch": 2.718196457326892, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0002043153770545675, |
|
"loss": 0.4051, |
|
"step": 1688 |
|
}, |
|
{ |
|
"epoch": 2.7246376811594204, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00020390681711768312, |
|
"loss": 0.4408, |
|
"step": 1692 |
|
}, |
|
{ |
|
"epoch": 2.7310789049919486, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00020349779743219682, |
|
"loss": 0.4155, |
|
"step": 1696 |
|
}, |
|
{ |
|
"epoch": 2.7375201288244764, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0002030883214864657, |
|
"loss": 0.4164, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.7439613526570046, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0002026783927727381, |
|
"loss": 0.4013, |
|
"step": 1704 |
|
}, |
|
{ |
|
"epoch": 2.750402576489533, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00020226801478712383, |
|
"loss": 0.3839, |
|
"step": 1708 |
|
}, |
|
{ |
|
"epoch": 2.756843800322061, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00020185719102956438, |
|
"loss": 0.4691, |
|
"step": 1712 |
|
}, |
|
{ |
|
"epoch": 2.763285024154589, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0002014459250038031, |
|
"loss": 0.3949, |
|
"step": 1716 |
|
}, |
|
{ |
|
"epoch": 2.7697262479871174, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00020103422021735507, |
|
"loss": 0.3918, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 2.7761674718196456, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00020062208018147755, |
|
"loss": 0.4027, |
|
"step": 1724 |
|
}, |
|
{ |
|
"epoch": 2.782608695652174, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00020020950841113984, |
|
"loss": 0.4319, |
|
"step": 1728 |
|
}, |
|
{ |
|
"epoch": 2.789049919484702, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00019979650842499324, |
|
"loss": 0.4255, |
|
"step": 1732 |
|
}, |
|
{ |
|
"epoch": 2.7954911433172303, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00019938308374534115, |
|
"loss": 0.4403, |
|
"step": 1736 |
|
}, |
|
{ |
|
"epoch": 2.8019323671497585, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00019896923789810905, |
|
"loss": 0.4311, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 2.8083735909822867, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00019855497441281436, |
|
"loss": 0.407, |
|
"step": 1744 |
|
}, |
|
{ |
|
"epoch": 2.814814814814815, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00019814029682253644, |
|
"loss": 0.4184, |
|
"step": 1748 |
|
}, |
|
{ |
|
"epoch": 2.821256038647343, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00019772520866388605, |
|
"loss": 0.3812, |
|
"step": 1752 |
|
}, |
|
{ |
|
"epoch": 2.8276972624798713, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00019730971347697602, |
|
"loss": 0.4228, |
|
"step": 1756 |
|
}, |
|
{ |
|
"epoch": 2.8341384863123995, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00019689381480539014, |
|
"loss": 0.4321, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 2.8405797101449277, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00019647751619615353, |
|
"loss": 0.4321, |
|
"step": 1764 |
|
}, |
|
{ |
|
"epoch": 2.847020933977456, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00019606082119970214, |
|
"loss": 0.4502, |
|
"step": 1768 |
|
}, |
|
{ |
|
"epoch": 2.853462157809984, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00019564373336985268, |
|
"loss": 0.4298, |
|
"step": 1772 |
|
}, |
|
{ |
|
"epoch": 2.8599033816425123, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00019522625626377198, |
|
"loss": 0.4469, |
|
"step": 1776 |
|
}, |
|
{ |
|
"epoch": 2.86634460547504, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00019480839344194695, |
|
"loss": 0.4033, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 2.8727858293075683, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00019439014846815413, |
|
"loss": 0.4381, |
|
"step": 1784 |
|
}, |
|
{ |
|
"epoch": 2.8792270531400965, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00019397152490942919, |
|
"loss": 0.4205, |
|
"step": 1788 |
|
}, |
|
{ |
|
"epoch": 2.8856682769726247, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00019355252633603668, |
|
"loss": 0.4187, |
|
"step": 1792 |
|
}, |
|
{ |
|
"epoch": 2.892109500805153, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00019313315632143944, |
|
"loss": 0.3912, |
|
"step": 1796 |
|
}, |
|
{ |
|
"epoch": 2.898550724637681, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00019271341844226812, |
|
"loss": 0.4236, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.9049919484702094, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0001922933162782909, |
|
"loss": 0.3677, |
|
"step": 1804 |
|
}, |
|
{ |
|
"epoch": 2.9114331723027376, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00019187285341238261, |
|
"loss": 0.3979, |
|
"step": 1808 |
|
}, |
|
{ |
|
"epoch": 2.917874396135266, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00019145203343049453, |
|
"loss": 0.3967, |
|
"step": 1812 |
|
}, |
|
{ |
|
"epoch": 2.924315619967794, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00019103085992162343, |
|
"loss": 0.4128, |
|
"step": 1816 |
|
}, |
|
{ |
|
"epoch": 2.930756843800322, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00019060933647778135, |
|
"loss": 0.3968, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 2.9371980676328504, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00019018746669396464, |
|
"loss": 0.4208, |
|
"step": 1824 |
|
}, |
|
{ |
|
"epoch": 2.943639291465378, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00018976525416812358, |
|
"loss": 0.413, |
|
"step": 1828 |
|
}, |
|
{ |
|
"epoch": 2.9500805152979064, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00018934270250113135, |
|
"loss": 0.4122, |
|
"step": 1832 |
|
}, |
|
{ |
|
"epoch": 2.9565217391304346, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00018891981529675376, |
|
"loss": 0.3961, |
|
"step": 1836 |
|
}, |
|
{ |
|
"epoch": 2.962962962962963, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00018849659616161808, |
|
"loss": 0.4498, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 2.969404186795491, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00018807304870518263, |
|
"loss": 0.3935, |
|
"step": 1844 |
|
}, |
|
{ |
|
"epoch": 2.975845410628019, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00018764917653970567, |
|
"loss": 0.4183, |
|
"step": 1848 |
|
}, |
|
{ |
|
"epoch": 2.9822866344605474, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.000187224983280215, |
|
"loss": 0.4101, |
|
"step": 1852 |
|
}, |
|
{ |
|
"epoch": 2.9887278582930756, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00018680047254447665, |
|
"loss": 0.4587, |
|
"step": 1856 |
|
}, |
|
{ |
|
"epoch": 2.995169082125604, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0001863756479529644, |
|
"loss": 0.4216, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 3.001610305958132, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.00018595051312882892, |
|
"loss": 0.3842, |
|
"step": 1864 |
|
}, |
|
{ |
|
"epoch": 3.0080515297906603, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00018552507169786634, |
|
"loss": 0.3189, |
|
"step": 1868 |
|
}, |
|
{ |
|
"epoch": 3.0144927536231885, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00018509932728848804, |
|
"loss": 0.3061, |
|
"step": 1872 |
|
}, |
|
{ |
|
"epoch": 3.0209339774557167, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00018467328353168934, |
|
"loss": 0.3166, |
|
"step": 1876 |
|
}, |
|
{ |
|
"epoch": 3.027375201288245, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00018424694406101838, |
|
"loss": 0.3081, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 3.033816425120773, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0001838203125125455, |
|
"loss": 0.2944, |
|
"step": 1884 |
|
}, |
|
{ |
|
"epoch": 3.0402576489533013, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00018339339252483196, |
|
"loss": 0.285, |
|
"step": 1888 |
|
}, |
|
{ |
|
"epoch": 3.0466988727858295, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00018296618773889912, |
|
"loss": 0.2926, |
|
"step": 1892 |
|
}, |
|
{ |
|
"epoch": 3.0531400966183573, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.000182538701798197, |
|
"loss": 0.3019, |
|
"step": 1896 |
|
}, |
|
{ |
|
"epoch": 3.0595813204508855, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00018211093834857379, |
|
"loss": 0.2984, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.0660225442834137, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00018168290103824422, |
|
"loss": 0.3185, |
|
"step": 1904 |
|
}, |
|
{ |
|
"epoch": 3.072463768115942, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00018125459351775873, |
|
"loss": 0.3192, |
|
"step": 1908 |
|
}, |
|
{ |
|
"epoch": 3.07890499194847, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00018082601943997232, |
|
"loss": 0.3459, |
|
"step": 1912 |
|
}, |
|
{ |
|
"epoch": 3.0853462157809983, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00018039718246001325, |
|
"loss": 0.2837, |
|
"step": 1916 |
|
}, |
|
{ |
|
"epoch": 3.0917874396135265, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.000179968086235252, |
|
"loss": 0.3134, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 3.0982286634460547, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00017953873442527008, |
|
"loss": 0.2907, |
|
"step": 1924 |
|
}, |
|
{ |
|
"epoch": 3.104669887278583, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00017910913069182872, |
|
"loss": 0.3076, |
|
"step": 1928 |
|
}, |
|
{ |
|
"epoch": 3.111111111111111, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00017867927869883775, |
|
"loss": 0.293, |
|
"step": 1932 |
|
}, |
|
{ |
|
"epoch": 3.1175523349436394, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00017824918211232422, |
|
"loss": 0.2775, |
|
"step": 1936 |
|
}, |
|
{ |
|
"epoch": 3.1239935587761676, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00017781884460040136, |
|
"loss": 0.3037, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 3.130434782608696, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00017738826983323703, |
|
"loss": 0.3139, |
|
"step": 1944 |
|
}, |
|
{ |
|
"epoch": 3.136876006441224, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00017695746148302252, |
|
"loss": 0.3081, |
|
"step": 1948 |
|
}, |
|
{ |
|
"epoch": 3.143317230273752, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00017652642322394142, |
|
"loss": 0.344, |
|
"step": 1952 |
|
}, |
|
{ |
|
"epoch": 3.14975845410628, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00017609515873213787, |
|
"loss": 0.3006, |
|
"step": 1956 |
|
}, |
|
{ |
|
"epoch": 3.156199677938808, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00017566367168568572, |
|
"loss": 0.2933, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 3.1626409017713364, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00017523196576455663, |
|
"loss": 0.2869, |
|
"step": 1964 |
|
}, |
|
{ |
|
"epoch": 3.1690821256038646, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00017480004465058918, |
|
"loss": 0.2935, |
|
"step": 1968 |
|
}, |
|
{ |
|
"epoch": 3.175523349436393, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00017436791202745706, |
|
"loss": 0.3451, |
|
"step": 1972 |
|
}, |
|
{ |
|
"epoch": 3.181964573268921, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00017393557158063803, |
|
"loss": 0.3047, |
|
"step": 1976 |
|
}, |
|
{ |
|
"epoch": 3.1884057971014492, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00017350302699738204, |
|
"loss": 0.327, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 3.1948470209339774, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00017307028196668028, |
|
"loss": 0.3238, |
|
"step": 1984 |
|
}, |
|
{ |
|
"epoch": 3.2012882447665056, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.0001726373401792333, |
|
"loss": 0.2957, |
|
"step": 1988 |
|
}, |
|
{ |
|
"epoch": 3.207729468599034, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00017220420532741977, |
|
"loss": 0.3124, |
|
"step": 1992 |
|
}, |
|
{ |
|
"epoch": 3.214170692431562, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00017177088110526486, |
|
"loss": 0.2852, |
|
"step": 1996 |
|
}, |
|
{ |
|
"epoch": 3.2206119162640903, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00017133737120840907, |
|
"loss": 0.3084, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.2270531400966185, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.000170903679334076, |
|
"loss": 0.2671, |
|
"step": 2004 |
|
}, |
|
{ |
|
"epoch": 3.2334943639291467, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00017046980918104164, |
|
"loss": 0.2851, |
|
"step": 2008 |
|
}, |
|
{ |
|
"epoch": 3.239935587761675, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0001700357644496022, |
|
"loss": 0.2921, |
|
"step": 2012 |
|
}, |
|
{ |
|
"epoch": 3.246376811594203, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00016960154884154298, |
|
"loss": 0.2898, |
|
"step": 2016 |
|
}, |
|
{ |
|
"epoch": 3.2528180354267313, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00016916716606010646, |
|
"loss": 0.3277, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 3.259259259259259, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00016873261980996095, |
|
"loss": 0.3301, |
|
"step": 2024 |
|
}, |
|
{ |
|
"epoch": 3.2657004830917873, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00016829791379716896, |
|
"loss": 0.3639, |
|
"step": 2028 |
|
}, |
|
{ |
|
"epoch": 3.2721417069243155, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.00016786305172915544, |
|
"loss": 0.3492, |
|
"step": 2032 |
|
}, |
|
{ |
|
"epoch": 3.2785829307568437, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.0001674280373146763, |
|
"loss": 0.3233, |
|
"step": 2036 |
|
}, |
|
{ |
|
"epoch": 3.285024154589372, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00016699287426378683, |
|
"loss": 0.3232, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 3.2914653784219, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0001665575662878099, |
|
"loss": 0.326, |
|
"step": 2044 |
|
}, |
|
{ |
|
"epoch": 3.2979066022544283, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00016612211709930442, |
|
"loss": 0.3169, |
|
"step": 2048 |
|
}, |
|
{ |
|
"epoch": 3.3043478260869565, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00016568653041203356, |
|
"loss": 0.3036, |
|
"step": 2052 |
|
}, |
|
{ |
|
"epoch": 3.3107890499194848, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00016525080994093328, |
|
"loss": 0.2987, |
|
"step": 2056 |
|
}, |
|
{ |
|
"epoch": 3.317230273752013, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00016481495940208046, |
|
"loss": 0.3108, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 3.323671497584541, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.0001643789825126613, |
|
"loss": 0.3119, |
|
"step": 2064 |
|
}, |
|
{ |
|
"epoch": 3.3301127214170694, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.0001639428829909396, |
|
"loss": 0.342, |
|
"step": 2068 |
|
}, |
|
{ |
|
"epoch": 3.3365539452495976, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00016350666455622497, |
|
"loss": 0.3025, |
|
"step": 2072 |
|
}, |
|
{ |
|
"epoch": 3.342995169082126, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.0001630703309288412, |
|
"loss": 0.3136, |
|
"step": 2076 |
|
}, |
|
{ |
|
"epoch": 3.3494363929146536, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.00016263388583009463, |
|
"loss": 0.2957, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 3.3558776167471818, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0001621973329822421, |
|
"loss": 0.2948, |
|
"step": 2084 |
|
}, |
|
{ |
|
"epoch": 3.36231884057971, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00016176067610845958, |
|
"loss": 0.3298, |
|
"step": 2088 |
|
}, |
|
{ |
|
"epoch": 3.368760064412238, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.00016132391893281003, |
|
"loss": 0.327, |
|
"step": 2092 |
|
}, |
|
{ |
|
"epoch": 3.3752012882447664, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.0001608870651802121, |
|
"loss": 0.3009, |
|
"step": 2096 |
|
}, |
|
{ |
|
"epoch": 3.3816425120772946, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00016045011857640783, |
|
"loss": 0.3148, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.388083735909823, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0001600130828479314, |
|
"loss": 0.3282, |
|
"step": 2104 |
|
}, |
|
{ |
|
"epoch": 3.394524959742351, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0001595759617220769, |
|
"loss": 0.3203, |
|
"step": 2108 |
|
}, |
|
{ |
|
"epoch": 3.4009661835748792, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00015913875892686685, |
|
"loss": 0.2977, |
|
"step": 2112 |
|
}, |
|
{ |
|
"epoch": 3.4074074074074074, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00015870147819102025, |
|
"loss": 0.2806, |
|
"step": 2116 |
|
}, |
|
{ |
|
"epoch": 3.4138486312399356, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00015826412324392085, |
|
"loss": 0.3096, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 3.420289855072464, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00015782669781558528, |
|
"loss": 0.301, |
|
"step": 2124 |
|
}, |
|
{ |
|
"epoch": 3.426731078904992, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00015738920563663136, |
|
"loss": 0.3055, |
|
"step": 2128 |
|
}, |
|
{ |
|
"epoch": 3.4331723027375203, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00015695165043824605, |
|
"loss": 0.3187, |
|
"step": 2132 |
|
}, |
|
{ |
|
"epoch": 3.4396135265700485, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00015651403595215392, |
|
"loss": 0.308, |
|
"step": 2136 |
|
}, |
|
{ |
|
"epoch": 3.4460547504025767, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00015607636591058506, |
|
"loss": 0.3033, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 3.452495974235105, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.0001556386440462435, |
|
"loss": 0.3313, |
|
"step": 2144 |
|
}, |
|
{ |
|
"epoch": 3.4589371980676327, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0001552008740922751, |
|
"loss": 0.2891, |
|
"step": 2148 |
|
}, |
|
{ |
|
"epoch": 3.465378421900161, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00015476305978223606, |
|
"loss": 0.3416, |
|
"step": 2152 |
|
}, |
|
{ |
|
"epoch": 3.471819645732689, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00015432520485006055, |
|
"loss": 0.2768, |
|
"step": 2156 |
|
}, |
|
{ |
|
"epoch": 3.4782608695652173, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00015388731303002954, |
|
"loss": 0.3216, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 3.4847020933977455, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0001534493880567384, |
|
"loss": 0.3112, |
|
"step": 2164 |
|
}, |
|
{ |
|
"epoch": 3.4911433172302737, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00015301143366506527, |
|
"loss": 0.323, |
|
"step": 2168 |
|
}, |
|
{ |
|
"epoch": 3.497584541062802, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.00015257345359013928, |
|
"loss": 0.3406, |
|
"step": 2172 |
|
}, |
|
{ |
|
"epoch": 3.50402576489533, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.00015213545156730847, |
|
"loss": 0.2904, |
|
"step": 2176 |
|
}, |
|
{ |
|
"epoch": 3.5104669887278583, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00015169743133210814, |
|
"loss": 0.3107, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 3.5169082125603865, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.0001512593966202289, |
|
"loss": 0.3377, |
|
"step": 2184 |
|
}, |
|
{ |
|
"epoch": 3.5233494363929148, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 0.00015082135116748483, |
|
"loss": 0.3491, |
|
"step": 2188 |
|
}, |
|
{ |
|
"epoch": 3.529790660225443, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00015038329870978168, |
|
"loss": 0.2865, |
|
"step": 2192 |
|
}, |
|
{ |
|
"epoch": 3.536231884057971, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00014994524298308479, |
|
"loss": 0.2913, |
|
"step": 2196 |
|
}, |
|
{ |
|
"epoch": 3.542673107890499, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.0001495071877233875, |
|
"loss": 0.3163, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 3.549114331723027, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00014906913666667913, |
|
"loss": 0.2722, |
|
"step": 2204 |
|
}, |
|
{ |
|
"epoch": 3.5555555555555554, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00014863109354891317, |
|
"loss": 0.3163, |
|
"step": 2208 |
|
}, |
|
{ |
|
"epoch": 3.5619967793880836, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00014819306210597536, |
|
"loss": 0.3735, |
|
"step": 2212 |
|
}, |
|
{ |
|
"epoch": 3.5684380032206118, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.00014775504607365196, |
|
"loss": 0.3303, |
|
"step": 2216 |
|
}, |
|
{ |
|
"epoch": 3.57487922705314, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00014731704918759765, |
|
"loss": 0.2946, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 3.581320450885668, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.000146879075183304, |
|
"loss": 0.3434, |
|
"step": 2224 |
|
}, |
|
{ |
|
"epoch": 3.5877616747181964, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.00014644112779606727, |
|
"loss": 0.3063, |
|
"step": 2228 |
|
}, |
|
{ |
|
"epoch": 3.5942028985507246, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00014600321076095683, |
|
"loss": 0.2962, |
|
"step": 2232 |
|
}, |
|
{ |
|
"epoch": 3.600644122383253, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00014556532781278316, |
|
"loss": 0.3006, |
|
"step": 2236 |
|
}, |
|
{ |
|
"epoch": 3.607085346215781, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00014512748268606592, |
|
"loss": 0.3688, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 3.6135265700483092, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00014468967911500242, |
|
"loss": 0.3348, |
|
"step": 2244 |
|
}, |
|
{ |
|
"epoch": 3.6199677938808374, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.0001442519208334353, |
|
"loss": 0.3128, |
|
"step": 2248 |
|
}, |
|
{ |
|
"epoch": 3.6264090177133657, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00014381421157482125, |
|
"loss": 0.3488, |
|
"step": 2252 |
|
}, |
|
{ |
|
"epoch": 3.632850241545894, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0001433765550721985, |
|
"loss": 0.2614, |
|
"step": 2256 |
|
}, |
|
{ |
|
"epoch": 3.639291465378422, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00014293895505815575, |
|
"loss": 0.2984, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 3.6457326892109503, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00014250141526479953, |
|
"loss": 0.3257, |
|
"step": 2264 |
|
}, |
|
{ |
|
"epoch": 3.6521739130434785, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.00014206393942372314, |
|
"loss": 0.3235, |
|
"step": 2268 |
|
}, |
|
{ |
|
"epoch": 3.6586151368760067, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.0001416265312659741, |
|
"loss": 0.3435, |
|
"step": 2272 |
|
}, |
|
{ |
|
"epoch": 3.6650563607085345, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00014118919452202306, |
|
"loss": 0.3191, |
|
"step": 2276 |
|
}, |
|
{ |
|
"epoch": 3.6714975845410627, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00014075193292173126, |
|
"loss": 0.2869, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 3.677938808373591, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00014031475019431934, |
|
"loss": 0.3089, |
|
"step": 2284 |
|
}, |
|
{ |
|
"epoch": 3.684380032206119, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00013987765006833518, |
|
"loss": 0.3332, |
|
"step": 2288 |
|
}, |
|
{ |
|
"epoch": 3.6908212560386473, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.0001394406362716221, |
|
"loss": 0.3127, |
|
"step": 2292 |
|
}, |
|
{ |
|
"epoch": 3.6972624798711755, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00013900371253128727, |
|
"loss": 0.3177, |
|
"step": 2296 |
|
}, |
|
{ |
|
"epoch": 3.7037037037037037, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.0001385668825736697, |
|
"loss": 0.3324, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 3.710144927536232, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.0001381301501243087, |
|
"loss": 0.2785, |
|
"step": 2304 |
|
}, |
|
{ |
|
"epoch": 3.71658615136876, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.00013769351890791185, |
|
"loss": 0.3274, |
|
"step": 2308 |
|
}, |
|
{ |
|
"epoch": 3.7230273752012883, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00013725699264832344, |
|
"loss": 0.3041, |
|
"step": 2312 |
|
}, |
|
{ |
|
"epoch": 3.7294685990338166, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00013682057506849256, |
|
"loss": 0.3343, |
|
"step": 2316 |
|
}, |
|
{ |
|
"epoch": 3.7359098228663448, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00013638426989044148, |
|
"loss": 0.2785, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 3.7423510466988725, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00013594808083523376, |
|
"loss": 0.3454, |
|
"step": 2324 |
|
}, |
|
{ |
|
"epoch": 3.7487922705314007, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00013551201162294275, |
|
"loss": 0.312, |
|
"step": 2328 |
|
}, |
|
{ |
|
"epoch": 3.755233494363929, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00013507606597261946, |
|
"loss": 0.2885, |
|
"step": 2332 |
|
}, |
|
{ |
|
"epoch": 3.761674718196457, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00013464024760226142, |
|
"loss": 0.3328, |
|
"step": 2336 |
|
}, |
|
{ |
|
"epoch": 3.7681159420289854, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.0001342045602287803, |
|
"loss": 0.3078, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 3.7745571658615136, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00013376900756797085, |
|
"loss": 0.3126, |
|
"step": 2344 |
|
}, |
|
{ |
|
"epoch": 3.780998389694042, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00013333359333447865, |
|
"loss": 0.2941, |
|
"step": 2348 |
|
}, |
|
{ |
|
"epoch": 3.78743961352657, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.0001328983212417689, |
|
"loss": 0.3251, |
|
"step": 2352 |
|
}, |
|
{ |
|
"epoch": 3.793880837359098, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0001324631950020945, |
|
"loss": 0.3367, |
|
"step": 2356 |
|
}, |
|
{ |
|
"epoch": 3.8003220611916264, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.0001320282183264643, |
|
"loss": 0.3164, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 3.8067632850241546, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.00013159339492461176, |
|
"loss": 0.3584, |
|
"step": 2364 |
|
}, |
|
{ |
|
"epoch": 3.813204508856683, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00013115872850496293, |
|
"loss": 0.3307, |
|
"step": 2368 |
|
}, |
|
{ |
|
"epoch": 3.819645732689211, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0001307242227746053, |
|
"loss": 0.3475, |
|
"step": 2372 |
|
}, |
|
{ |
|
"epoch": 3.8260869565217392, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00013028988143925553, |
|
"loss": 0.3058, |
|
"step": 2376 |
|
}, |
|
{ |
|
"epoch": 3.8325281803542675, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00012985570820322868, |
|
"loss": 0.2718, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 3.8389694041867957, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00012942170676940576, |
|
"loss": 0.3074, |
|
"step": 2384 |
|
}, |
|
{ |
|
"epoch": 3.845410628019324, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00012898788083920282, |
|
"loss": 0.3177, |
|
"step": 2388 |
|
}, |
|
{ |
|
"epoch": 3.851851851851852, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0001285542341125389, |
|
"loss": 0.3012, |
|
"step": 2392 |
|
}, |
|
{ |
|
"epoch": 3.8582930756843803, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.0001281207702878049, |
|
"loss": 0.3024, |
|
"step": 2396 |
|
}, |
|
{ |
|
"epoch": 3.864734299516908, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00012768749306183165, |
|
"loss": 0.3092, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 3.8711755233494363, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00012725440612985868, |
|
"loss": 0.2978, |
|
"step": 2404 |
|
}, |
|
{ |
|
"epoch": 3.8776167471819645, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.0001268215131855025, |
|
"loss": 0.3337, |
|
"step": 2408 |
|
}, |
|
{ |
|
"epoch": 3.8840579710144927, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00012638881792072522, |
|
"loss": 0.3278, |
|
"step": 2412 |
|
}, |
|
{ |
|
"epoch": 3.890499194847021, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00012595632402580305, |
|
"loss": 0.3051, |
|
"step": 2416 |
|
}, |
|
{ |
|
"epoch": 3.896940418679549, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00012552403518929472, |
|
"loss": 0.2764, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 3.9033816425120773, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0001250919550980102, |
|
"loss": 0.3124, |
|
"step": 2424 |
|
}, |
|
{ |
|
"epoch": 3.9098228663446055, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00012466008743697906, |
|
"loss": 0.3407, |
|
"step": 2428 |
|
}, |
|
{ |
|
"epoch": 3.9162640901771337, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00012422843588941925, |
|
"loss": 0.3336, |
|
"step": 2432 |
|
}, |
|
{ |
|
"epoch": 3.922705314009662, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00012379700413670547, |
|
"loss": 0.2992, |
|
"step": 2436 |
|
}, |
|
{ |
|
"epoch": 3.92914653784219, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00012336579585833798, |
|
"loss": 0.3341, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 3.9355877616747184, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00012293481473191103, |
|
"loss": 0.3153, |
|
"step": 2444 |
|
}, |
|
{ |
|
"epoch": 3.942028985507246, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00012250406443308168, |
|
"loss": 0.2993, |
|
"step": 2448 |
|
}, |
|
{ |
|
"epoch": 3.9484702093397743, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00012207354863553825, |
|
"loss": 0.3144, |
|
"step": 2452 |
|
}, |
|
{ |
|
"epoch": 3.9549114331723025, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.00012164327101096923, |
|
"loss": 0.3251, |
|
"step": 2456 |
|
}, |
|
{ |
|
"epoch": 3.9613526570048307, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00012121323522903167, |
|
"loss": 0.2799, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 3.967793880837359, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.00012078344495732028, |
|
"loss": 0.3188, |
|
"step": 2464 |
|
}, |
|
{ |
|
"epoch": 3.974235104669887, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00012035390386133558, |
|
"loss": 0.3052, |
|
"step": 2468 |
|
}, |
|
{ |
|
"epoch": 3.9806763285024154, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00011992461560445337, |
|
"loss": 0.2771, |
|
"step": 2472 |
|
}, |
|
{ |
|
"epoch": 3.9871175523349436, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00011949558384789271, |
|
"loss": 0.3164, |
|
"step": 2476 |
|
}, |
|
{ |
|
"epoch": 3.993558776167472, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00011906681225068535, |
|
"loss": 0.2902, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.00011863830446964417, |
|
"loss": 0.3142, |
|
"step": 2484 |
|
}, |
|
{ |
|
"epoch": 4.006441223832528, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.00011821006415933199, |
|
"loss": 0.2147, |
|
"step": 2488 |
|
}, |
|
{ |
|
"epoch": 4.012882447665056, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00011778209497203062, |
|
"loss": 0.2092, |
|
"step": 2492 |
|
}, |
|
{ |
|
"epoch": 4.019323671497585, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00011735440055770945, |
|
"loss": 0.2548, |
|
"step": 2496 |
|
}, |
|
{ |
|
"epoch": 4.025764895330113, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00011692698456399458, |
|
"loss": 0.2183, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 4.032206119162641, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0001164998506361374, |
|
"loss": 0.2009, |
|
"step": 2504 |
|
}, |
|
{ |
|
"epoch": 4.038647342995169, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00011607300241698387, |
|
"loss": 0.218, |
|
"step": 2508 |
|
}, |
|
{ |
|
"epoch": 4.0450885668276975, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00011564644354694312, |
|
"loss": 0.2201, |
|
"step": 2512 |
|
}, |
|
{ |
|
"epoch": 4.051529790660226, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00011522017766395665, |
|
"loss": 0.2078, |
|
"step": 2516 |
|
}, |
|
{ |
|
"epoch": 4.057971014492754, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00011479420840346706, |
|
"loss": 0.1932, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 4.064412238325282, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00011436853939838734, |
|
"loss": 0.2217, |
|
"step": 2524 |
|
}, |
|
{ |
|
"epoch": 4.07085346215781, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.0001139431742790696, |
|
"loss": 0.2448, |
|
"step": 2528 |
|
}, |
|
{ |
|
"epoch": 4.0772946859903385, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0001135181166732743, |
|
"loss": 0.2254, |
|
"step": 2532 |
|
}, |
|
{ |
|
"epoch": 4.083735909822867, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.00011309337020613922, |
|
"loss": 0.2665, |
|
"step": 2536 |
|
}, |
|
{ |
|
"epoch": 4.090177133655395, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.0001126689385001486, |
|
"loss": 0.2365, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 4.096618357487923, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00011224482517510224, |
|
"loss": 0.2341, |
|
"step": 2544 |
|
}, |
|
{ |
|
"epoch": 4.1030595813204505, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00011182103384808444, |
|
"loss": 0.2015, |
|
"step": 2548 |
|
}, |
|
{ |
|
"epoch": 4.109500805152979, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00011139756813343359, |
|
"loss": 0.2334, |
|
"step": 2552 |
|
}, |
|
{ |
|
"epoch": 4.115942028985507, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00011097443164271075, |
|
"loss": 0.246, |
|
"step": 2556 |
|
}, |
|
{ |
|
"epoch": 4.122383252818035, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00011055162798466948, |
|
"loss": 0.2322, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 4.128824476650563, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00011012916076522443, |
|
"loss": 0.2178, |
|
"step": 2564 |
|
}, |
|
{ |
|
"epoch": 4.1352657004830915, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00010970703358742127, |
|
"loss": 0.2147, |
|
"step": 2568 |
|
}, |
|
{ |
|
"epoch": 4.14170692431562, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00010928525005140521, |
|
"loss": 0.2315, |
|
"step": 2572 |
|
}, |
|
{ |
|
"epoch": 4.148148148148148, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00010886381375439105, |
|
"loss": 0.2284, |
|
"step": 2576 |
|
}, |
|
{ |
|
"epoch": 4.154589371980676, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0001084427282906318, |
|
"loss": 0.2568, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 4.161030595813204, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00010802199725138869, |
|
"loss": 0.2163, |
|
"step": 2584 |
|
}, |
|
{ |
|
"epoch": 4.1674718196457325, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00010760162422489987, |
|
"loss": 0.2267, |
|
"step": 2588 |
|
}, |
|
{ |
|
"epoch": 4.173913043478261, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00010718161279635048, |
|
"loss": 0.2263, |
|
"step": 2592 |
|
}, |
|
{ |
|
"epoch": 4.180354267310789, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00010676196654784144, |
|
"loss": 0.2395, |
|
"step": 2596 |
|
}, |
|
{ |
|
"epoch": 4.186795491143317, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00010634268905835949, |
|
"loss": 0.2454, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 4.193236714975845, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00010592378390374612, |
|
"loss": 0.2186, |
|
"step": 2604 |
|
}, |
|
{ |
|
"epoch": 4.199677938808374, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00010550525465666751, |
|
"loss": 0.2302, |
|
"step": 2608 |
|
}, |
|
{ |
|
"epoch": 4.206119162640902, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00010508710488658385, |
|
"loss": 0.2475, |
|
"step": 2612 |
|
}, |
|
{ |
|
"epoch": 4.21256038647343, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00010466933815971884, |
|
"loss": 0.1988, |
|
"step": 2616 |
|
}, |
|
{ |
|
"epoch": 4.219001610305958, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00010425195803902948, |
|
"loss": 0.2137, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 4.225442834138486, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00010383496808417547, |
|
"loss": 0.2564, |
|
"step": 2624 |
|
}, |
|
{ |
|
"epoch": 4.231884057971015, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00010341837185148903, |
|
"loss": 0.2361, |
|
"step": 2628 |
|
}, |
|
{ |
|
"epoch": 4.238325281803543, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00010300217289394443, |
|
"loss": 0.2324, |
|
"step": 2632 |
|
}, |
|
{ |
|
"epoch": 4.244766505636071, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00010258637476112782, |
|
"loss": 0.2175, |
|
"step": 2636 |
|
}, |
|
{ |
|
"epoch": 4.251207729468599, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00010217098099920676, |
|
"loss": 0.2533, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 4.2576489533011275, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00010175599515090026, |
|
"loss": 0.2155, |
|
"step": 2644 |
|
}, |
|
{ |
|
"epoch": 4.264090177133656, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00010134142075544824, |
|
"loss": 0.2299, |
|
"step": 2648 |
|
}, |
|
{ |
|
"epoch": 4.270531400966184, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00010092726134858168, |
|
"loss": 0.2776, |
|
"step": 2652 |
|
}, |
|
{ |
|
"epoch": 4.276972624798712, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00010051352046249213, |
|
"loss": 0.2079, |
|
"step": 2656 |
|
}, |
|
{ |
|
"epoch": 4.28341384863124, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00010010020162580192, |
|
"loss": 0.198, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 4.2898550724637685, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 9.96873083635337e-05, |
|
"loss": 0.223, |
|
"step": 2664 |
|
}, |
|
{ |
|
"epoch": 4.296296296296296, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 9.927484419708076e-05, |
|
"loss": 0.187, |
|
"step": 2668 |
|
}, |
|
{ |
|
"epoch": 4.302737520128824, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 9.88628126441768e-05, |
|
"loss": 0.2339, |
|
"step": 2672 |
|
}, |
|
{ |
|
"epoch": 4.309178743961352, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 9.84512172188657e-05, |
|
"loss": 0.2164, |
|
"step": 2676 |
|
}, |
|
{ |
|
"epoch": 4.3156199677938805, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 9.804006143147212e-05, |
|
"loss": 0.2328, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 4.322061191626409, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 9.762934878857105e-05, |
|
"loss": 0.2577, |
|
"step": 2684 |
|
}, |
|
{ |
|
"epoch": 4.328502415458937, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 9.721908279295812e-05, |
|
"loss": 0.2256, |
|
"step": 2688 |
|
}, |
|
{ |
|
"epoch": 4.334943639291465, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 9.680926694361964e-05, |
|
"loss": 0.2344, |
|
"step": 2692 |
|
}, |
|
{ |
|
"epoch": 4.341384863123993, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 9.639990473570294e-05, |
|
"loss": 0.2238, |
|
"step": 2696 |
|
}, |
|
{ |
|
"epoch": 4.3478260869565215, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 9.599099966048627e-05, |
|
"loss": 0.1847, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 4.35426731078905, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 9.558255520534937e-05, |
|
"loss": 0.2451, |
|
"step": 2704 |
|
}, |
|
{ |
|
"epoch": 4.360708534621578, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 9.517457485374336e-05, |
|
"loss": 0.2112, |
|
"step": 2708 |
|
}, |
|
{ |
|
"epoch": 4.367149758454106, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 9.476706208516138e-05, |
|
"loss": 0.2048, |
|
"step": 2712 |
|
}, |
|
{ |
|
"epoch": 4.373590982286634, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 9.43600203751086e-05, |
|
"loss": 0.2036, |
|
"step": 2716 |
|
}, |
|
{ |
|
"epoch": 4.3800322061191626, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 9.395345319507287e-05, |
|
"loss": 0.2125, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 4.386473429951691, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 9.354736401249486e-05, |
|
"loss": 0.2199, |
|
"step": 2724 |
|
}, |
|
{ |
|
"epoch": 4.392914653784219, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 9.31417562907387e-05, |
|
"loss": 0.2064, |
|
"step": 2728 |
|
}, |
|
{ |
|
"epoch": 4.399355877616747, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 9.273663348906222e-05, |
|
"loss": 0.2183, |
|
"step": 2732 |
|
}, |
|
{ |
|
"epoch": 4.405797101449275, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 9.233199906258766e-05, |
|
"loss": 0.2639, |
|
"step": 2736 |
|
}, |
|
{ |
|
"epoch": 4.412238325281804, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 9.192785646227217e-05, |
|
"loss": 0.251, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 4.418679549114332, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 9.152420913487814e-05, |
|
"loss": 0.2386, |
|
"step": 2744 |
|
}, |
|
{ |
|
"epoch": 4.42512077294686, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 9.112106052294418e-05, |
|
"loss": 0.217, |
|
"step": 2748 |
|
}, |
|
{ |
|
"epoch": 4.431561996779388, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 9.071841406475539e-05, |
|
"loss": 0.2102, |
|
"step": 2752 |
|
}, |
|
{ |
|
"epoch": 4.438003220611916, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 9.03162731943144e-05, |
|
"loss": 0.2282, |
|
"step": 2756 |
|
}, |
|
{ |
|
"epoch": 4.444444444444445, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 8.991464134131166e-05, |
|
"loss": 0.2395, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 4.450885668276973, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 8.951352193109673e-05, |
|
"loss": 0.2379, |
|
"step": 2764 |
|
}, |
|
{ |
|
"epoch": 4.457326892109501, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 8.911291838464838e-05, |
|
"loss": 0.2319, |
|
"step": 2768 |
|
}, |
|
{ |
|
"epoch": 4.463768115942029, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 8.871283411854619e-05, |
|
"loss": 0.2066, |
|
"step": 2772 |
|
}, |
|
{ |
|
"epoch": 4.4702093397745575, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 8.831327254494066e-05, |
|
"loss": 0.2086, |
|
"step": 2776 |
|
}, |
|
{ |
|
"epoch": 4.476650563607086, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 8.791423707152482e-05, |
|
"loss": 0.2454, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 4.483091787439614, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 8.751573110150443e-05, |
|
"loss": 0.2254, |
|
"step": 2784 |
|
}, |
|
{ |
|
"epoch": 4.489533011272142, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 8.711775803356971e-05, |
|
"loss": 0.2446, |
|
"step": 2788 |
|
}, |
|
{ |
|
"epoch": 4.49597423510467, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 8.672032126186566e-05, |
|
"loss": 0.2154, |
|
"step": 2792 |
|
}, |
|
{ |
|
"epoch": 4.5024154589371985, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 8.632342417596365e-05, |
|
"loss": 0.2269, |
|
"step": 2796 |
|
}, |
|
{ |
|
"epoch": 4.508856682769727, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 8.592707016083221e-05, |
|
"loss": 0.2134, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 4.515297906602254, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 8.553126259680828e-05, |
|
"loss": 0.25, |
|
"step": 2804 |
|
}, |
|
{ |
|
"epoch": 4.521739130434782, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 8.513600485956835e-05, |
|
"loss": 0.2262, |
|
"step": 2808 |
|
}, |
|
{ |
|
"epoch": 4.5281803542673105, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 8.474130032009951e-05, |
|
"loss": 0.2062, |
|
"step": 2812 |
|
}, |
|
{ |
|
"epoch": 4.534621578099839, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 8.434715234467123e-05, |
|
"loss": 0.2293, |
|
"step": 2816 |
|
}, |
|
{ |
|
"epoch": 4.541062801932367, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 8.395356429480587e-05, |
|
"loss": 0.204, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 4.547504025764895, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 8.356053952725072e-05, |
|
"loss": 0.2207, |
|
"step": 2824 |
|
}, |
|
{ |
|
"epoch": 4.553945249597423, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 8.316808139394876e-05, |
|
"loss": 0.2231, |
|
"step": 2828 |
|
}, |
|
{ |
|
"epoch": 4.5603864734299515, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 8.277619324201081e-05, |
|
"loss": 0.2321, |
|
"step": 2832 |
|
}, |
|
{ |
|
"epoch": 4.56682769726248, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 8.238487841368617e-05, |
|
"loss": 0.2298, |
|
"step": 2836 |
|
}, |
|
{ |
|
"epoch": 4.573268921095008, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 8.199414024633473e-05, |
|
"loss": 0.1997, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 4.579710144927536, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 8.160398207239805e-05, |
|
"loss": 0.2359, |
|
"step": 2844 |
|
}, |
|
{ |
|
"epoch": 4.586151368760064, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 8.121440721937157e-05, |
|
"loss": 0.216, |
|
"step": 2848 |
|
}, |
|
{ |
|
"epoch": 4.592592592592593, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 8.082541900977542e-05, |
|
"loss": 0.2374, |
|
"step": 2852 |
|
}, |
|
{ |
|
"epoch": 4.599033816425121, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 8.04370207611267e-05, |
|
"loss": 0.2363, |
|
"step": 2856 |
|
}, |
|
{ |
|
"epoch": 4.605475040257649, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 8.004921578591091e-05, |
|
"loss": 0.214, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 4.611916264090177, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 7.966200739155389e-05, |
|
"loss": 0.2214, |
|
"step": 2864 |
|
}, |
|
{ |
|
"epoch": 4.618357487922705, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 7.927539888039339e-05, |
|
"loss": 0.2431, |
|
"step": 2868 |
|
}, |
|
{ |
|
"epoch": 4.624798711755234, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 7.888939354965093e-05, |
|
"loss": 0.2104, |
|
"step": 2872 |
|
}, |
|
{ |
|
"epoch": 4.631239935587762, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 7.850399469140393e-05, |
|
"loss": 0.204, |
|
"step": 2876 |
|
}, |
|
{ |
|
"epoch": 4.63768115942029, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 7.811920559255736e-05, |
|
"loss": 0.2263, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 4.644122383252818, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 7.773502953481585e-05, |
|
"loss": 0.2161, |
|
"step": 2884 |
|
}, |
|
{ |
|
"epoch": 4.650563607085346, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 7.73514697946556e-05, |
|
"loss": 0.2279, |
|
"step": 2888 |
|
}, |
|
{ |
|
"epoch": 4.657004830917875, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 7.696852964329655e-05, |
|
"loss": 0.2615, |
|
"step": 2892 |
|
}, |
|
{ |
|
"epoch": 4.663446054750403, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 7.658621234667443e-05, |
|
"loss": 0.2407, |
|
"step": 2896 |
|
}, |
|
{ |
|
"epoch": 4.669887278582931, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 7.620452116541291e-05, |
|
"loss": 0.2101, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 4.676328502415459, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 7.582345935479569e-05, |
|
"loss": 0.2191, |
|
"step": 2904 |
|
}, |
|
{ |
|
"epoch": 4.6827697262479875, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 7.544303016473894e-05, |
|
"loss": 0.2159, |
|
"step": 2908 |
|
}, |
|
{ |
|
"epoch": 4.689210950080515, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 7.506323683976344e-05, |
|
"loss": 0.2251, |
|
"step": 2912 |
|
}, |
|
{ |
|
"epoch": 4.695652173913043, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 7.468408261896701e-05, |
|
"loss": 0.1935, |
|
"step": 2916 |
|
}, |
|
{ |
|
"epoch": 4.702093397745571, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 7.430557073599662e-05, |
|
"loss": 0.2123, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 4.708534621578099, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 7.392770441902116e-05, |
|
"loss": 0.2466, |
|
"step": 2924 |
|
}, |
|
{ |
|
"epoch": 4.714975845410628, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 7.355048689070389e-05, |
|
"loss": 0.2332, |
|
"step": 2928 |
|
}, |
|
{ |
|
"epoch": 4.721417069243156, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 7.317392136817453e-05, |
|
"loss": 0.2364, |
|
"step": 2932 |
|
}, |
|
{ |
|
"epoch": 4.727858293075684, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 7.279801106300231e-05, |
|
"loss": 0.2662, |
|
"step": 2936 |
|
}, |
|
{ |
|
"epoch": 4.734299516908212, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 7.242275918116832e-05, |
|
"loss": 0.2174, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 4.7407407407407405, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 7.204816892303833e-05, |
|
"loss": 0.2135, |
|
"step": 2944 |
|
}, |
|
{ |
|
"epoch": 4.747181964573269, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 7.16742434833352e-05, |
|
"loss": 0.231, |
|
"step": 2948 |
|
}, |
|
{ |
|
"epoch": 4.753623188405797, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 7.1300986051112e-05, |
|
"loss": 0.2569, |
|
"step": 2952 |
|
}, |
|
{ |
|
"epoch": 4.760064412238325, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 7.09283998097246e-05, |
|
"loss": 0.2072, |
|
"step": 2956 |
|
}, |
|
{ |
|
"epoch": 4.766505636070853, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 7.055648793680466e-05, |
|
"loss": 0.2059, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 4.7729468599033815, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 7.018525360423217e-05, |
|
"loss": 0.2429, |
|
"step": 2964 |
|
}, |
|
{ |
|
"epoch": 4.77938808373591, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 6.981469997810892e-05, |
|
"loss": 0.2203, |
|
"step": 2968 |
|
}, |
|
{ |
|
"epoch": 4.785829307568438, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 6.944483021873115e-05, |
|
"loss": 0.232, |
|
"step": 2972 |
|
}, |
|
{ |
|
"epoch": 4.792270531400966, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 6.907564748056273e-05, |
|
"loss": 0.2124, |
|
"step": 2976 |
|
}, |
|
{ |
|
"epoch": 4.798711755233494, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 6.870715491220808e-05, |
|
"loss": 0.2184, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 4.805152979066023, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 6.833935565638559e-05, |
|
"loss": 0.238, |
|
"step": 2984 |
|
}, |
|
{ |
|
"epoch": 4.811594202898551, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 6.797225284990064e-05, |
|
"loss": 0.2283, |
|
"step": 2988 |
|
}, |
|
{ |
|
"epoch": 4.818035426731079, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 6.760584962361888e-05, |
|
"loss": 0.2351, |
|
"step": 2992 |
|
}, |
|
{ |
|
"epoch": 4.824476650563607, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 6.72401491024396e-05, |
|
"loss": 0.2019, |
|
"step": 2996 |
|
}, |
|
{ |
|
"epoch": 4.830917874396135, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 6.687515440526882e-05, |
|
"loss": 0.242, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 4.837359098228664, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 6.651086864499305e-05, |
|
"loss": 0.2196, |
|
"step": 3004 |
|
}, |
|
{ |
|
"epoch": 4.843800322061192, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 6.614729492845258e-05, |
|
"loss": 0.2146, |
|
"step": 3008 |
|
}, |
|
{ |
|
"epoch": 4.85024154589372, |
|
"grad_norm": 0.375, |
|
"learning_rate": 6.578443635641497e-05, |
|
"loss": 0.2232, |
|
"step": 3012 |
|
}, |
|
{ |
|
"epoch": 4.856682769726248, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 6.542229602354847e-05, |
|
"loss": 0.2319, |
|
"step": 3016 |
|
}, |
|
{ |
|
"epoch": 4.8631239935587764, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 6.506087701839593e-05, |
|
"loss": 0.2156, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 4.869565217391305, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 6.470018242334825e-05, |
|
"loss": 0.2372, |
|
"step": 3024 |
|
}, |
|
{ |
|
"epoch": 4.876006441223833, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 6.434021531461818e-05, |
|
"loss": 0.2077, |
|
"step": 3028 |
|
}, |
|
{ |
|
"epoch": 4.882447665056361, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 6.398097876221385e-05, |
|
"loss": 0.2183, |
|
"step": 3032 |
|
}, |
|
{ |
|
"epoch": 4.888888888888889, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 6.362247582991317e-05, |
|
"loss": 0.2104, |
|
"step": 3036 |
|
}, |
|
{ |
|
"epoch": 4.8953301127214175, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 6.326470957523686e-05, |
|
"loss": 0.2048, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 4.901771336553946, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 6.29076830494232e-05, |
|
"loss": 0.2346, |
|
"step": 3044 |
|
}, |
|
{ |
|
"epoch": 4.908212560386474, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 6.255139929740129e-05, |
|
"loss": 0.2068, |
|
"step": 3048 |
|
}, |
|
{ |
|
"epoch": 4.914653784219001, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 6.219586135776575e-05, |
|
"loss": 0.239, |
|
"step": 3052 |
|
}, |
|
{ |
|
"epoch": 4.921095008051529, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 6.184107226275038e-05, |
|
"loss": 0.1814, |
|
"step": 3056 |
|
}, |
|
{ |
|
"epoch": 4.927536231884058, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 6.148703503820224e-05, |
|
"loss": 0.2272, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 4.933977455716586, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 6.113375270355617e-05, |
|
"loss": 0.2418, |
|
"step": 3064 |
|
}, |
|
{ |
|
"epoch": 4.940418679549114, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 6.078122827180879e-05, |
|
"loss": 0.2723, |
|
"step": 3068 |
|
}, |
|
{ |
|
"epoch": 4.946859903381642, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 6.042946474949302e-05, |
|
"loss": 0.2407, |
|
"step": 3072 |
|
}, |
|
{ |
|
"epoch": 4.9533011272141705, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 6.007846513665207e-05, |
|
"loss": 0.2153, |
|
"step": 3076 |
|
}, |
|
{ |
|
"epoch": 4.959742351046699, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 5.972823242681426e-05, |
|
"loss": 0.2206, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 4.966183574879227, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 5.937876960696727e-05, |
|
"loss": 0.2105, |
|
"step": 3084 |
|
}, |
|
{ |
|
"epoch": 4.972624798711755, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 5.903007965753279e-05, |
|
"loss": 0.2526, |
|
"step": 3088 |
|
}, |
|
{ |
|
"epoch": 4.979066022544283, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 5.868216555234081e-05, |
|
"loss": 0.2168, |
|
"step": 3092 |
|
}, |
|
{ |
|
"epoch": 4.9855072463768115, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 5.833503025860469e-05, |
|
"loss": 0.2174, |
|
"step": 3096 |
|
}, |
|
{ |
|
"epoch": 4.99194847020934, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 5.798867673689553e-05, |
|
"loss": 0.2365, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 4.998389694041868, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 5.764310794111711e-05, |
|
"loss": 0.1766, |
|
"step": 3104 |
|
}, |
|
{ |
|
"epoch": 5.004830917874396, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 5.7298326818480427e-05, |
|
"loss": 0.1998, |
|
"step": 3108 |
|
}, |
|
{ |
|
"epoch": 5.011272141706924, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 5.695433630947894e-05, |
|
"loss": 0.1838, |
|
"step": 3112 |
|
}, |
|
{ |
|
"epoch": 5.017713365539453, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 5.661113934786321e-05, |
|
"loss": 0.2045, |
|
"step": 3116 |
|
}, |
|
{ |
|
"epoch": 5.024154589371981, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 5.626873886061597e-05, |
|
"loss": 0.1917, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 5.030595813204509, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 5.592713776792723e-05, |
|
"loss": 0.204, |
|
"step": 3124 |
|
}, |
|
{ |
|
"epoch": 5.037037037037037, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 5.5586338983169076e-05, |
|
"loss": 0.1471, |
|
"step": 3128 |
|
}, |
|
{ |
|
"epoch": 5.043478260869565, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 5.52463454128714e-05, |
|
"loss": 0.1966, |
|
"step": 3132 |
|
}, |
|
{ |
|
"epoch": 5.049919484702094, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 5.490715995669641e-05, |
|
"loss": 0.1782, |
|
"step": 3136 |
|
}, |
|
{ |
|
"epoch": 5.056360708534622, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 5.456878550741453e-05, |
|
"loss": 0.1877, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 5.06280193236715, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 5.423122495087915e-05, |
|
"loss": 0.1643, |
|
"step": 3144 |
|
}, |
|
{ |
|
"epoch": 5.069243156199678, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 5.3894481166002674e-05, |
|
"loss": 0.1792, |
|
"step": 3148 |
|
}, |
|
{ |
|
"epoch": 5.0756843800322065, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 5.355855702473125e-05, |
|
"loss": 0.1567, |
|
"step": 3152 |
|
}, |
|
{ |
|
"epoch": 5.082125603864735, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 5.322345539202086e-05, |
|
"loss": 0.2051, |
|
"step": 3156 |
|
}, |
|
{ |
|
"epoch": 5.088566827697263, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 5.288917912581257e-05, |
|
"loss": 0.1754, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 5.095008051529791, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 5.255573107700832e-05, |
|
"loss": 0.1824, |
|
"step": 3164 |
|
}, |
|
{ |
|
"epoch": 5.101449275362318, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 5.222311408944635e-05, |
|
"loss": 0.2092, |
|
"step": 3168 |
|
}, |
|
{ |
|
"epoch": 5.107890499194847, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 5.189133099987731e-05, |
|
"loss": 0.146, |
|
"step": 3172 |
|
}, |
|
{ |
|
"epoch": 5.114331723027375, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 5.156038463793981e-05, |
|
"loss": 0.1692, |
|
"step": 3176 |
|
}, |
|
{ |
|
"epoch": 5.120772946859903, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 5.123027782613636e-05, |
|
"loss": 0.1877, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 5.127214170692431, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 5.09010133798094e-05, |
|
"loss": 0.154, |
|
"step": 3184 |
|
}, |
|
{ |
|
"epoch": 5.1336553945249594, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 5.0572594107116974e-05, |
|
"loss": 0.1559, |
|
"step": 3188 |
|
}, |
|
{ |
|
"epoch": 5.140096618357488, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 5.0245022809009155e-05, |
|
"loss": 0.171, |
|
"step": 3192 |
|
}, |
|
{ |
|
"epoch": 5.146537842190016, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 4.991830227920398e-05, |
|
"loss": 0.1774, |
|
"step": 3196 |
|
}, |
|
{ |
|
"epoch": 5.152979066022544, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 4.9592435304163675e-05, |
|
"loss": 0.1813, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 5.159420289855072, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 4.926742466307069e-05, |
|
"loss": 0.1557, |
|
"step": 3204 |
|
}, |
|
{ |
|
"epoch": 5.1658615136876005, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 4.8943273127804345e-05, |
|
"loss": 0.1574, |
|
"step": 3208 |
|
}, |
|
{ |
|
"epoch": 5.172302737520129, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 4.8619983462916935e-05, |
|
"loss": 0.1548, |
|
"step": 3212 |
|
}, |
|
{ |
|
"epoch": 5.178743961352657, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 4.829755842561025e-05, |
|
"loss": 0.1888, |
|
"step": 3216 |
|
}, |
|
{ |
|
"epoch": 5.185185185185185, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 4.797600076571194e-05, |
|
"loss": 0.2004, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 5.191626409017713, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 4.7655313225652294e-05, |
|
"loss": 0.1587, |
|
"step": 3224 |
|
}, |
|
{ |
|
"epoch": 5.1980676328502415, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 4.7335498540440606e-05, |
|
"loss": 0.1669, |
|
"step": 3228 |
|
}, |
|
{ |
|
"epoch": 5.20450885668277, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 4.7016559437642084e-05, |
|
"loss": 0.171, |
|
"step": 3232 |
|
}, |
|
{ |
|
"epoch": 5.210950080515298, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 4.6698498637354225e-05, |
|
"loss": 0.1566, |
|
"step": 3236 |
|
}, |
|
{ |
|
"epoch": 5.217391304347826, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 4.6381318852184194e-05, |
|
"loss": 0.1936, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 5.223832528180354, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 4.606502278722503e-05, |
|
"loss": 0.1897, |
|
"step": 3244 |
|
}, |
|
{ |
|
"epoch": 5.230273752012883, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 4.574961314003304e-05, |
|
"loss": 0.1935, |
|
"step": 3248 |
|
}, |
|
{ |
|
"epoch": 5.236714975845411, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 4.5435092600604676e-05, |
|
"loss": 0.159, |
|
"step": 3252 |
|
}, |
|
{ |
|
"epoch": 5.243156199677939, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 4.5121463851353476e-05, |
|
"loss": 0.2065, |
|
"step": 3256 |
|
}, |
|
{ |
|
"epoch": 5.249597423510467, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 4.48087295670874e-05, |
|
"loss": 0.1993, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 5.256038647342995, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 4.449689241498569e-05, |
|
"loss": 0.1717, |
|
"step": 3264 |
|
}, |
|
{ |
|
"epoch": 5.262479871175524, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 4.41859550545765e-05, |
|
"loss": 0.1907, |
|
"step": 3268 |
|
}, |
|
{ |
|
"epoch": 5.268921095008052, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 4.387592013771396e-05, |
|
"loss": 0.1691, |
|
"step": 3272 |
|
}, |
|
{ |
|
"epoch": 5.27536231884058, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 4.356679030855573e-05, |
|
"loss": 0.198, |
|
"step": 3276 |
|
}, |
|
{ |
|
"epoch": 5.281803542673108, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 4.32585682035402e-05, |
|
"loss": 0.2026, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 5.2882447665056365, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 4.2951256451364264e-05, |
|
"loss": 0.1975, |
|
"step": 3284 |
|
}, |
|
{ |
|
"epoch": 5.294685990338165, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 4.264485767296081e-05, |
|
"loss": 0.1686, |
|
"step": 3288 |
|
}, |
|
{ |
|
"epoch": 5.301127214170693, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 4.233937448147635e-05, |
|
"loss": 0.1583, |
|
"step": 3292 |
|
}, |
|
{ |
|
"epoch": 5.30756843800322, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 4.203480948224866e-05, |
|
"loss": 0.1777, |
|
"step": 3296 |
|
}, |
|
{ |
|
"epoch": 5.314009661835748, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 4.173116527278471e-05, |
|
"loss": 0.1616, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 5.320450885668277, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 4.142844444273845e-05, |
|
"loss": 0.1731, |
|
"step": 3304 |
|
}, |
|
{ |
|
"epoch": 5.326892109500805, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 4.1126649573888696e-05, |
|
"loss": 0.2219, |
|
"step": 3308 |
|
}, |
|
{ |
|
"epoch": 5.333333333333333, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 4.082578324011716e-05, |
|
"loss": 0.1937, |
|
"step": 3312 |
|
}, |
|
{ |
|
"epoch": 5.339774557165861, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 4.052584800738636e-05, |
|
"loss": 0.1891, |
|
"step": 3316 |
|
}, |
|
{ |
|
"epoch": 5.3462157809983895, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 4.0226846433717954e-05, |
|
"loss": 0.1811, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 5.352657004830918, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 3.992878106917079e-05, |
|
"loss": 0.1768, |
|
"step": 3324 |
|
}, |
|
{ |
|
"epoch": 5.359098228663446, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 3.963165445581922e-05, |
|
"loss": 0.1985, |
|
"step": 3328 |
|
}, |
|
{ |
|
"epoch": 5.365539452495974, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 3.933546912773119e-05, |
|
"loss": 0.154, |
|
"step": 3332 |
|
}, |
|
{ |
|
"epoch": 5.371980676328502, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 3.904022761094715e-05, |
|
"loss": 0.1788, |
|
"step": 3336 |
|
}, |
|
{ |
|
"epoch": 5.3784219001610305, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 3.874593242345785e-05, |
|
"loss": 0.1964, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 5.384863123993559, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 3.845258607518344e-05, |
|
"loss": 0.1822, |
|
"step": 3344 |
|
}, |
|
{ |
|
"epoch": 5.391304347826087, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 3.816019106795157e-05, |
|
"loss": 0.1711, |
|
"step": 3348 |
|
}, |
|
{ |
|
"epoch": 5.397745571658615, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 3.7868749895476624e-05, |
|
"loss": 0.1785, |
|
"step": 3352 |
|
}, |
|
{ |
|
"epoch": 5.404186795491143, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 3.7578265043337834e-05, |
|
"loss": 0.1891, |
|
"step": 3356 |
|
}, |
|
{ |
|
"epoch": 5.4106280193236715, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 3.72887389889586e-05, |
|
"loss": 0.1766, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 5.4170692431562, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 3.700017420158486e-05, |
|
"loss": 0.1733, |
|
"step": 3364 |
|
}, |
|
{ |
|
"epoch": 5.423510466988728, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 3.671257314226471e-05, |
|
"loss": 0.1895, |
|
"step": 3368 |
|
}, |
|
{ |
|
"epoch": 5.429951690821256, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 3.642593826382663e-05, |
|
"loss": 0.1867, |
|
"step": 3372 |
|
}, |
|
{ |
|
"epoch": 5.436392914653784, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 3.6140272010859166e-05, |
|
"loss": 0.1946, |
|
"step": 3376 |
|
}, |
|
{ |
|
"epoch": 5.442834138486313, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 3.585557681968979e-05, |
|
"loss": 0.1684, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 5.449275362318841, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 3.5571855118364236e-05, |
|
"loss": 0.1886, |
|
"step": 3384 |
|
}, |
|
{ |
|
"epoch": 5.455716586151369, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 3.528910932662577e-05, |
|
"loss": 0.199, |
|
"step": 3388 |
|
}, |
|
{ |
|
"epoch": 5.462157809983897, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 3.5007341855894394e-05, |
|
"loss": 0.1877, |
|
"step": 3392 |
|
}, |
|
{ |
|
"epoch": 5.468599033816425, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 3.472655510924656e-05, |
|
"loss": 0.1856, |
|
"step": 3396 |
|
}, |
|
{ |
|
"epoch": 5.475040257648954, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 3.4446751481394516e-05, |
|
"loss": 0.1818, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 5.481481481481482, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 3.4167933358665936e-05, |
|
"loss": 0.2009, |
|
"step": 3404 |
|
}, |
|
{ |
|
"epoch": 5.48792270531401, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 3.3890103118983366e-05, |
|
"loss": 0.1824, |
|
"step": 3408 |
|
}, |
|
{ |
|
"epoch": 5.494363929146537, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 3.3613263131844294e-05, |
|
"loss": 0.1746, |
|
"step": 3412 |
|
}, |
|
{ |
|
"epoch": 5.500805152979066, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 3.333741575830069e-05, |
|
"loss": 0.1769, |
|
"step": 3416 |
|
}, |
|
{ |
|
"epoch": 5.507246376811594, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 3.306256335093898e-05, |
|
"loss": 0.1822, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 5.513687600644122, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 3.278870825385983e-05, |
|
"loss": 0.1925, |
|
"step": 3424 |
|
}, |
|
{ |
|
"epoch": 5.52012882447665, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 3.251585280265839e-05, |
|
"loss": 0.1923, |
|
"step": 3428 |
|
}, |
|
{ |
|
"epoch": 5.526570048309178, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 3.224399932440419e-05, |
|
"loss": 0.1815, |
|
"step": 3432 |
|
}, |
|
{ |
|
"epoch": 5.533011272141707, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 3.1973150137621364e-05, |
|
"loss": 0.1738, |
|
"step": 3436 |
|
}, |
|
{ |
|
"epoch": 5.539452495974235, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 3.170330755226893e-05, |
|
"loss": 0.191, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 5.545893719806763, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 3.1434473869720804e-05, |
|
"loss": 0.1538, |
|
"step": 3444 |
|
}, |
|
{ |
|
"epoch": 5.552334943639291, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 3.116665138274676e-05, |
|
"loss": 0.1748, |
|
"step": 3448 |
|
}, |
|
{ |
|
"epoch": 5.5587761674718195, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 3.0899842375492145e-05, |
|
"loss": 0.1893, |
|
"step": 3452 |
|
}, |
|
{ |
|
"epoch": 5.565217391304348, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 3.063404912345897e-05, |
|
"loss": 0.1727, |
|
"step": 3456 |
|
}, |
|
{ |
|
"epoch": 5.571658615136876, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 3.036927389348625e-05, |
|
"loss": 0.1804, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 5.578099838969404, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 3.010551894373075e-05, |
|
"loss": 0.1778, |
|
"step": 3464 |
|
}, |
|
{ |
|
"epoch": 5.584541062801932, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 2.9842786523647582e-05, |
|
"loss": 0.1679, |
|
"step": 3468 |
|
}, |
|
{ |
|
"epoch": 5.5909822866344605, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 2.9581078873971248e-05, |
|
"loss": 0.1812, |
|
"step": 3472 |
|
}, |
|
{ |
|
"epoch": 5.597423510466989, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 2.9320398226696367e-05, |
|
"loss": 0.188, |
|
"step": 3476 |
|
}, |
|
{ |
|
"epoch": 5.603864734299517, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 2.9060746805058738e-05, |
|
"loss": 0.1541, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 5.610305958132045, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 2.8802126823516193e-05, |
|
"loss": 0.1671, |
|
"step": 3484 |
|
}, |
|
{ |
|
"epoch": 5.616747181964573, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 2.8544540487729984e-05, |
|
"loss": 0.1609, |
|
"step": 3488 |
|
}, |
|
{ |
|
"epoch": 5.6231884057971016, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 2.828798999454577e-05, |
|
"loss": 0.1488, |
|
"step": 3492 |
|
}, |
|
{ |
|
"epoch": 5.62962962962963, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 2.8032477531974984e-05, |
|
"loss": 0.2012, |
|
"step": 3496 |
|
}, |
|
{ |
|
"epoch": 5.636070853462158, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 2.7778005279176053e-05, |
|
"loss": 0.208, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 5.642512077294686, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 2.7524575406435955e-05, |
|
"loss": 0.192, |
|
"step": 3504 |
|
}, |
|
{ |
|
"epoch": 5.648953301127214, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 2.7272190075151655e-05, |
|
"loss": 0.1582, |
|
"step": 3508 |
|
}, |
|
{ |
|
"epoch": 5.655394524959743, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 2.7020851437811608e-05, |
|
"loss": 0.1762, |
|
"step": 3512 |
|
}, |
|
{ |
|
"epoch": 5.661835748792271, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 2.6770561637977556e-05, |
|
"loss": 0.1678, |
|
"step": 3516 |
|
}, |
|
{ |
|
"epoch": 5.668276972624799, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 2.652132281026598e-05, |
|
"loss": 0.1822, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 5.674718196457327, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 2.6273137080330225e-05, |
|
"loss": 0.183, |
|
"step": 3524 |
|
}, |
|
{ |
|
"epoch": 5.681159420289855, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 2.6026006564842106e-05, |
|
"loss": 0.2009, |
|
"step": 3528 |
|
}, |
|
{ |
|
"epoch": 5.687600644122384, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 2.577993337147406e-05, |
|
"loss": 0.1858, |
|
"step": 3532 |
|
}, |
|
{ |
|
"epoch": 5.694041867954912, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 2.5534919598880887e-05, |
|
"loss": 0.203, |
|
"step": 3536 |
|
}, |
|
{ |
|
"epoch": 5.70048309178744, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 2.5290967336682266e-05, |
|
"loss": 0.1588, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 5.706924315619968, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 2.5048078665444497e-05, |
|
"loss": 0.1622, |
|
"step": 3544 |
|
}, |
|
{ |
|
"epoch": 5.713365539452496, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 2.4806255656663092e-05, |
|
"loss": 0.185, |
|
"step": 3548 |
|
}, |
|
{ |
|
"epoch": 5.719806763285024, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 2.4565500372744845e-05, |
|
"loss": 0.1904, |
|
"step": 3552 |
|
}, |
|
{ |
|
"epoch": 5.726247987117552, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 2.4325814866990583e-05, |
|
"loss": 0.175, |
|
"step": 3556 |
|
}, |
|
{ |
|
"epoch": 5.73268921095008, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 2.4087201183577205e-05, |
|
"loss": 0.1699, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 5.739130434782608, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 2.384966135754063e-05, |
|
"loss": 0.1823, |
|
"step": 3564 |
|
}, |
|
{ |
|
"epoch": 5.745571658615137, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 2.3613197414758273e-05, |
|
"loss": 0.1788, |
|
"step": 3568 |
|
}, |
|
{ |
|
"epoch": 5.752012882447665, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 2.3377811371931793e-05, |
|
"loss": 0.1794, |
|
"step": 3572 |
|
}, |
|
{ |
|
"epoch": 5.758454106280193, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 2.3143505236569915e-05, |
|
"loss": 0.1684, |
|
"step": 3576 |
|
}, |
|
{ |
|
"epoch": 5.764895330112721, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 2.2910281006971164e-05, |
|
"loss": 0.157, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 5.7713365539452495, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 2.26781406722071e-05, |
|
"loss": 0.1833, |
|
"step": 3584 |
|
}, |
|
{ |
|
"epoch": 5.777777777777778, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 2.2447086212105143e-05, |
|
"loss": 0.1945, |
|
"step": 3588 |
|
}, |
|
{ |
|
"epoch": 5.784219001610306, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 2.2217119597231747e-05, |
|
"loss": 0.1801, |
|
"step": 3592 |
|
}, |
|
{ |
|
"epoch": 5.790660225442834, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 2.1988242788875532e-05, |
|
"loss": 0.1735, |
|
"step": 3596 |
|
}, |
|
{ |
|
"epoch": 5.797101449275362, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 2.1760457739030695e-05, |
|
"loss": 0.1755, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 5.8035426731078905, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 2.1533766390380254e-05, |
|
"loss": 0.1674, |
|
"step": 3604 |
|
}, |
|
{ |
|
"epoch": 5.809983896940419, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 2.1308170676279547e-05, |
|
"loss": 0.138, |
|
"step": 3608 |
|
}, |
|
{ |
|
"epoch": 5.816425120772947, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 2.108367252073961e-05, |
|
"loss": 0.1521, |
|
"step": 3612 |
|
}, |
|
{ |
|
"epoch": 5.822866344605475, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 2.0860273838410928e-05, |
|
"loss": 0.1771, |
|
"step": 3616 |
|
}, |
|
{ |
|
"epoch": 5.829307568438003, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 2.0637976534567046e-05, |
|
"loss": 0.1628, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 5.835748792270532, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 2.0416782505088347e-05, |
|
"loss": 0.1631, |
|
"step": 3624 |
|
}, |
|
{ |
|
"epoch": 5.84219001610306, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 2.0196693636445727e-05, |
|
"loss": 0.1731, |
|
"step": 3628 |
|
}, |
|
{ |
|
"epoch": 5.848631239935588, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 1.9977711805684706e-05, |
|
"loss": 0.1748, |
|
"step": 3632 |
|
}, |
|
{ |
|
"epoch": 5.855072463768116, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 1.975983888040945e-05, |
|
"loss": 0.1872, |
|
"step": 3636 |
|
}, |
|
{ |
|
"epoch": 5.861513687600644, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 1.9543076718766538e-05, |
|
"loss": 0.1883, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 5.867954911433173, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 1.932742716942946e-05, |
|
"loss": 0.1543, |
|
"step": 3644 |
|
}, |
|
{ |
|
"epoch": 5.874396135265701, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 1.911289207158254e-05, |
|
"loss": 0.1807, |
|
"step": 3648 |
|
}, |
|
{ |
|
"epoch": 5.880837359098229, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 1.8899473254905672e-05, |
|
"loss": 0.1775, |
|
"step": 3652 |
|
}, |
|
{ |
|
"epoch": 5.887278582930757, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 1.8687172539558208e-05, |
|
"loss": 0.1767, |
|
"step": 3656 |
|
}, |
|
{ |
|
"epoch": 5.8937198067632846, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 1.8475991736163835e-05, |
|
"loss": 0.1662, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 5.900161030595813, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 1.8265932645794827e-05, |
|
"loss": 0.1575, |
|
"step": 3664 |
|
}, |
|
{ |
|
"epoch": 5.906602254428341, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 1.805699705995708e-05, |
|
"loss": 0.1778, |
|
"step": 3668 |
|
}, |
|
{ |
|
"epoch": 5.913043478260869, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 1.7849186760574346e-05, |
|
"loss": 0.1661, |
|
"step": 3672 |
|
}, |
|
{ |
|
"epoch": 5.919484702093397, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 1.7642503519973432e-05, |
|
"loss": 0.1603, |
|
"step": 3676 |
|
}, |
|
{ |
|
"epoch": 5.925925925925926, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 1.7436949100868864e-05, |
|
"loss": 0.1603, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 5.932367149758454, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 1.7232525256348013e-05, |
|
"loss": 0.1907, |
|
"step": 3684 |
|
}, |
|
{ |
|
"epoch": 5.938808373590982, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 1.7029233729855883e-05, |
|
"loss": 0.1848, |
|
"step": 3688 |
|
}, |
|
{ |
|
"epoch": 5.94524959742351, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 1.6827076255180593e-05, |
|
"loss": 0.1719, |
|
"step": 3692 |
|
}, |
|
{ |
|
"epoch": 5.951690821256038, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 1.6626054556438322e-05, |
|
"loss": 0.1819, |
|
"step": 3696 |
|
}, |
|
{ |
|
"epoch": 5.958132045088567, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 1.6426170348058703e-05, |
|
"loss": 0.1669, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 5.964573268921095, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 1.6227425334770245e-05, |
|
"loss": 0.169, |
|
"step": 3704 |
|
}, |
|
{ |
|
"epoch": 5.971014492753623, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 1.6029821211585592e-05, |
|
"loss": 0.1723, |
|
"step": 3708 |
|
}, |
|
{ |
|
"epoch": 5.977455716586151, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 1.5833359663787392e-05, |
|
"loss": 0.2008, |
|
"step": 3712 |
|
}, |
|
{ |
|
"epoch": 5.9838969404186795, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 1.563804236691364e-05, |
|
"loss": 0.1523, |
|
"step": 3716 |
|
}, |
|
{ |
|
"epoch": 5.990338164251208, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 1.5443870986743562e-05, |
|
"loss": 0.1592, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 5.996779388083736, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 1.5250847179283243e-05, |
|
"loss": 0.2154, |
|
"step": 3724 |
|
}, |
|
{ |
|
"epoch": 6.003220611916264, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 1.505897259075171e-05, |
|
"loss": 0.1917, |
|
"step": 3728 |
|
}, |
|
{ |
|
"epoch": 6.009661835748792, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 1.4868248857566734e-05, |
|
"loss": 0.1512, |
|
"step": 3732 |
|
}, |
|
{ |
|
"epoch": 6.0161030595813205, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 1.4678677606330964e-05, |
|
"loss": 0.1889, |
|
"step": 3736 |
|
}, |
|
{ |
|
"epoch": 6.022544283413849, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 1.4490260453817898e-05, |
|
"loss": 0.1694, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 6.028985507246377, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 1.4302999006958342e-05, |
|
"loss": 0.1365, |
|
"step": 3744 |
|
}, |
|
{ |
|
"epoch": 6.035426731078905, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 1.411689486282654e-05, |
|
"loss": 0.148, |
|
"step": 3748 |
|
}, |
|
{ |
|
"epoch": 6.041867954911433, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 1.393194960862657e-05, |
|
"loss": 0.1744, |
|
"step": 3752 |
|
}, |
|
{ |
|
"epoch": 6.048309178743962, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 1.3748164821678759e-05, |
|
"loss": 0.1642, |
|
"step": 3756 |
|
}, |
|
{ |
|
"epoch": 6.05475040257649, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 1.3565542069406433e-05, |
|
"loss": 0.1826, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 6.061191626409018, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 1.3384082909322375e-05, |
|
"loss": 0.1911, |
|
"step": 3764 |
|
}, |
|
{ |
|
"epoch": 6.067632850241546, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 1.320378888901546e-05, |
|
"loss": 0.134, |
|
"step": 3768 |
|
}, |
|
{ |
|
"epoch": 6.074074074074074, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 1.3024661546137694e-05, |
|
"loss": 0.1778, |
|
"step": 3772 |
|
}, |
|
{ |
|
"epoch": 6.080515297906603, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 1.2846702408390975e-05, |
|
"loss": 0.1542, |
|
"step": 3776 |
|
}, |
|
{ |
|
"epoch": 6.086956521739131, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 1.2669912993514036e-05, |
|
"loss": 0.185, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 6.093397745571659, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 1.2494294809269512e-05, |
|
"loss": 0.1937, |
|
"step": 3784 |
|
}, |
|
{ |
|
"epoch": 6.099838969404187, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 1.2319849353431154e-05, |
|
"loss": 0.1642, |
|
"step": 3788 |
|
}, |
|
{ |
|
"epoch": 6.106280193236715, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 1.2146578113771005e-05, |
|
"loss": 0.156, |
|
"step": 3792 |
|
}, |
|
{ |
|
"epoch": 6.112721417069243, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 1.1974482568046694e-05, |
|
"loss": 0.172, |
|
"step": 3796 |
|
}, |
|
{ |
|
"epoch": 6.119162640901771, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 1.1803564183988812e-05, |
|
"loss": 0.1655, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 6.125603864734299, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 1.1633824419288474e-05, |
|
"loss": 0.1741, |
|
"step": 3804 |
|
}, |
|
{ |
|
"epoch": 6.132045088566827, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 1.146526472158487e-05, |
|
"loss": 0.1805, |
|
"step": 3808 |
|
}, |
|
{ |
|
"epoch": 6.138486312399356, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 1.1297886528452882e-05, |
|
"loss": 0.1617, |
|
"step": 3812 |
|
}, |
|
{ |
|
"epoch": 6.144927536231884, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 1.1131691267390757e-05, |
|
"loss": 0.1863, |
|
"step": 3816 |
|
}, |
|
{ |
|
"epoch": 6.151368760064412, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 1.0966680355808122e-05, |
|
"loss": 0.2013, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 6.15780998389694, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 1.080285520101371e-05, |
|
"loss": 0.1683, |
|
"step": 3824 |
|
}, |
|
{ |
|
"epoch": 6.164251207729468, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 1.0640217200203466e-05, |
|
"loss": 0.1729, |
|
"step": 3828 |
|
}, |
|
{ |
|
"epoch": 6.170692431561997, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 1.047876774044863e-05, |
|
"loss": 0.1736, |
|
"step": 3832 |
|
}, |
|
{ |
|
"epoch": 6.177133655394525, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 1.0318508198683734e-05, |
|
"loss": 0.1757, |
|
"step": 3836 |
|
}, |
|
{ |
|
"epoch": 6.183574879227053, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 1.015943994169523e-05, |
|
"loss": 0.1824, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 6.190016103059581, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 1.0001564326109363e-05, |
|
"loss": 0.1536, |
|
"step": 3844 |
|
}, |
|
{ |
|
"epoch": 6.1964573268921095, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 9.844882698381013e-06, |
|
"loss": 0.2085, |
|
"step": 3848 |
|
}, |
|
{ |
|
"epoch": 6.202898550724638, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 9.689396394781923e-06, |
|
"loss": 0.1665, |
|
"step": 3852 |
|
}, |
|
{ |
|
"epoch": 6.209339774557166, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 9.535106741389542e-06, |
|
"loss": 0.1714, |
|
"step": 3856 |
|
}, |
|
{ |
|
"epoch": 6.215780998389694, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 9.382015054075465e-06, |
|
"loss": 0.1639, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 6.222222222222222, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 9.230122638494408e-06, |
|
"loss": 0.1391, |
|
"step": 3864 |
|
}, |
|
{ |
|
"epoch": 6.2286634460547505, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 9.079430790072972e-06, |
|
"loss": 0.1514, |
|
"step": 3868 |
|
}, |
|
{ |
|
"epoch": 6.235104669887279, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 8.92994079399868e-06, |
|
"loss": 0.1324, |
|
"step": 3872 |
|
}, |
|
{ |
|
"epoch": 6.241545893719807, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 8.781653925208887e-06, |
|
"loss": 0.1629, |
|
"step": 3876 |
|
}, |
|
{ |
|
"epoch": 6.247987117552335, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 8.634571448380056e-06, |
|
"loss": 0.1899, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 6.254428341384863, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 8.488694617916785e-06, |
|
"loss": 0.1696, |
|
"step": 3884 |
|
}, |
|
{ |
|
"epoch": 6.260869565217392, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 8.344024677941346e-06, |
|
"loss": 0.1652, |
|
"step": 3888 |
|
}, |
|
{ |
|
"epoch": 6.26731078904992, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 8.200562862282912e-06, |
|
"loss": 0.1642, |
|
"step": 3892 |
|
}, |
|
{ |
|
"epoch": 6.273752012882448, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 8.058310394466994e-06, |
|
"loss": 0.1458, |
|
"step": 3896 |
|
}, |
|
{ |
|
"epoch": 6.280193236714976, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 7.917268487705175e-06, |
|
"loss": 0.1519, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 6.286634460547504, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 7.777438344884645e-06, |
|
"loss": 0.1745, |
|
"step": 3904 |
|
}, |
|
{ |
|
"epoch": 6.293075684380033, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 7.638821158557962e-06, |
|
"loss": 0.1696, |
|
"step": 3908 |
|
}, |
|
{ |
|
"epoch": 6.29951690821256, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 7.501418110932872e-06, |
|
"loss": 0.1634, |
|
"step": 3912 |
|
}, |
|
{ |
|
"epoch": 6.305958132045088, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 7.365230373862274e-06, |
|
"loss": 0.1589, |
|
"step": 3916 |
|
}, |
|
{ |
|
"epoch": 6.312399355877616, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 7.2302591088341576e-06, |
|
"loss": 0.1675, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 6.318840579710145, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 7.096505466961794e-06, |
|
"loss": 0.1718, |
|
"step": 3924 |
|
}, |
|
{ |
|
"epoch": 6.325281803542673, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 6.963970588973761e-06, |
|
"loss": 0.1412, |
|
"step": 3928 |
|
}, |
|
{ |
|
"epoch": 6.331723027375201, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 6.832655605204401e-06, |
|
"loss": 0.165, |
|
"step": 3932 |
|
}, |
|
{ |
|
"epoch": 6.338164251207729, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 6.702561635584047e-06, |
|
"loss": 0.1496, |
|
"step": 3936 |
|
}, |
|
{ |
|
"epoch": 6.344605475040257, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 6.57368978962956e-06, |
|
"loss": 0.1829, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 6.351046698872786, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 6.44604116643474e-06, |
|
"loss": 0.182, |
|
"step": 3944 |
|
}, |
|
{ |
|
"epoch": 6.357487922705314, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 6.3196168546610634e-06, |
|
"loss": 0.1714, |
|
"step": 3948 |
|
}, |
|
{ |
|
"epoch": 6.363929146537842, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 6.194417932528478e-06, |
|
"loss": 0.2102, |
|
"step": 3952 |
|
}, |
|
{ |
|
"epoch": 6.37037037037037, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 6.070445467805923e-06, |
|
"loss": 0.1761, |
|
"step": 3956 |
|
}, |
|
{ |
|
"epoch": 6.3768115942028984, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 5.947700517802523e-06, |
|
"loss": 0.1816, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 6.383252818035427, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 5.826184129358358e-06, |
|
"loss": 0.2008, |
|
"step": 3964 |
|
}, |
|
{ |
|
"epoch": 6.389694041867955, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 5.705897338835724e-06, |
|
"loss": 0.1857, |
|
"step": 3968 |
|
}, |
|
{ |
|
"epoch": 6.396135265700483, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 5.58684117211009e-06, |
|
"loss": 0.1681, |
|
"step": 3972 |
|
}, |
|
{ |
|
"epoch": 6.402576489533011, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 5.469016644561519e-06, |
|
"loss": 0.1409, |
|
"step": 3976 |
|
}, |
|
{ |
|
"epoch": 6.4090177133655395, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 5.352424761065926e-06, |
|
"loss": 0.1647, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 6.415458937198068, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 5.2370665159865045e-06, |
|
"loss": 0.1556, |
|
"step": 3984 |
|
}, |
|
{ |
|
"epoch": 6.421900161030596, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 5.1229428931652775e-06, |
|
"loss": 0.1912, |
|
"step": 3988 |
|
}, |
|
{ |
|
"epoch": 6.428341384863124, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 5.010054865914676e-06, |
|
"loss": 0.1521, |
|
"step": 3992 |
|
}, |
|
{ |
|
"epoch": 6.434782608695652, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 4.898403397009293e-06, |
|
"loss": 0.192, |
|
"step": 3996 |
|
}, |
|
{ |
|
"epoch": 6.4412238325281805, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 4.787989438677625e-06, |
|
"loss": 0.1464, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 6.447665056360709, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 4.678813932593911e-06, |
|
"loss": 0.1608, |
|
"step": 4004 |
|
}, |
|
{ |
|
"epoch": 6.454106280193237, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 4.570877809870188e-06, |
|
"loss": 0.1778, |
|
"step": 4008 |
|
}, |
|
{ |
|
"epoch": 6.460547504025765, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 4.464181991048349e-06, |
|
"loss": 0.1657, |
|
"step": 4012 |
|
}, |
|
{ |
|
"epoch": 6.466988727858293, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 4.358727386092198e-06, |
|
"loss": 0.1707, |
|
"step": 4016 |
|
}, |
|
{ |
|
"epoch": 6.473429951690822, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 4.254514894379774e-06, |
|
"loss": 0.1641, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 6.47987117552335, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 4.1515454046956384e-06, |
|
"loss": 0.1767, |
|
"step": 4024 |
|
}, |
|
{ |
|
"epoch": 6.486312399355878, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 4.049819795223336e-06, |
|
"loss": 0.1914, |
|
"step": 4028 |
|
}, |
|
{ |
|
"epoch": 6.492753623188406, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 3.949338933537843e-06, |
|
"loss": 0.1588, |
|
"step": 4032 |
|
}, |
|
{ |
|
"epoch": 6.499194847020934, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 3.850103676598265e-06, |
|
"loss": 0.1517, |
|
"step": 4036 |
|
}, |
|
{ |
|
"epoch": 6.505636070853463, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 3.752114870740386e-06, |
|
"loss": 0.1713, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 6.512077294685991, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 3.6553733516695937e-06, |
|
"loss": 0.1704, |
|
"step": 4044 |
|
}, |
|
{ |
|
"epoch": 6.518518518518518, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 3.5598799444536697e-06, |
|
"loss": 0.1545, |
|
"step": 4048 |
|
}, |
|
{ |
|
"epoch": 6.524959742351046, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 3.465635463515792e-06, |
|
"loss": 0.1684, |
|
"step": 4052 |
|
}, |
|
{ |
|
"epoch": 6.531400966183575, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 3.3726407126275112e-06, |
|
"loss": 0.1472, |
|
"step": 4056 |
|
}, |
|
{ |
|
"epoch": 6.537842190016103, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 3.2808964849020513e-06, |
|
"loss": 0.1268, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 6.544283413848631, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 3.190403562787369e-06, |
|
"loss": 0.1694, |
|
"step": 4064 |
|
}, |
|
{ |
|
"epoch": 6.550724637681159, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 3.1011627180596075e-06, |
|
"loss": 0.1688, |
|
"step": 4068 |
|
}, |
|
{ |
|
"epoch": 6.557165861513687, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 3.0131747118164018e-06, |
|
"loss": 0.1946, |
|
"step": 4072 |
|
}, |
|
{ |
|
"epoch": 6.563607085346216, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 2.9264402944705665e-06, |
|
"loss": 0.1714, |
|
"step": 4076 |
|
}, |
|
{ |
|
"epoch": 6.570048309178744, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 2.8409602057434865e-06, |
|
"loss": 0.1467, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 6.576489533011272, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 2.7567351746589363e-06, |
|
"loss": 0.1615, |
|
"step": 4084 |
|
}, |
|
{ |
|
"epoch": 6.5829307568438, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 2.6737659195368354e-06, |
|
"loss": 0.1674, |
|
"step": 4088 |
|
}, |
|
{ |
|
"epoch": 6.5893719806763285, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 2.592053147987105e-06, |
|
"loss": 0.1651, |
|
"step": 4092 |
|
}, |
|
{ |
|
"epoch": 6.595813204508857, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 2.5115975569036718e-06, |
|
"loss": 0.1263, |
|
"step": 4096 |
|
}, |
|
{ |
|
"epoch": 6.602254428341385, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 2.4323998324584536e-06, |
|
"loss": 0.148, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 6.608695652173913, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 2.354460650095602e-06, |
|
"loss": 0.1537, |
|
"step": 4104 |
|
}, |
|
{ |
|
"epoch": 6.615136876006441, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 2.2777806745256534e-06, |
|
"loss": 0.1553, |
|
"step": 4108 |
|
}, |
|
{ |
|
"epoch": 6.6215780998389695, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 2.202360559719918e-06, |
|
"loss": 0.1763, |
|
"step": 4112 |
|
}, |
|
{ |
|
"epoch": 6.628019323671498, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 2.1282009489048847e-06, |
|
"loss": 0.1717, |
|
"step": 4116 |
|
}, |
|
{ |
|
"epoch": 6.634460547504026, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 2.055302474556708e-06, |
|
"loss": 0.1761, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 6.640901771336554, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 1.9836657583958806e-06, |
|
"loss": 0.1569, |
|
"step": 4124 |
|
}, |
|
{ |
|
"epoch": 6.647342995169082, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 1.9132914113818677e-06, |
|
"loss": 0.18, |
|
"step": 4128 |
|
}, |
|
{ |
|
"epoch": 6.6537842190016105, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 1.8441800337078982e-06, |
|
"loss": 0.1563, |
|
"step": 4132 |
|
}, |
|
{ |
|
"epoch": 6.660225442834139, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 1.7763322147958836e-06, |
|
"loss": 0.1637, |
|
"step": 4136 |
|
}, |
|
{ |
|
"epoch": 6.666666666666667, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 1.7097485332913885e-06, |
|
"loss": 0.173, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 6.673107890499195, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 1.6444295570586518e-06, |
|
"loss": 0.1901, |
|
"step": 4144 |
|
}, |
|
{ |
|
"epoch": 6.679549114331723, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 1.580375843175824e-06, |
|
"loss": 0.1777, |
|
"step": 4148 |
|
}, |
|
{ |
|
"epoch": 6.685990338164252, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 1.5175879379300704e-06, |
|
"loss": 0.1655, |
|
"step": 4152 |
|
}, |
|
{ |
|
"epoch": 6.692431561996779, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 1.4560663768131253e-06, |
|
"loss": 0.1555, |
|
"step": 4156 |
|
}, |
|
{ |
|
"epoch": 6.698872785829307, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 1.3958116845164958e-06, |
|
"loss": 0.1506, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 6.705314009661835, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 1.3368243749271813e-06, |
|
"loss": 0.1987, |
|
"step": 4164 |
|
}, |
|
{ |
|
"epoch": 6.7117552334943635, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 1.2791049511231277e-06, |
|
"loss": 0.1656, |
|
"step": 4168 |
|
}, |
|
{ |
|
"epoch": 6.718196457326892, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 1.2226539053690975e-06, |
|
"loss": 0.193, |
|
"step": 4172 |
|
}, |
|
{ |
|
"epoch": 6.72463768115942, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 1.1674717191123228e-06, |
|
"loss": 0.164, |
|
"step": 4176 |
|
}, |
|
{ |
|
"epoch": 6.731078904991948, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 1.1135588629785252e-06, |
|
"loss": 0.1586, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 6.737520128824476, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 1.0609157967677695e-06, |
|
"loss": 0.1586, |
|
"step": 4184 |
|
}, |
|
{ |
|
"epoch": 6.743961352657005, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 1.0095429694506829e-06, |
|
"loss": 0.1947, |
|
"step": 4188 |
|
}, |
|
{ |
|
"epoch": 6.750402576489533, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 9.594408191645254e-07, |
|
"loss": 0.1751, |
|
"step": 4192 |
|
}, |
|
{ |
|
"epoch": 6.756843800322061, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 9.106097732095085e-07, |
|
"loss": 0.1543, |
|
"step": 4196 |
|
}, |
|
{ |
|
"epoch": 6.763285024154589, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 8.630502480450996e-07, |
|
"loss": 0.195, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 6.769726247987117, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 8.167626492865064e-07, |
|
"loss": 0.1806, |
|
"step": 4204 |
|
}, |
|
{ |
|
"epoch": 6.776167471819646, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 7.717473717012312e-07, |
|
"loss": 0.1726, |
|
"step": 4208 |
|
}, |
|
{ |
|
"epoch": 6.782608695652174, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 7.280047992056725e-07, |
|
"loss": 0.1747, |
|
"step": 4212 |
|
}, |
|
{ |
|
"epoch": 6.789049919484702, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 6.855353048618284e-07, |
|
"loss": 0.1681, |
|
"step": 4216 |
|
}, |
|
{ |
|
"epoch": 6.79549114331723, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 6.443392508742151e-07, |
|
"loss": 0.1712, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 6.8019323671497585, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 6.0441698858667e-07, |
|
"loss": 0.1498, |
|
"step": 4224 |
|
}, |
|
{ |
|
"epoch": 6.808373590982287, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 5.657688584793874e-07, |
|
"loss": 0.1483, |
|
"step": 4228 |
|
}, |
|
{ |
|
"epoch": 6.814814814814815, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 5.283951901660366e-07, |
|
"loss": 0.1557, |
|
"step": 4232 |
|
}, |
|
{ |
|
"epoch": 6.821256038647343, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 4.922963023909321e-07, |
|
"loss": 0.1984, |
|
"step": 4236 |
|
}, |
|
{ |
|
"epoch": 6.827697262479871, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 4.574725030263515e-07, |
|
"loss": 0.1322, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 6.8341384863123995, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 4.239240890698381e-07, |
|
"loss": 0.1781, |
|
"step": 4244 |
|
}, |
|
{ |
|
"epoch": 6.840579710144928, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 3.9165134664170263e-07, |
|
"loss": 0.1824, |
|
"step": 4248 |
|
}, |
|
{ |
|
"epoch": 6.847020933977456, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 3.6065455098264195e-07, |
|
"loss": 0.1787, |
|
"step": 4252 |
|
}, |
|
{ |
|
"epoch": 6.853462157809984, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 3.309339664513078e-07, |
|
"loss": 0.1622, |
|
"step": 4256 |
|
}, |
|
{ |
|
"epoch": 6.859903381642512, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 3.024898465220582e-07, |
|
"loss": 0.1756, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 6.8663446054750406, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 2.7532243378285966e-07, |
|
"loss": 0.1465, |
|
"step": 4264 |
|
}, |
|
{ |
|
"epoch": 6.872785829307569, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 2.494319599331718e-07, |
|
"loss": 0.1519, |
|
"step": 4268 |
|
}, |
|
{ |
|
"epoch": 6.879227053140097, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 2.2481864578194898e-07, |
|
"loss": 0.175, |
|
"step": 4272 |
|
}, |
|
{ |
|
"epoch": 6.885668276972625, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 2.0148270124582533e-07, |
|
"loss": 0.161, |
|
"step": 4276 |
|
}, |
|
{ |
|
"epoch": 6.892109500805153, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 1.7942432534728268e-07, |
|
"loss": 0.1582, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 6.898550724637682, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 1.5864370621293531e-07, |
|
"loss": 0.1852, |
|
"step": 4284 |
|
}, |
|
{ |
|
"epoch": 6.90499194847021, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 1.3914102107193127e-07, |
|
"loss": 0.1738, |
|
"step": 4288 |
|
}, |
|
{ |
|
"epoch": 6.911433172302738, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 1.2091643625452008e-07, |
|
"loss": 0.149, |
|
"step": 4292 |
|
}, |
|
{ |
|
"epoch": 6.917874396135265, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 1.0397010719050414e-07, |
|
"loss": 0.1963, |
|
"step": 4296 |
|
}, |
|
{ |
|
"epoch": 6.9243156199677935, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 8.830217840800624e-08, |
|
"loss": 0.182, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 6.930756843800322, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 7.391278353223728e-08, |
|
"loss": 0.1708, |
|
"step": 4304 |
|
}, |
|
{ |
|
"epoch": 6.93719806763285, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 6.080204528426391e-08, |
|
"loss": 0.1593, |
|
"step": 4308 |
|
}, |
|
{ |
|
"epoch": 6.943639291465378, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 4.897007548010923e-08, |
|
"loss": 0.1506, |
|
"step": 4312 |
|
}, |
|
{ |
|
"epoch": 6.950080515297906, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 3.841697502963703e-08, |
|
"loss": 0.152, |
|
"step": 4316 |
|
}, |
|
{ |
|
"epoch": 6.956521739130435, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 2.9142833935819065e-08, |
|
"loss": 0.168, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 6.962962962962963, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 2.1147731293919002e-08, |
|
"loss": 0.1745, |
|
"step": 4324 |
|
}, |
|
{ |
|
"epoch": 6.969404186795491, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 1.4431735290809654e-08, |
|
"loss": 0.1848, |
|
"step": 4328 |
|
}, |
|
{ |
|
"epoch": 6.975845410628019, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 8.994903204390113e-09, |
|
"loss": 0.1714, |
|
"step": 4332 |
|
}, |
|
{ |
|
"epoch": 6.982286634460547, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 4.837281403119453e-09, |
|
"loss": 0.2034, |
|
"step": 4336 |
|
}, |
|
{ |
|
"epoch": 6.988727858293076, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 1.958905345600392e-09, |
|
"loss": 0.1877, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 6.995169082125604, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 3.597995803128473e-10, |
|
"loss": 0.1574, |
|
"step": 4344 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"step": 4347, |
|
"total_flos": 2.864063510520791e+18, |
|
"train_loss": 0.36376489625186625, |
|
"train_runtime": 17966.6164, |
|
"train_samples_per_second": 7.736, |
|
"train_steps_per_second": 0.242 |
|
} |
|
], |
|
"logging_steps": 4, |
|
"max_steps": 4347, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 7, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.864063510520791e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|