{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.0, "eval_steps": 500, "global_step": 4347, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00644122383252818, "grad_norm": 0.5078125, "learning_rate": 2.727272727272727e-05, "loss": 1.3628, "step": 4 }, { "epoch": 0.01288244766505636, "grad_norm": 0.3203125, "learning_rate": 5.454545454545454e-05, "loss": 1.3272, "step": 8 }, { "epoch": 0.01932367149758454, "grad_norm": 0.375, "learning_rate": 8.18181818181818e-05, "loss": 1.2626, "step": 12 }, { "epoch": 0.02576489533011272, "grad_norm": 0.2353515625, "learning_rate": 0.00010909090909090908, "loss": 1.2028, "step": 16 }, { "epoch": 0.0322061191626409, "grad_norm": 0.189453125, "learning_rate": 0.00013636363636363634, "loss": 1.1822, "step": 20 }, { "epoch": 0.03864734299516908, "grad_norm": 0.2060546875, "learning_rate": 0.0001636363636363636, "loss": 1.2029, "step": 24 }, { "epoch": 0.04508856682769726, "grad_norm": 0.236328125, "learning_rate": 0.0001909090909090909, "loss": 1.1609, "step": 28 }, { "epoch": 0.05152979066022544, "grad_norm": 0.255859375, "learning_rate": 0.00021818181818181816, "loss": 1.1137, "step": 32 }, { "epoch": 0.057971014492753624, "grad_norm": 0.2431640625, "learning_rate": 0.00024545454545454545, "loss": 1.085, "step": 36 }, { "epoch": 0.0644122383252818, "grad_norm": 0.23828125, "learning_rate": 0.0002727272727272727, "loss": 1.1052, "step": 40 }, { "epoch": 0.07085346215780998, "grad_norm": 0.240234375, "learning_rate": 0.0003, "loss": 1.0712, "step": 44 }, { "epoch": 0.07729468599033816, "grad_norm": 0.2373046875, "learning_rate": 0.00029999936035650057, "loss": 1.0588, "step": 48 }, { "epoch": 0.08373590982286634, "grad_norm": 0.2373046875, "learning_rate": 0.0002999974414314574, "loss": 1.0531, "step": 52 }, { "epoch": 0.09017713365539452, "grad_norm": 0.2431640625, "learning_rate": 0.00029999424324123633, "loss": 0.9953, "step": 56 }, { "epoch": 0.0966183574879227, "grad_norm": 0.25, "learning_rate": 0.0002999897658131134, "loss": 0.9887, "step": 60 }, { "epoch": 0.10305958132045089, "grad_norm": 0.236328125, "learning_rate": 0.0002999840091852746, "loss": 0.9945, "step": 64 }, { "epoch": 0.10950080515297907, "grad_norm": 0.2451171875, "learning_rate": 0.00029997697340681585, "loss": 0.9306, "step": 68 }, { "epoch": 0.11594202898550725, "grad_norm": 0.25390625, "learning_rate": 0.00029996865853774236, "loss": 0.9458, "step": 72 }, { "epoch": 0.12238325281803543, "grad_norm": 0.25390625, "learning_rate": 0.00029995906464896807, "loss": 0.9487, "step": 76 }, { "epoch": 0.1288244766505636, "grad_norm": 0.26171875, "learning_rate": 0.0002999481918223153, "loss": 0.9144, "step": 80 }, { "epoch": 0.13526570048309178, "grad_norm": 0.2412109375, "learning_rate": 0.0002999360401505139, "loss": 0.9289, "step": 84 }, { "epoch": 0.14170692431561996, "grad_norm": 0.265625, "learning_rate": 0.00029992260973720023, "loss": 0.882, "step": 88 }, { "epoch": 0.14814814814814814, "grad_norm": 0.2451171875, "learning_rate": 0.00029990790069691665, "loss": 0.9031, "step": 92 }, { "epoch": 0.15458937198067632, "grad_norm": 0.255859375, "learning_rate": 0.00029989191315511055, "loss": 0.9127, "step": 96 }, { "epoch": 0.1610305958132045, "grad_norm": 0.341796875, "learning_rate": 0.0002998746472481328, "loss": 0.8803, "step": 100 }, { "epoch": 0.16747181964573268, "grad_norm": 0.263671875, "learning_rate": 0.0002998561031232371, "loss": 0.8764, "step": 104 }, { "epoch": 0.17391304347826086, "grad_norm": 0.263671875, "learning_rate": 0.00029983628093857855, "loss": 0.9189, "step": 108 }, { "epoch": 0.18035426731078905, "grad_norm": 0.302734375, "learning_rate": 0.00029981518086321225, "loss": 0.8169, "step": 112 }, { "epoch": 0.18679549114331723, "grad_norm": 0.275390625, "learning_rate": 0.00029979280307709176, "loss": 0.8672, "step": 116 }, { "epoch": 0.1932367149758454, "grad_norm": 0.294921875, "learning_rate": 0.0002997691477710679, "loss": 0.8387, "step": 120 }, { "epoch": 0.1996779388083736, "grad_norm": 0.279296875, "learning_rate": 0.0002997442151468869, "loss": 0.8039, "step": 124 }, { "epoch": 0.20611916264090177, "grad_norm": 0.267578125, "learning_rate": 0.00029971800541718854, "loss": 0.8294, "step": 128 }, { "epoch": 0.21256038647342995, "grad_norm": 0.291015625, "learning_rate": 0.0002996905188055046, "loss": 0.8228, "step": 132 }, { "epoch": 0.21900161030595813, "grad_norm": 0.267578125, "learning_rate": 0.00029966175554625696, "loss": 0.8341, "step": 136 }, { "epoch": 0.22544283413848631, "grad_norm": 0.259765625, "learning_rate": 0.00029963171588475525, "loss": 0.8095, "step": 140 }, { "epoch": 0.2318840579710145, "grad_norm": 0.294921875, "learning_rate": 0.0002996004000771952, "loss": 0.8285, "step": 144 }, { "epoch": 0.23832528180354268, "grad_norm": 0.265625, "learning_rate": 0.00029956780839065616, "loss": 0.8123, "step": 148 }, { "epoch": 0.24476650563607086, "grad_norm": 0.248046875, "learning_rate": 0.00029953394110309887, "loss": 0.7612, "step": 152 }, { "epoch": 0.25120772946859904, "grad_norm": 0.30078125, "learning_rate": 0.0002994987985033633, "loss": 0.7723, "step": 156 }, { "epoch": 0.2576489533011272, "grad_norm": 0.27734375, "learning_rate": 0.0002994623808911659, "loss": 0.8202, "step": 160 }, { "epoch": 0.2640901771336554, "grad_norm": 0.326171875, "learning_rate": 0.00029942468857709715, "loss": 0.7324, "step": 164 }, { "epoch": 0.27053140096618356, "grad_norm": 0.255859375, "learning_rate": 0.000299385721882619, "loss": 0.7818, "step": 168 }, { "epoch": 0.27697262479871176, "grad_norm": 0.298828125, "learning_rate": 0.000299345481140062, "loss": 0.7693, "step": 172 }, { "epoch": 0.2834138486312399, "grad_norm": 0.27734375, "learning_rate": 0.00029930396669262255, "loss": 0.7481, "step": 176 }, { "epoch": 0.2898550724637681, "grad_norm": 0.2890625, "learning_rate": 0.00029926117889435993, "loss": 0.7478, "step": 180 }, { "epoch": 0.2962962962962963, "grad_norm": 0.302734375, "learning_rate": 0.00029921711811019334, "loss": 0.7581, "step": 184 }, { "epoch": 0.3027375201288245, "grad_norm": 0.30859375, "learning_rate": 0.00029917178471589864, "loss": 0.7131, "step": 188 }, { "epoch": 0.30917874396135264, "grad_norm": 0.28125, "learning_rate": 0.0002991251790981053, "loss": 0.7121, "step": 192 }, { "epoch": 0.31561996779388085, "grad_norm": 0.28125, "learning_rate": 0.0002990773016542932, "loss": 0.7385, "step": 196 }, { "epoch": 0.322061191626409, "grad_norm": 0.345703125, "learning_rate": 0.00029902815279278874, "loss": 0.743, "step": 200 }, { "epoch": 0.3285024154589372, "grad_norm": 0.28515625, "learning_rate": 0.00029897773293276214, "loss": 0.6984, "step": 204 }, { "epoch": 0.33494363929146537, "grad_norm": 0.2890625, "learning_rate": 0.000298926042504223, "loss": 0.7278, "step": 208 }, { "epoch": 0.3413848631239936, "grad_norm": 0.271484375, "learning_rate": 0.00029887308194801745, "loss": 0.7043, "step": 212 }, { "epoch": 0.34782608695652173, "grad_norm": 0.263671875, "learning_rate": 0.00029881885171582364, "loss": 0.7455, "step": 216 }, { "epoch": 0.35426731078904994, "grad_norm": 0.28125, "learning_rate": 0.0002987633522701486, "loss": 0.7314, "step": 220 }, { "epoch": 0.3607085346215781, "grad_norm": 0.28125, "learning_rate": 0.00029870658408432375, "loss": 0.7344, "step": 224 }, { "epoch": 0.3671497584541063, "grad_norm": 0.2734375, "learning_rate": 0.0002986485476425011, "loss": 0.7324, "step": 228 }, { "epoch": 0.37359098228663445, "grad_norm": 0.28125, "learning_rate": 0.0002985892434396491, "loss": 0.7197, "step": 232 }, { "epoch": 0.38003220611916266, "grad_norm": 0.275390625, "learning_rate": 0.00029852867198154837, "loss": 0.6616, "step": 236 }, { "epoch": 0.3864734299516908, "grad_norm": 0.267578125, "learning_rate": 0.0002984668337847874, "loss": 0.6325, "step": 240 }, { "epoch": 0.392914653784219, "grad_norm": 0.28125, "learning_rate": 0.0002984037293767583, "loss": 0.6445, "step": 244 }, { "epoch": 0.3993558776167472, "grad_norm": 0.2734375, "learning_rate": 0.00029833935929565194, "loss": 0.6846, "step": 248 }, { "epoch": 0.4057971014492754, "grad_norm": 0.26953125, "learning_rate": 0.00029827372409045377, "loss": 0.6976, "step": 252 }, { "epoch": 0.41223832528180354, "grad_norm": 0.306640625, "learning_rate": 0.0002982068243209389, "loss": 0.7165, "step": 256 }, { "epoch": 0.41867954911433175, "grad_norm": 0.275390625, "learning_rate": 0.00029813866055766736, "loss": 0.6647, "step": 260 }, { "epoch": 0.4251207729468599, "grad_norm": 0.283203125, "learning_rate": 0.00029806923338197925, "loss": 0.6809, "step": 264 }, { "epoch": 0.43156199677938806, "grad_norm": 0.267578125, "learning_rate": 0.00029799854338598974, "loss": 0.7285, "step": 268 }, { "epoch": 0.43800322061191627, "grad_norm": 0.32421875, "learning_rate": 0.0002979265911725842, "loss": 0.6978, "step": 272 }, { "epoch": 0.4444444444444444, "grad_norm": 0.267578125, "learning_rate": 0.00029785337735541276, "loss": 0.6598, "step": 276 }, { "epoch": 0.45088566827697263, "grad_norm": 0.279296875, "learning_rate": 0.0002977789025588854, "loss": 0.6534, "step": 280 }, { "epoch": 0.4573268921095008, "grad_norm": 0.2734375, "learning_rate": 0.0002977031674181663, "loss": 0.7261, "step": 284 }, { "epoch": 0.463768115942029, "grad_norm": 0.27734375, "learning_rate": 0.00029762617257916873, "loss": 0.6762, "step": 288 }, { "epoch": 0.47020933977455714, "grad_norm": 0.306640625, "learning_rate": 0.0002975479186985493, "loss": 0.6625, "step": 292 }, { "epoch": 0.47665056360708535, "grad_norm": 0.291015625, "learning_rate": 0.0002974684064437025, "loss": 0.6617, "step": 296 }, { "epoch": 0.4830917874396135, "grad_norm": 0.294921875, "learning_rate": 0.00029738763649275496, "loss": 0.6886, "step": 300 }, { "epoch": 0.4895330112721417, "grad_norm": 0.265625, "learning_rate": 0.0002973056095345596, "loss": 0.6623, "step": 304 }, { "epoch": 0.49597423510466987, "grad_norm": 0.298828125, "learning_rate": 0.00029722232626869, "loss": 0.6568, "step": 308 }, { "epoch": 0.5024154589371981, "grad_norm": 0.263671875, "learning_rate": 0.0002971377874054341, "loss": 0.6281, "step": 312 }, { "epoch": 0.5088566827697263, "grad_norm": 0.259765625, "learning_rate": 0.0002970519936657884, "loss": 0.6618, "step": 316 }, { "epoch": 0.5152979066022544, "grad_norm": 0.26171875, "learning_rate": 0.00029696494578145157, "loss": 0.6797, "step": 320 }, { "epoch": 0.5217391304347826, "grad_norm": 0.2890625, "learning_rate": 0.0002968766444948185, "loss": 0.6756, "step": 324 }, { "epoch": 0.5281803542673108, "grad_norm": 0.271484375, "learning_rate": 0.0002967870905589739, "loss": 0.698, "step": 328 }, { "epoch": 0.534621578099839, "grad_norm": 0.28125, "learning_rate": 0.0002966962847376855, "loss": 0.6431, "step": 332 }, { "epoch": 0.5410628019323671, "grad_norm": 0.27734375, "learning_rate": 0.00029660422780539814, "loss": 0.6713, "step": 336 }, { "epoch": 0.5475040257648953, "grad_norm": 0.28125, "learning_rate": 0.00029651092054722665, "loss": 0.615, "step": 340 }, { "epoch": 0.5539452495974235, "grad_norm": 0.275390625, "learning_rate": 0.0002964163637589495, "loss": 0.7173, "step": 344 }, { "epoch": 0.5603864734299517, "grad_norm": 0.2490234375, "learning_rate": 0.0002963205582470017, "loss": 0.6808, "step": 348 }, { "epoch": 0.5668276972624798, "grad_norm": 0.28515625, "learning_rate": 0.00029622350482846844, "loss": 0.6684, "step": 352 }, { "epoch": 0.573268921095008, "grad_norm": 0.251953125, "learning_rate": 0.00029612520433107734, "loss": 0.644, "step": 356 }, { "epoch": 0.5797101449275363, "grad_norm": 0.28125, "learning_rate": 0.0002960256575931922, "loss": 0.6599, "step": 360 }, { "epoch": 0.5861513687600645, "grad_norm": 0.298828125, "learning_rate": 0.0002959248654638053, "loss": 0.7006, "step": 364 }, { "epoch": 0.5925925925925926, "grad_norm": 0.265625, "learning_rate": 0.00029582282880253035, "loss": 0.625, "step": 368 }, { "epoch": 0.5990338164251208, "grad_norm": 0.26953125, "learning_rate": 0.0002957195484795952, "loss": 0.7234, "step": 372 }, { "epoch": 0.605475040257649, "grad_norm": 0.291015625, "learning_rate": 0.0002956150253758344, "loss": 0.6556, "step": 376 }, { "epoch": 0.6119162640901772, "grad_norm": 0.283203125, "learning_rate": 0.00029550926038268146, "loss": 0.6402, "step": 380 }, { "epoch": 0.6183574879227053, "grad_norm": 0.265625, "learning_rate": 0.0002954022544021617, "loss": 0.6446, "step": 384 }, { "epoch": 0.6247987117552335, "grad_norm": 0.296875, "learning_rate": 0.00029529400834688415, "loss": 0.6379, "step": 388 }, { "epoch": 0.6312399355877617, "grad_norm": 0.271484375, "learning_rate": 0.00029518452314003394, "loss": 0.644, "step": 392 }, { "epoch": 0.6376811594202898, "grad_norm": 0.30078125, "learning_rate": 0.0002950737997153645, "loss": 0.6413, "step": 396 }, { "epoch": 0.644122383252818, "grad_norm": 0.267578125, "learning_rate": 0.00029496183901718927, "loss": 0.6249, "step": 400 }, { "epoch": 0.6505636070853462, "grad_norm": 0.291015625, "learning_rate": 0.00029484864200037415, "loss": 0.5929, "step": 404 }, { "epoch": 0.6570048309178744, "grad_norm": 0.263671875, "learning_rate": 0.0002947342096303289, "loss": 0.6447, "step": 408 }, { "epoch": 0.6634460547504025, "grad_norm": 0.287109375, "learning_rate": 0.0002946185428829991, "loss": 0.641, "step": 412 }, { "epoch": 0.6698872785829307, "grad_norm": 0.3125, "learning_rate": 0.0002945016427448579, "loss": 0.6878, "step": 416 }, { "epoch": 0.6763285024154589, "grad_norm": 0.287109375, "learning_rate": 0.0002943835102128975, "loss": 0.6646, "step": 420 }, { "epoch": 0.6827697262479872, "grad_norm": 0.26171875, "learning_rate": 0.0002942641462946206, "loss": 0.613, "step": 424 }, { "epoch": 0.6892109500805152, "grad_norm": 0.302734375, "learning_rate": 0.00029414355200803197, "loss": 0.6135, "step": 428 }, { "epoch": 0.6956521739130435, "grad_norm": 0.283203125, "learning_rate": 0.0002940217283816296, "loss": 0.6145, "step": 432 }, { "epoch": 0.7020933977455717, "grad_norm": 0.27734375, "learning_rate": 0.0002938986764543961, "loss": 0.6199, "step": 436 }, { "epoch": 0.7085346215780999, "grad_norm": 0.267578125, "learning_rate": 0.0002937743972757895, "loss": 0.6566, "step": 440 }, { "epoch": 0.714975845410628, "grad_norm": 0.27734375, "learning_rate": 0.0002936488919057349, "loss": 0.6536, "step": 444 }, { "epoch": 0.7214170692431562, "grad_norm": 0.28515625, "learning_rate": 0.0002935221614146148, "loss": 0.6586, "step": 448 }, { "epoch": 0.7278582930756844, "grad_norm": 0.259765625, "learning_rate": 0.0002933942068832604, "loss": 0.6234, "step": 452 }, { "epoch": 0.7342995169082126, "grad_norm": 0.28515625, "learning_rate": 0.00029326502940294207, "loss": 0.6115, "step": 456 }, { "epoch": 0.7407407407407407, "grad_norm": 0.298828125, "learning_rate": 0.00029313463007536034, "loss": 0.6205, "step": 460 }, { "epoch": 0.7471819645732689, "grad_norm": 0.271484375, "learning_rate": 0.0002930030100126363, "loss": 0.6185, "step": 464 }, { "epoch": 0.7536231884057971, "grad_norm": 0.251953125, "learning_rate": 0.0002928701703373021, "loss": 0.6305, "step": 468 }, { "epoch": 0.7600644122383253, "grad_norm": 0.267578125, "learning_rate": 0.00029273611218229165, "loss": 0.6464, "step": 472 }, { "epoch": 0.7665056360708534, "grad_norm": 0.263671875, "learning_rate": 0.0002926008366909307, "loss": 0.6488, "step": 476 }, { "epoch": 0.7729468599033816, "grad_norm": 0.28515625, "learning_rate": 0.00029246434501692685, "loss": 0.6148, "step": 480 }, { "epoch": 0.7793880837359098, "grad_norm": 0.28125, "learning_rate": 0.00029232663832436047, "loss": 0.5946, "step": 484 }, { "epoch": 0.785829307568438, "grad_norm": 0.265625, "learning_rate": 0.0002921877177876741, "loss": 0.5898, "step": 488 }, { "epoch": 0.7922705314009661, "grad_norm": 0.265625, "learning_rate": 0.0002920475845916626, "loss": 0.6435, "step": 492 }, { "epoch": 0.7987117552334944, "grad_norm": 0.2578125, "learning_rate": 0.00029190623993146313, "loss": 0.6605, "step": 496 }, { "epoch": 0.8051529790660226, "grad_norm": 0.287109375, "learning_rate": 0.0002917636850125449, "loss": 0.6297, "step": 500 }, { "epoch": 0.8115942028985508, "grad_norm": 0.26953125, "learning_rate": 0.00029161992105069905, "loss": 0.6313, "step": 504 }, { "epoch": 0.8180354267310789, "grad_norm": 0.26953125, "learning_rate": 0.0002914749492720279, "loss": 0.5953, "step": 508 }, { "epoch": 0.8244766505636071, "grad_norm": 0.267578125, "learning_rate": 0.00029132877091293493, "loss": 0.6615, "step": 512 }, { "epoch": 0.8309178743961353, "grad_norm": 0.287109375, "learning_rate": 0.000291181387220114, "loss": 0.6771, "step": 516 }, { "epoch": 0.8373590982286635, "grad_norm": 0.283203125, "learning_rate": 0.0002910327994505387, "loss": 0.5889, "step": 520 }, { "epoch": 0.8438003220611916, "grad_norm": 0.275390625, "learning_rate": 0.0002908830088714516, "loss": 0.5781, "step": 524 }, { "epoch": 0.8502415458937198, "grad_norm": 0.287109375, "learning_rate": 0.00029073201676035383, "loss": 0.6182, "step": 528 }, { "epoch": 0.856682769726248, "grad_norm": 0.263671875, "learning_rate": 0.00029057982440499356, "loss": 0.6226, "step": 532 }, { "epoch": 0.8631239935587761, "grad_norm": 0.27734375, "learning_rate": 0.00029042643310335547, "loss": 0.6547, "step": 536 }, { "epoch": 0.8695652173913043, "grad_norm": 0.26953125, "learning_rate": 0.00029027184416364956, "loss": 0.6114, "step": 540 }, { "epoch": 0.8760064412238325, "grad_norm": 0.28515625, "learning_rate": 0.0002901160589043, "loss": 0.6491, "step": 544 }, { "epoch": 0.8824476650563607, "grad_norm": 0.275390625, "learning_rate": 0.00028995907865393385, "loss": 0.6375, "step": 548 }, { "epoch": 0.8888888888888888, "grad_norm": 0.2578125, "learning_rate": 0.00028980090475136963, "loss": 0.6083, "step": 552 }, { "epoch": 0.895330112721417, "grad_norm": 0.267578125, "learning_rate": 0.0002896415385456062, "loss": 0.5879, "step": 556 }, { "epoch": 0.9017713365539453, "grad_norm": 0.279296875, "learning_rate": 0.000289480981395811, "loss": 0.6596, "step": 560 }, { "epoch": 0.9082125603864735, "grad_norm": 0.306640625, "learning_rate": 0.00028931923467130855, "loss": 0.5774, "step": 564 }, { "epoch": 0.9146537842190016, "grad_norm": 0.28515625, "learning_rate": 0.00028915629975156867, "loss": 0.6118, "step": 568 }, { "epoch": 0.9210950080515298, "grad_norm": 0.298828125, "learning_rate": 0.0002889921780261949, "loss": 0.615, "step": 572 }, { "epoch": 0.927536231884058, "grad_norm": 0.25, "learning_rate": 0.00028882687089491234, "loss": 0.6225, "step": 576 }, { "epoch": 0.9339774557165862, "grad_norm": 0.291015625, "learning_rate": 0.0002886603797675563, "loss": 0.5626, "step": 580 }, { "epoch": 0.9404186795491143, "grad_norm": 0.26953125, "learning_rate": 0.0002884927060640596, "loss": 0.5886, "step": 584 }, { "epoch": 0.9468599033816425, "grad_norm": 0.328125, "learning_rate": 0.0002883238512144409, "loss": 0.6251, "step": 588 }, { "epoch": 0.9533011272141707, "grad_norm": 0.27734375, "learning_rate": 0.0002881538166587921, "loss": 0.6326, "step": 592 }, { "epoch": 0.9597423510466989, "grad_norm": 0.271484375, "learning_rate": 0.0002879826038472667, "loss": 0.5666, "step": 596 }, { "epoch": 0.966183574879227, "grad_norm": 0.279296875, "learning_rate": 0.00028781021424006677, "loss": 0.5282, "step": 600 }, { "epoch": 0.9726247987117552, "grad_norm": 0.271484375, "learning_rate": 0.00028763664930743087, "loss": 0.6628, "step": 604 }, { "epoch": 0.9790660225442834, "grad_norm": 0.265625, "learning_rate": 0.00028746191052962146, "loss": 0.5669, "step": 608 }, { "epoch": 0.9855072463768116, "grad_norm": 0.267578125, "learning_rate": 0.00028728599939691215, "loss": 0.5955, "step": 612 }, { "epoch": 0.9919484702093397, "grad_norm": 0.27734375, "learning_rate": 0.00028710891740957507, "loss": 0.5995, "step": 616 }, { "epoch": 0.998389694041868, "grad_norm": 0.265625, "learning_rate": 0.00028693066607786823, "loss": 0.5813, "step": 620 }, { "epoch": 1.0048309178743962, "grad_norm": 0.251953125, "learning_rate": 0.0002867512469220222, "loss": 0.5306, "step": 624 }, { "epoch": 1.0112721417069244, "grad_norm": 0.275390625, "learning_rate": 0.00028657066147222773, "loss": 0.4918, "step": 628 }, { "epoch": 1.0177133655394526, "grad_norm": 0.259765625, "learning_rate": 0.00028638891126862224, "loss": 0.5198, "step": 632 }, { "epoch": 1.0241545893719808, "grad_norm": 0.259765625, "learning_rate": 0.0002862059978612769, "loss": 0.5673, "step": 636 }, { "epoch": 1.0305958132045088, "grad_norm": 0.2734375, "learning_rate": 0.00028602192281018327, "loss": 0.5127, "step": 640 }, { "epoch": 1.037037037037037, "grad_norm": 0.283203125, "learning_rate": 0.0002858366876852403, "loss": 0.5517, "step": 644 }, { "epoch": 1.0434782608695652, "grad_norm": 0.26171875, "learning_rate": 0.0002856502940662403, "loss": 0.5209, "step": 648 }, { "epoch": 1.0499194847020934, "grad_norm": 0.279296875, "learning_rate": 0.00028546274354285646, "loss": 0.5362, "step": 652 }, { "epoch": 1.0563607085346216, "grad_norm": 0.25390625, "learning_rate": 0.00028527403771462826, "loss": 0.5256, "step": 656 }, { "epoch": 1.0628019323671498, "grad_norm": 0.361328125, "learning_rate": 0.00028508417819094844, "loss": 0.5257, "step": 660 }, { "epoch": 1.069243156199678, "grad_norm": 0.2890625, "learning_rate": 0.0002848931665910492, "loss": 0.4971, "step": 664 }, { "epoch": 1.075684380032206, "grad_norm": 0.275390625, "learning_rate": 0.0002847010045439882, "loss": 0.5214, "step": 668 }, { "epoch": 1.0821256038647342, "grad_norm": 0.298828125, "learning_rate": 0.0002845076936886349, "loss": 0.5283, "step": 672 }, { "epoch": 1.0885668276972624, "grad_norm": 0.271484375, "learning_rate": 0.0002843132356736563, "loss": 0.5024, "step": 676 }, { "epoch": 1.0950080515297906, "grad_norm": 0.26171875, "learning_rate": 0.0002841176321575032, "loss": 0.5515, "step": 680 }, { "epoch": 1.1014492753623188, "grad_norm": 0.27734375, "learning_rate": 0.0002839208848083958, "loss": 0.5493, "step": 684 }, { "epoch": 1.107890499194847, "grad_norm": 0.2578125, "learning_rate": 0.0002837229953043096, "loss": 0.4908, "step": 688 }, { "epoch": 1.1143317230273753, "grad_norm": 0.27734375, "learning_rate": 0.0002835239653329611, "loss": 0.5136, "step": 692 }, { "epoch": 1.1207729468599035, "grad_norm": 0.283203125, "learning_rate": 0.0002833237965917934, "loss": 0.5379, "step": 696 }, { "epoch": 1.1272141706924317, "grad_norm": 0.28515625, "learning_rate": 0.0002831224907879614, "loss": 0.5059, "step": 700 }, { "epoch": 1.1336553945249597, "grad_norm": 0.27734375, "learning_rate": 0.00028292004963831796, "loss": 0.5231, "step": 704 }, { "epoch": 1.1400966183574879, "grad_norm": 0.279296875, "learning_rate": 0.00028271647486939855, "loss": 0.5223, "step": 708 }, { "epoch": 1.146537842190016, "grad_norm": 0.27734375, "learning_rate": 0.0002825117682174069, "loss": 0.4907, "step": 712 }, { "epoch": 1.1529790660225443, "grad_norm": 0.267578125, "learning_rate": 0.0002823059314282, "loss": 0.4996, "step": 716 }, { "epoch": 1.1594202898550725, "grad_norm": 0.25, "learning_rate": 0.0002820989662572734, "loss": 0.5084, "step": 720 }, { "epoch": 1.1658615136876007, "grad_norm": 0.271484375, "learning_rate": 0.0002818908744697461, "loss": 0.4909, "step": 724 }, { "epoch": 1.1723027375201287, "grad_norm": 0.265625, "learning_rate": 0.00028168165784034566, "loss": 0.5245, "step": 728 }, { "epoch": 1.178743961352657, "grad_norm": 0.271484375, "learning_rate": 0.00028147131815339267, "loss": 0.5307, "step": 732 }, { "epoch": 1.1851851851851851, "grad_norm": 0.26953125, "learning_rate": 0.00028125985720278614, "loss": 0.5213, "step": 736 }, { "epoch": 1.1916264090177133, "grad_norm": 0.298828125, "learning_rate": 0.0002810472767919876, "loss": 0.5257, "step": 740 }, { "epoch": 1.1980676328502415, "grad_norm": 0.2890625, "learning_rate": 0.0002808335787340061, "loss": 0.4913, "step": 744 }, { "epoch": 1.2045088566827697, "grad_norm": 0.263671875, "learning_rate": 0.00028061876485138264, "loss": 0.5331, "step": 748 }, { "epoch": 1.210950080515298, "grad_norm": 0.283203125, "learning_rate": 0.00028040283697617464, "loss": 0.5055, "step": 752 }, { "epoch": 1.2173913043478262, "grad_norm": 0.263671875, "learning_rate": 0.0002801857969499402, "loss": 0.5318, "step": 756 }, { "epoch": 1.2238325281803544, "grad_norm": 0.29296875, "learning_rate": 0.0002799676466237225, "loss": 0.4991, "step": 760 }, { "epoch": 1.2302737520128824, "grad_norm": 0.27734375, "learning_rate": 0.0002797483878580342, "loss": 0.5059, "step": 764 }, { "epoch": 1.2367149758454106, "grad_norm": 0.28125, "learning_rate": 0.00027952802252284104, "loss": 0.5043, "step": 768 }, { "epoch": 1.2431561996779388, "grad_norm": 0.251953125, "learning_rate": 0.0002793065524975465, "loss": 0.5747, "step": 772 }, { "epoch": 1.249597423510467, "grad_norm": 0.30078125, "learning_rate": 0.0002790839796709755, "loss": 0.5082, "step": 776 }, { "epoch": 1.2560386473429952, "grad_norm": 0.287109375, "learning_rate": 0.00027886030594135805, "loss": 0.5369, "step": 780 }, { "epoch": 1.2624798711755234, "grad_norm": 0.27734375, "learning_rate": 0.0002786355332163135, "loss": 0.5423, "step": 784 }, { "epoch": 1.2689210950080514, "grad_norm": 0.302734375, "learning_rate": 0.000278409663412834, "loss": 0.4882, "step": 788 }, { "epoch": 1.2753623188405796, "grad_norm": 0.26953125, "learning_rate": 0.0002781826984572683, "loss": 0.504, "step": 792 }, { "epoch": 1.2818035426731078, "grad_norm": 0.2734375, "learning_rate": 0.0002779546402853051, "loss": 0.4872, "step": 796 }, { "epoch": 1.288244766505636, "grad_norm": 0.265625, "learning_rate": 0.00027772549084195675, "loss": 0.5348, "step": 800 }, { "epoch": 1.2946859903381642, "grad_norm": 0.29296875, "learning_rate": 0.00027749525208154265, "loss": 0.5718, "step": 804 }, { "epoch": 1.3011272141706924, "grad_norm": 0.294921875, "learning_rate": 0.0002772639259676726, "loss": 0.5393, "step": 808 }, { "epoch": 1.3075684380032206, "grad_norm": 0.2734375, "learning_rate": 0.00027703151447322965, "loss": 0.5421, "step": 812 }, { "epoch": 1.3140096618357489, "grad_norm": 0.275390625, "learning_rate": 0.0002767980195803539, "loss": 0.5555, "step": 816 }, { "epoch": 1.320450885668277, "grad_norm": 0.2890625, "learning_rate": 0.0002765634432804253, "loss": 0.553, "step": 820 }, { "epoch": 1.3268921095008053, "grad_norm": 0.27734375, "learning_rate": 0.00027632778757404655, "loss": 0.5075, "step": 824 }, { "epoch": 1.3333333333333333, "grad_norm": 0.294921875, "learning_rate": 0.0002760910544710261, "loss": 0.4933, "step": 828 }, { "epoch": 1.3397745571658615, "grad_norm": 0.283203125, "learning_rate": 0.00027585324599036133, "loss": 0.5039, "step": 832 }, { "epoch": 1.3462157809983897, "grad_norm": 0.28125, "learning_rate": 0.00027561436416022073, "loss": 0.5175, "step": 836 }, { "epoch": 1.3526570048309179, "grad_norm": 0.28125, "learning_rate": 0.00027537441101792715, "loss": 0.5375, "step": 840 }, { "epoch": 1.359098228663446, "grad_norm": 0.287109375, "learning_rate": 0.0002751333886099402, "loss": 0.5235, "step": 844 }, { "epoch": 1.3655394524959743, "grad_norm": 0.28125, "learning_rate": 0.0002748912989918387, "loss": 0.4882, "step": 848 }, { "epoch": 1.3719806763285023, "grad_norm": 0.287109375, "learning_rate": 0.0002746481442283034, "loss": 0.5032, "step": 852 }, { "epoch": 1.3784219001610305, "grad_norm": 0.279296875, "learning_rate": 0.0002744039263930991, "loss": 0.5052, "step": 856 }, { "epoch": 1.3848631239935587, "grad_norm": 0.265625, "learning_rate": 0.0002741586475690571, "loss": 0.5538, "step": 860 }, { "epoch": 1.391304347826087, "grad_norm": 0.263671875, "learning_rate": 0.0002739123098480576, "loss": 0.5457, "step": 864 }, { "epoch": 1.3977455716586151, "grad_norm": 0.2734375, "learning_rate": 0.00027366491533101147, "loss": 0.5111, "step": 868 }, { "epoch": 1.4041867954911433, "grad_norm": 0.263671875, "learning_rate": 0.0002734164661278426, "loss": 0.4902, "step": 872 }, { "epoch": 1.4106280193236715, "grad_norm": 0.263671875, "learning_rate": 0.00027316696435747, "loss": 0.5504, "step": 876 }, { "epoch": 1.4170692431561998, "grad_norm": 0.271484375, "learning_rate": 0.00027291641214778937, "loss": 0.5234, "step": 880 }, { "epoch": 1.423510466988728, "grad_norm": 0.271484375, "learning_rate": 0.0002726648116356554, "loss": 0.5052, "step": 884 }, { "epoch": 1.4299516908212562, "grad_norm": 0.28515625, "learning_rate": 0.000272412164966863, "loss": 0.5189, "step": 888 }, { "epoch": 1.4363929146537842, "grad_norm": 0.279296875, "learning_rate": 0.00027215847429612965, "loss": 0.4982, "step": 892 }, { "epoch": 1.4428341384863124, "grad_norm": 0.275390625, "learning_rate": 0.0002719037417870765, "loss": 0.4916, "step": 896 }, { "epoch": 1.4492753623188406, "grad_norm": 0.259765625, "learning_rate": 0.00027164796961221015, "loss": 0.5149, "step": 900 }, { "epoch": 1.4557165861513688, "grad_norm": 0.287109375, "learning_rate": 0.0002713911599529039, "loss": 0.5636, "step": 904 }, { "epoch": 1.462157809983897, "grad_norm": 0.275390625, "learning_rate": 0.00027113331499937967, "loss": 0.5191, "step": 908 }, { "epoch": 1.4685990338164252, "grad_norm": 0.265625, "learning_rate": 0.00027087443695068873, "loss": 0.4786, "step": 912 }, { "epoch": 1.4750402576489532, "grad_norm": 0.30078125, "learning_rate": 0.0002706145280146931, "loss": 0.5033, "step": 916 }, { "epoch": 1.4814814814814814, "grad_norm": 0.275390625, "learning_rate": 0.00027035359040804703, "loss": 0.4753, "step": 920 }, { "epoch": 1.4879227053140096, "grad_norm": 0.279296875, "learning_rate": 0.0002700916263561778, "loss": 0.5255, "step": 924 }, { "epoch": 1.4943639291465378, "grad_norm": 0.298828125, "learning_rate": 0.0002698286380932667, "loss": 0.5472, "step": 928 }, { "epoch": 1.500805152979066, "grad_norm": 0.265625, "learning_rate": 0.0002695646278622302, "loss": 0.4944, "step": 932 }, { "epoch": 1.5072463768115942, "grad_norm": 0.26953125, "learning_rate": 0.0002692995979147007, "loss": 0.4677, "step": 936 }, { "epoch": 1.5136876006441224, "grad_norm": 0.28515625, "learning_rate": 0.00026903355051100734, "loss": 0.5152, "step": 940 }, { "epoch": 1.5201288244766507, "grad_norm": 0.279296875, "learning_rate": 0.0002687664879201565, "loss": 0.5287, "step": 944 }, { "epoch": 1.5265700483091789, "grad_norm": 0.2734375, "learning_rate": 0.00026849841241981313, "loss": 0.5185, "step": 948 }, { "epoch": 1.533011272141707, "grad_norm": 0.279296875, "learning_rate": 0.00026822932629628034, "loss": 0.4925, "step": 952 }, { "epoch": 1.539452495974235, "grad_norm": 0.279296875, "learning_rate": 0.0002679592318444808, "loss": 0.4938, "step": 956 }, { "epoch": 1.5458937198067633, "grad_norm": 0.271484375, "learning_rate": 0.0002676881313679366, "loss": 0.4962, "step": 960 }, { "epoch": 1.5523349436392915, "grad_norm": 0.275390625, "learning_rate": 0.0002674160271787498, "loss": 0.4962, "step": 964 }, { "epoch": 1.5587761674718197, "grad_norm": 0.26953125, "learning_rate": 0.0002671429215975828, "loss": 0.5142, "step": 968 }, { "epoch": 1.5652173913043477, "grad_norm": 0.28515625, "learning_rate": 0.00026686881695363833, "loss": 0.5361, "step": 972 }, { "epoch": 1.5716586151368759, "grad_norm": 0.287109375, "learning_rate": 0.0002665937155846399, "loss": 0.519, "step": 976 }, { "epoch": 1.578099838969404, "grad_norm": 0.2734375, "learning_rate": 0.0002663176198368114, "loss": 0.5055, "step": 980 }, { "epoch": 1.5845410628019323, "grad_norm": 0.2578125, "learning_rate": 0.0002660405320648576, "loss": 0.5256, "step": 984 }, { "epoch": 1.5909822866344605, "grad_norm": 0.28125, "learning_rate": 0.0002657624546319437, "loss": 0.5103, "step": 988 }, { "epoch": 1.5974235104669887, "grad_norm": 0.296875, "learning_rate": 0.0002654833899096753, "loss": 0.5249, "step": 992 }, { "epoch": 1.603864734299517, "grad_norm": 0.330078125, "learning_rate": 0.00026520334027807827, "loss": 0.4895, "step": 996 }, { "epoch": 1.6103059581320451, "grad_norm": 0.28125, "learning_rate": 0.0002649223081255782, "loss": 0.5061, "step": 1000 }, { "epoch": 1.6167471819645733, "grad_norm": 0.28515625, "learning_rate": 0.00026464029584898036, "loss": 0.4781, "step": 1004 }, { "epoch": 1.6231884057971016, "grad_norm": 0.275390625, "learning_rate": 0.00026435730585344896, "loss": 0.4885, "step": 1008 }, { "epoch": 1.6296296296296298, "grad_norm": 0.27734375, "learning_rate": 0.0002640733405524869, "loss": 0.5188, "step": 1012 }, { "epoch": 1.636070853462158, "grad_norm": 0.296875, "learning_rate": 0.00026378840236791485, "loss": 0.5386, "step": 1016 }, { "epoch": 1.642512077294686, "grad_norm": 0.2734375, "learning_rate": 0.000263502493729851, "loss": 0.5438, "step": 1020 }, { "epoch": 1.6489533011272142, "grad_norm": 0.279296875, "learning_rate": 0.00026321561707668995, "loss": 0.5121, "step": 1024 }, { "epoch": 1.6553945249597424, "grad_norm": 0.26953125, "learning_rate": 0.0002629277748550823, "loss": 0.4868, "step": 1028 }, { "epoch": 1.6618357487922706, "grad_norm": 0.287109375, "learning_rate": 0.0002626389695199134, "loss": 0.5199, "step": 1032 }, { "epoch": 1.6682769726247986, "grad_norm": 0.283203125, "learning_rate": 0.0002623492035342826, "loss": 0.5424, "step": 1036 }, { "epoch": 1.6747181964573268, "grad_norm": 0.279296875, "learning_rate": 0.00026205847936948244, "loss": 0.4983, "step": 1040 }, { "epoch": 1.681159420289855, "grad_norm": 0.27734375, "learning_rate": 0.00026176679950497706, "loss": 0.5323, "step": 1044 }, { "epoch": 1.6876006441223832, "grad_norm": 0.294921875, "learning_rate": 0.0002614741664283816, "loss": 0.5964, "step": 1048 }, { "epoch": 1.6940418679549114, "grad_norm": 0.287109375, "learning_rate": 0.00026118058263544056, "loss": 0.5227, "step": 1052 }, { "epoch": 1.7004830917874396, "grad_norm": 0.30859375, "learning_rate": 0.00026088605063000696, "loss": 0.464, "step": 1056 }, { "epoch": 1.7069243156199678, "grad_norm": 0.267578125, "learning_rate": 0.0002605905729240205, "loss": 0.4978, "step": 1060 }, { "epoch": 1.713365539452496, "grad_norm": 0.337890625, "learning_rate": 0.00026029415203748633, "loss": 0.4983, "step": 1064 }, { "epoch": 1.7198067632850242, "grad_norm": 0.3125, "learning_rate": 0.0002599967904984539, "loss": 0.5166, "step": 1068 }, { "epoch": 1.7262479871175525, "grad_norm": 0.296875, "learning_rate": 0.00025969849084299466, "loss": 0.5683, "step": 1072 }, { "epoch": 1.7326892109500807, "grad_norm": 0.287109375, "learning_rate": 0.00025939925561518126, "loss": 0.486, "step": 1076 }, { "epoch": 1.7391304347826086, "grad_norm": 0.279296875, "learning_rate": 0.0002590990873670652, "loss": 0.4655, "step": 1080 }, { "epoch": 1.7455716586151369, "grad_norm": 0.26953125, "learning_rate": 0.00025879798865865533, "loss": 0.4689, "step": 1084 }, { "epoch": 1.752012882447665, "grad_norm": 0.283203125, "learning_rate": 0.0002584959620578962, "loss": 0.424, "step": 1088 }, { "epoch": 1.7584541062801933, "grad_norm": 0.279296875, "learning_rate": 0.00025819301014064574, "loss": 0.5134, "step": 1092 }, { "epoch": 1.7648953301127213, "grad_norm": 0.279296875, "learning_rate": 0.0002578891354906537, "loss": 0.4893, "step": 1096 }, { "epoch": 1.7713365539452495, "grad_norm": 0.279296875, "learning_rate": 0.00025758434069953927, "loss": 0.4887, "step": 1100 }, { "epoch": 1.7777777777777777, "grad_norm": 0.283203125, "learning_rate": 0.0002572786283667692, "loss": 0.5153, "step": 1104 }, { "epoch": 1.7842190016103059, "grad_norm": 0.28125, "learning_rate": 0.00025697200109963563, "loss": 0.5056, "step": 1108 }, { "epoch": 1.790660225442834, "grad_norm": 0.275390625, "learning_rate": 0.0002566644615132337, "loss": 0.5319, "step": 1112 }, { "epoch": 1.7971014492753623, "grad_norm": 0.265625, "learning_rate": 0.00025635601223043933, "loss": 0.5182, "step": 1116 }, { "epoch": 1.8035426731078905, "grad_norm": 0.28515625, "learning_rate": 0.000256046655881887, "loss": 0.5028, "step": 1120 }, { "epoch": 1.8099838969404187, "grad_norm": 0.283203125, "learning_rate": 0.000255736395105947, "loss": 0.5006, "step": 1124 }, { "epoch": 1.816425120772947, "grad_norm": 0.279296875, "learning_rate": 0.0002554252325487032, "loss": 0.5234, "step": 1128 }, { "epoch": 1.8228663446054751, "grad_norm": 0.294921875, "learning_rate": 0.0002551131708639303, "loss": 0.5544, "step": 1132 }, { "epoch": 1.8293075684380034, "grad_norm": 0.28125, "learning_rate": 0.00025480021271307156, "loss": 0.4766, "step": 1136 }, { "epoch": 1.8357487922705316, "grad_norm": 0.283203125, "learning_rate": 0.00025448636076521534, "loss": 0.4615, "step": 1140 }, { "epoch": 1.8421900161030595, "grad_norm": 0.27734375, "learning_rate": 0.0002541716176970732, "loss": 0.504, "step": 1144 }, { "epoch": 1.8486312399355878, "grad_norm": 0.294921875, "learning_rate": 0.0002538559861929566, "loss": 0.5873, "step": 1148 }, { "epoch": 1.855072463768116, "grad_norm": 0.275390625, "learning_rate": 0.000253539468944754, "loss": 0.5917, "step": 1152 }, { "epoch": 1.8615136876006442, "grad_norm": 0.275390625, "learning_rate": 0.0002532220686519081, "loss": 0.4924, "step": 1156 }, { "epoch": 1.8679549114331722, "grad_norm": 0.296875, "learning_rate": 0.00025290378802139273, "loss": 0.4582, "step": 1160 }, { "epoch": 1.8743961352657004, "grad_norm": 0.3203125, "learning_rate": 0.0002525846297676896, "loss": 0.5639, "step": 1164 }, { "epoch": 1.8808373590982286, "grad_norm": 0.275390625, "learning_rate": 0.0002522645966127655, "loss": 0.5198, "step": 1168 }, { "epoch": 1.8872785829307568, "grad_norm": 0.26953125, "learning_rate": 0.0002519436912860488, "loss": 0.4766, "step": 1172 }, { "epoch": 1.893719806763285, "grad_norm": 0.28515625, "learning_rate": 0.0002516219165244062, "loss": 0.4583, "step": 1176 }, { "epoch": 1.9001610305958132, "grad_norm": 0.30078125, "learning_rate": 0.0002512992750721195, "loss": 0.549, "step": 1180 }, { "epoch": 1.9066022544283414, "grad_norm": 0.287109375, "learning_rate": 0.0002509757696808622, "loss": 0.4792, "step": 1184 }, { "epoch": 1.9130434782608696, "grad_norm": 0.27734375, "learning_rate": 0.0002506514031096758, "loss": 0.4834, "step": 1188 }, { "epoch": 1.9194847020933978, "grad_norm": 0.279296875, "learning_rate": 0.00025032617812494664, "loss": 0.4969, "step": 1192 }, { "epoch": 1.925925925925926, "grad_norm": 0.29296875, "learning_rate": 0.00025000009750038196, "loss": 0.5553, "step": 1196 }, { "epoch": 1.9323671497584543, "grad_norm": 0.294921875, "learning_rate": 0.00024967316401698647, "loss": 0.536, "step": 1200 }, { "epoch": 1.9388083735909822, "grad_norm": 0.265625, "learning_rate": 0.00024934538046303856, "loss": 0.4848, "step": 1204 }, { "epoch": 1.9452495974235104, "grad_norm": 0.28515625, "learning_rate": 0.0002490167496340664, "loss": 0.4984, "step": 1208 }, { "epoch": 1.9516908212560387, "grad_norm": 0.2734375, "learning_rate": 0.0002486872743328244, "loss": 0.4993, "step": 1212 }, { "epoch": 1.9581320450885669, "grad_norm": 0.296875, "learning_rate": 0.000248356957369269, "loss": 0.5265, "step": 1216 }, { "epoch": 1.9645732689210949, "grad_norm": 0.296875, "learning_rate": 0.0002480258015605349, "loss": 0.5287, "step": 1220 }, { "epoch": 1.971014492753623, "grad_norm": 0.287109375, "learning_rate": 0.0002476938097309108, "loss": 0.5616, "step": 1224 }, { "epoch": 1.9774557165861513, "grad_norm": 0.291015625, "learning_rate": 0.0002473609847118156, "loss": 0.4542, "step": 1228 }, { "epoch": 1.9838969404186795, "grad_norm": 0.26171875, "learning_rate": 0.0002470273293417741, "loss": 0.4813, "step": 1232 }, { "epoch": 1.9903381642512077, "grad_norm": 0.3125, "learning_rate": 0.00024669284646639287, "loss": 0.5336, "step": 1236 }, { "epoch": 1.996779388083736, "grad_norm": 0.279296875, "learning_rate": 0.00024635753893833585, "loss": 0.5528, "step": 1240 }, { "epoch": 2.003220611916264, "grad_norm": 0.2265625, "learning_rate": 0.00024602140961730006, "loss": 0.4706, "step": 1244 }, { "epoch": 2.0096618357487923, "grad_norm": 0.279296875, "learning_rate": 0.00024568446136999134, "loss": 0.4093, "step": 1248 }, { "epoch": 2.0161030595813205, "grad_norm": 0.275390625, "learning_rate": 0.00024534669707009974, "loss": 0.3899, "step": 1252 }, { "epoch": 2.0225442834138487, "grad_norm": 0.26953125, "learning_rate": 0.0002450081195982752, "loss": 0.361, "step": 1256 }, { "epoch": 2.028985507246377, "grad_norm": 0.267578125, "learning_rate": 0.00024466873184210273, "loss": 0.3999, "step": 1260 }, { "epoch": 2.035426731078905, "grad_norm": 0.28125, "learning_rate": 0.00024432853669607786, "loss": 0.3753, "step": 1264 }, { "epoch": 2.0418679549114334, "grad_norm": 0.26953125, "learning_rate": 0.00024398753706158225, "loss": 0.3951, "step": 1268 }, { "epoch": 2.0483091787439616, "grad_norm": 0.291015625, "learning_rate": 0.00024364573584685848, "loss": 0.3791, "step": 1272 }, { "epoch": 2.0547504025764893, "grad_norm": 0.28125, "learning_rate": 0.00024330313596698553, "loss": 0.4148, "step": 1276 }, { "epoch": 2.0611916264090175, "grad_norm": 0.271484375, "learning_rate": 0.00024295974034385396, "loss": 0.3767, "step": 1280 }, { "epoch": 2.0676328502415457, "grad_norm": 0.283203125, "learning_rate": 0.00024261555190614072, "loss": 0.3743, "step": 1284 }, { "epoch": 2.074074074074074, "grad_norm": 0.287109375, "learning_rate": 0.00024227057358928452, "loss": 0.3847, "step": 1288 }, { "epoch": 2.080515297906602, "grad_norm": 0.279296875, "learning_rate": 0.00024192480833546044, "loss": 0.3627, "step": 1292 }, { "epoch": 2.0869565217391304, "grad_norm": 0.275390625, "learning_rate": 0.00024157825909355523, "loss": 0.4324, "step": 1296 }, { "epoch": 2.0933977455716586, "grad_norm": 0.29296875, "learning_rate": 0.0002412309288191417, "loss": 0.4302, "step": 1300 }, { "epoch": 2.099838969404187, "grad_norm": 0.3046875, "learning_rate": 0.00024088282047445396, "loss": 0.3788, "step": 1304 }, { "epoch": 2.106280193236715, "grad_norm": 0.28125, "learning_rate": 0.00024053393702836185, "loss": 0.399, "step": 1308 }, { "epoch": 2.112721417069243, "grad_norm": 0.2890625, "learning_rate": 0.0002401842814563457, "loss": 0.387, "step": 1312 }, { "epoch": 2.1191626409017714, "grad_norm": 0.287109375, "learning_rate": 0.00023983385674047113, "loss": 0.3905, "step": 1316 }, { "epoch": 2.1256038647342996, "grad_norm": 0.298828125, "learning_rate": 0.00023948266586936324, "loss": 0.3715, "step": 1320 }, { "epoch": 2.132045088566828, "grad_norm": 0.314453125, "learning_rate": 0.00023913071183818155, "loss": 0.4474, "step": 1324 }, { "epoch": 2.138486312399356, "grad_norm": 0.291015625, "learning_rate": 0.00023877799764859416, "loss": 0.3759, "step": 1328 }, { "epoch": 2.1449275362318843, "grad_norm": 0.30859375, "learning_rate": 0.00023842452630875216, "loss": 0.373, "step": 1332 }, { "epoch": 2.151368760064412, "grad_norm": 0.296875, "learning_rate": 0.0002380703008332643, "loss": 0.4218, "step": 1336 }, { "epoch": 2.1578099838969402, "grad_norm": 0.306640625, "learning_rate": 0.0002377153242431708, "loss": 0.4234, "step": 1340 }, { "epoch": 2.1642512077294684, "grad_norm": 0.31640625, "learning_rate": 0.00023735959956591786, "loss": 0.3971, "step": 1344 }, { "epoch": 2.1706924315619966, "grad_norm": 0.322265625, "learning_rate": 0.0002370031298353319, "loss": 0.4211, "step": 1348 }, { "epoch": 2.177133655394525, "grad_norm": 0.2890625, "learning_rate": 0.00023664591809159353, "loss": 0.3972, "step": 1352 }, { "epoch": 2.183574879227053, "grad_norm": 0.306640625, "learning_rate": 0.00023628796738121169, "loss": 0.4185, "step": 1356 }, { "epoch": 2.1900161030595813, "grad_norm": 0.29296875, "learning_rate": 0.00023592928075699763, "loss": 0.402, "step": 1360 }, { "epoch": 2.1964573268921095, "grad_norm": 0.30078125, "learning_rate": 0.00023556986127803894, "loss": 0.4056, "step": 1364 }, { "epoch": 2.2028985507246377, "grad_norm": 0.345703125, "learning_rate": 0.00023520971200967334, "loss": 0.4506, "step": 1368 }, { "epoch": 2.209339774557166, "grad_norm": 0.302734375, "learning_rate": 0.00023484883602346274, "loss": 0.4093, "step": 1372 }, { "epoch": 2.215780998389694, "grad_norm": 0.310546875, "learning_rate": 0.0002344872363971668, "loss": 0.4717, "step": 1376 }, { "epoch": 2.2222222222222223, "grad_norm": 0.298828125, "learning_rate": 0.00023412491621471694, "loss": 0.3948, "step": 1380 }, { "epoch": 2.2286634460547505, "grad_norm": 0.29296875, "learning_rate": 0.00023376187856618972, "loss": 0.3925, "step": 1384 }, { "epoch": 2.2351046698872787, "grad_norm": 0.283203125, "learning_rate": 0.00023339812654778083, "loss": 0.4324, "step": 1388 }, { "epoch": 2.241545893719807, "grad_norm": 0.314453125, "learning_rate": 0.0002330336632617784, "loss": 0.4557, "step": 1392 }, { "epoch": 2.247987117552335, "grad_norm": 0.310546875, "learning_rate": 0.00023266849181653683, "loss": 0.4301, "step": 1396 }, { "epoch": 2.2544283413848634, "grad_norm": 0.275390625, "learning_rate": 0.00023230261532644985, "loss": 0.3799, "step": 1400 }, { "epoch": 2.260869565217391, "grad_norm": 0.287109375, "learning_rate": 0.0002319360369119245, "loss": 0.3826, "step": 1404 }, { "epoch": 2.2673107890499193, "grad_norm": 0.291015625, "learning_rate": 0.00023156875969935405, "loss": 0.3862, "step": 1408 }, { "epoch": 2.2737520128824475, "grad_norm": 0.28125, "learning_rate": 0.00023120078682109158, "loss": 0.4269, "step": 1412 }, { "epoch": 2.2801932367149758, "grad_norm": 0.302734375, "learning_rate": 0.00023083212141542328, "loss": 0.4139, "step": 1416 }, { "epoch": 2.286634460547504, "grad_norm": 0.296875, "learning_rate": 0.00023046276662654143, "loss": 0.3579, "step": 1420 }, { "epoch": 2.293075684380032, "grad_norm": 0.287109375, "learning_rate": 0.00023009272560451803, "loss": 0.4, "step": 1424 }, { "epoch": 2.2995169082125604, "grad_norm": 0.298828125, "learning_rate": 0.00022972200150527745, "loss": 0.3937, "step": 1428 }, { "epoch": 2.3059581320450886, "grad_norm": 0.318359375, "learning_rate": 0.00022935059749056992, "loss": 0.4553, "step": 1432 }, { "epoch": 2.312399355877617, "grad_norm": 0.259765625, "learning_rate": 0.00022897851672794417, "loss": 0.396, "step": 1436 }, { "epoch": 2.318840579710145, "grad_norm": 0.3046875, "learning_rate": 0.00022860576239072084, "loss": 0.5137, "step": 1440 }, { "epoch": 2.325281803542673, "grad_norm": 0.28515625, "learning_rate": 0.00022823233765796502, "loss": 0.4085, "step": 1444 }, { "epoch": 2.3317230273752014, "grad_norm": 0.302734375, "learning_rate": 0.0002278582457144595, "loss": 0.3963, "step": 1448 }, { "epoch": 2.3381642512077296, "grad_norm": 0.306640625, "learning_rate": 0.00022748348975067733, "loss": 0.4377, "step": 1452 }, { "epoch": 2.3446054750402574, "grad_norm": 0.322265625, "learning_rate": 0.00022710807296275472, "loss": 0.4275, "step": 1456 }, { "epoch": 2.3510466988727856, "grad_norm": 0.310546875, "learning_rate": 0.0002267319985524637, "loss": 0.4089, "step": 1460 }, { "epoch": 2.357487922705314, "grad_norm": 0.30859375, "learning_rate": 0.00022635526972718508, "loss": 0.4386, "step": 1464 }, { "epoch": 2.363929146537842, "grad_norm": 0.294921875, "learning_rate": 0.0002259778896998807, "loss": 0.4172, "step": 1468 }, { "epoch": 2.3703703703703702, "grad_norm": 0.34375, "learning_rate": 0.00022559986168906637, "loss": 0.4022, "step": 1472 }, { "epoch": 2.3768115942028984, "grad_norm": 0.2890625, "learning_rate": 0.00022522118891878418, "loss": 0.4665, "step": 1476 }, { "epoch": 2.3832528180354267, "grad_norm": 0.3125, "learning_rate": 0.00022484187461857517, "loss": 0.3916, "step": 1480 }, { "epoch": 2.389694041867955, "grad_norm": 0.306640625, "learning_rate": 0.00022446192202345156, "loss": 0.3918, "step": 1484 }, { "epoch": 2.396135265700483, "grad_norm": 0.29296875, "learning_rate": 0.00022408133437386968, "loss": 0.4198, "step": 1488 }, { "epoch": 2.4025764895330113, "grad_norm": 0.306640625, "learning_rate": 0.00022370011491570162, "loss": 0.3635, "step": 1492 }, { "epoch": 2.4090177133655395, "grad_norm": 0.310546875, "learning_rate": 0.000223318266900208, "loss": 0.4297, "step": 1496 }, { "epoch": 2.4154589371980677, "grad_norm": 0.29296875, "learning_rate": 0.00022293579358401023, "loss": 0.3819, "step": 1500 }, { "epoch": 2.421900161030596, "grad_norm": 0.294921875, "learning_rate": 0.0002225526982290625, "loss": 0.4068, "step": 1504 }, { "epoch": 2.428341384863124, "grad_norm": 0.328125, "learning_rate": 0.00022216898410262428, "loss": 0.3808, "step": 1508 }, { "epoch": 2.4347826086956523, "grad_norm": 0.298828125, "learning_rate": 0.00022178465447723214, "loss": 0.4037, "step": 1512 }, { "epoch": 2.4412238325281805, "grad_norm": 0.33203125, "learning_rate": 0.000221399712630672, "loss": 0.452, "step": 1516 }, { "epoch": 2.4476650563607087, "grad_norm": 0.296875, "learning_rate": 0.0002210141618459513, "loss": 0.4127, "step": 1520 }, { "epoch": 2.454106280193237, "grad_norm": 0.27734375, "learning_rate": 0.00022062800541127064, "loss": 0.3894, "step": 1524 }, { "epoch": 2.4605475040257647, "grad_norm": 0.296875, "learning_rate": 0.00022024124661999613, "loss": 0.4256, "step": 1528 }, { "epoch": 2.466988727858293, "grad_norm": 0.318359375, "learning_rate": 0.00021985388877063104, "loss": 0.4556, "step": 1532 }, { "epoch": 2.473429951690821, "grad_norm": 0.31640625, "learning_rate": 0.00021946593516678777, "loss": 0.4504, "step": 1536 }, { "epoch": 2.4798711755233493, "grad_norm": 0.322265625, "learning_rate": 0.00021907738911715964, "loss": 0.4062, "step": 1540 }, { "epoch": 2.4863123993558776, "grad_norm": 0.298828125, "learning_rate": 0.00021868825393549275, "loss": 0.4386, "step": 1544 }, { "epoch": 2.4927536231884058, "grad_norm": 0.30859375, "learning_rate": 0.0002182985329405576, "loss": 0.3559, "step": 1548 }, { "epoch": 2.499194847020934, "grad_norm": 0.28515625, "learning_rate": 0.00021790822945612088, "loss": 0.4244, "step": 1552 }, { "epoch": 2.505636070853462, "grad_norm": 0.314453125, "learning_rate": 0.0002175173468109171, "loss": 0.4028, "step": 1556 }, { "epoch": 2.5120772946859904, "grad_norm": 0.265625, "learning_rate": 0.00021712588833862014, "loss": 0.3726, "step": 1560 }, { "epoch": 2.5185185185185186, "grad_norm": 0.322265625, "learning_rate": 0.00021673385737781492, "loss": 0.481, "step": 1564 }, { "epoch": 2.524959742351047, "grad_norm": 0.30078125, "learning_rate": 0.00021634125727196883, "loss": 0.3778, "step": 1568 }, { "epoch": 2.531400966183575, "grad_norm": 0.296875, "learning_rate": 0.00021594809136940327, "loss": 0.4438, "step": 1572 }, { "epoch": 2.537842190016103, "grad_norm": 0.328125, "learning_rate": 0.00021555436302326514, "loss": 0.4399, "step": 1576 }, { "epoch": 2.544283413848631, "grad_norm": 0.3046875, "learning_rate": 0.00021516007559149803, "loss": 0.3979, "step": 1580 }, { "epoch": 2.550724637681159, "grad_norm": 0.310546875, "learning_rate": 0.00021476523243681397, "loss": 0.4085, "step": 1584 }, { "epoch": 2.5571658615136874, "grad_norm": 0.291015625, "learning_rate": 0.0002143698369266643, "loss": 0.3875, "step": 1588 }, { "epoch": 2.5636070853462156, "grad_norm": 0.287109375, "learning_rate": 0.0002139738924332113, "loss": 0.4288, "step": 1592 }, { "epoch": 2.570048309178744, "grad_norm": 0.3046875, "learning_rate": 0.0002135774023332992, "loss": 0.4155, "step": 1596 }, { "epoch": 2.576489533011272, "grad_norm": 0.2890625, "learning_rate": 0.00021318037000842558, "loss": 0.377, "step": 1600 }, { "epoch": 2.5829307568438002, "grad_norm": 0.326171875, "learning_rate": 0.00021278279884471242, "loss": 0.4134, "step": 1604 }, { "epoch": 2.5893719806763285, "grad_norm": 0.3203125, "learning_rate": 0.0002123846922328771, "loss": 0.3668, "step": 1608 }, { "epoch": 2.5958132045088567, "grad_norm": 0.31640625, "learning_rate": 0.00021198605356820377, "loss": 0.4207, "step": 1612 }, { "epoch": 2.602254428341385, "grad_norm": 0.318359375, "learning_rate": 0.00021158688625051416, "loss": 0.434, "step": 1616 }, { "epoch": 2.608695652173913, "grad_norm": 0.298828125, "learning_rate": 0.00021118719368413866, "loss": 0.3963, "step": 1620 }, { "epoch": 2.6151368760064413, "grad_norm": 0.33984375, "learning_rate": 0.0002107869792778873, "loss": 0.4366, "step": 1624 }, { "epoch": 2.6215780998389695, "grad_norm": 0.28125, "learning_rate": 0.00021038624644502063, "loss": 0.3604, "step": 1628 }, { "epoch": 2.6280193236714977, "grad_norm": 0.3046875, "learning_rate": 0.00020998499860322073, "loss": 0.4029, "step": 1632 }, { "epoch": 2.634460547504026, "grad_norm": 0.30859375, "learning_rate": 0.00020958323917456186, "loss": 0.429, "step": 1636 }, { "epoch": 2.640901771336554, "grad_norm": 0.326171875, "learning_rate": 0.00020918097158548145, "loss": 0.426, "step": 1640 }, { "epoch": 2.6473429951690823, "grad_norm": 0.318359375, "learning_rate": 0.0002087781992667509, "loss": 0.4129, "step": 1644 }, { "epoch": 2.6537842190016105, "grad_norm": 0.3125, "learning_rate": 0.000208374925653446, "loss": 0.3946, "step": 1648 }, { "epoch": 2.6602254428341388, "grad_norm": 0.32421875, "learning_rate": 0.00020797115418491816, "loss": 0.3564, "step": 1652 }, { "epoch": 2.6666666666666665, "grad_norm": 0.32421875, "learning_rate": 0.00020756688830476453, "loss": 0.4553, "step": 1656 }, { "epoch": 2.6731078904991947, "grad_norm": 0.318359375, "learning_rate": 0.0002071621314607991, "loss": 0.4497, "step": 1660 }, { "epoch": 2.679549114331723, "grad_norm": 0.30859375, "learning_rate": 0.00020675688710502293, "loss": 0.3987, "step": 1664 }, { "epoch": 2.685990338164251, "grad_norm": 0.306640625, "learning_rate": 0.00020635115869359498, "loss": 0.4695, "step": 1668 }, { "epoch": 2.6924315619967794, "grad_norm": 0.318359375, "learning_rate": 0.0002059449496868024, "loss": 0.4566, "step": 1672 }, { "epoch": 2.6988727858293076, "grad_norm": 0.314453125, "learning_rate": 0.00020553826354903121, "loss": 0.4199, "step": 1676 }, { "epoch": 2.7053140096618358, "grad_norm": 0.30078125, "learning_rate": 0.00020513110374873676, "loss": 0.3612, "step": 1680 }, { "epoch": 2.711755233494364, "grad_norm": 0.294921875, "learning_rate": 0.00020472347375841384, "loss": 0.383, "step": 1684 }, { "epoch": 2.718196457326892, "grad_norm": 0.294921875, "learning_rate": 0.0002043153770545675, "loss": 0.4051, "step": 1688 }, { "epoch": 2.7246376811594204, "grad_norm": 0.31640625, "learning_rate": 0.00020390681711768312, "loss": 0.4408, "step": 1692 }, { "epoch": 2.7310789049919486, "grad_norm": 0.3203125, "learning_rate": 0.00020349779743219682, "loss": 0.4155, "step": 1696 }, { "epoch": 2.7375201288244764, "grad_norm": 0.294921875, "learning_rate": 0.0002030883214864657, "loss": 0.4164, "step": 1700 }, { "epoch": 2.7439613526570046, "grad_norm": 0.28515625, "learning_rate": 0.0002026783927727381, "loss": 0.4013, "step": 1704 }, { "epoch": 2.750402576489533, "grad_norm": 0.328125, "learning_rate": 0.00020226801478712383, "loss": 0.3839, "step": 1708 }, { "epoch": 2.756843800322061, "grad_norm": 0.296875, "learning_rate": 0.00020185719102956438, "loss": 0.4691, "step": 1712 }, { "epoch": 2.763285024154589, "grad_norm": 0.3125, "learning_rate": 0.0002014459250038031, "loss": 0.3949, "step": 1716 }, { "epoch": 2.7697262479871174, "grad_norm": 0.298828125, "learning_rate": 0.00020103422021735507, "loss": 0.3918, "step": 1720 }, { "epoch": 2.7761674718196456, "grad_norm": 0.326171875, "learning_rate": 0.00020062208018147755, "loss": 0.4027, "step": 1724 }, { "epoch": 2.782608695652174, "grad_norm": 0.296875, "learning_rate": 0.00020020950841113984, "loss": 0.4319, "step": 1728 }, { "epoch": 2.789049919484702, "grad_norm": 0.3359375, "learning_rate": 0.00019979650842499324, "loss": 0.4255, "step": 1732 }, { "epoch": 2.7954911433172303, "grad_norm": 0.294921875, "learning_rate": 0.00019938308374534115, "loss": 0.4403, "step": 1736 }, { "epoch": 2.8019323671497585, "grad_norm": 0.306640625, "learning_rate": 0.00019896923789810905, "loss": 0.4311, "step": 1740 }, { "epoch": 2.8083735909822867, "grad_norm": 0.279296875, "learning_rate": 0.00019855497441281436, "loss": 0.407, "step": 1744 }, { "epoch": 2.814814814814815, "grad_norm": 0.287109375, "learning_rate": 0.00019814029682253644, "loss": 0.4184, "step": 1748 }, { "epoch": 2.821256038647343, "grad_norm": 0.283203125, "learning_rate": 0.00019772520866388605, "loss": 0.3812, "step": 1752 }, { "epoch": 2.8276972624798713, "grad_norm": 0.34375, "learning_rate": 0.00019730971347697602, "loss": 0.4228, "step": 1756 }, { "epoch": 2.8341384863123995, "grad_norm": 0.306640625, "learning_rate": 0.00019689381480539014, "loss": 0.4321, "step": 1760 }, { "epoch": 2.8405797101449277, "grad_norm": 0.31640625, "learning_rate": 0.00019647751619615353, "loss": 0.4321, "step": 1764 }, { "epoch": 2.847020933977456, "grad_norm": 0.3046875, "learning_rate": 0.00019606082119970214, "loss": 0.4502, "step": 1768 }, { "epoch": 2.853462157809984, "grad_norm": 0.306640625, "learning_rate": 0.00019564373336985268, "loss": 0.4298, "step": 1772 }, { "epoch": 2.8599033816425123, "grad_norm": 0.318359375, "learning_rate": 0.00019522625626377198, "loss": 0.4469, "step": 1776 }, { "epoch": 2.86634460547504, "grad_norm": 0.34375, "learning_rate": 0.00019480839344194695, "loss": 0.4033, "step": 1780 }, { "epoch": 2.8727858293075683, "grad_norm": 0.296875, "learning_rate": 0.00019439014846815413, "loss": 0.4381, "step": 1784 }, { "epoch": 2.8792270531400965, "grad_norm": 0.296875, "learning_rate": 0.00019397152490942919, "loss": 0.4205, "step": 1788 }, { "epoch": 2.8856682769726247, "grad_norm": 0.30078125, "learning_rate": 0.00019355252633603668, "loss": 0.4187, "step": 1792 }, { "epoch": 2.892109500805153, "grad_norm": 0.33203125, "learning_rate": 0.00019313315632143944, "loss": 0.3912, "step": 1796 }, { "epoch": 2.898550724637681, "grad_norm": 0.33203125, "learning_rate": 0.00019271341844226812, "loss": 0.4236, "step": 1800 }, { "epoch": 2.9049919484702094, "grad_norm": 0.28125, "learning_rate": 0.0001922933162782909, "loss": 0.3677, "step": 1804 }, { "epoch": 2.9114331723027376, "grad_norm": 0.310546875, "learning_rate": 0.00019187285341238261, "loss": 0.3979, "step": 1808 }, { "epoch": 2.917874396135266, "grad_norm": 0.298828125, "learning_rate": 0.00019145203343049453, "loss": 0.3967, "step": 1812 }, { "epoch": 2.924315619967794, "grad_norm": 0.30859375, "learning_rate": 0.00019103085992162343, "loss": 0.4128, "step": 1816 }, { "epoch": 2.930756843800322, "grad_norm": 0.298828125, "learning_rate": 0.00019060933647778135, "loss": 0.3968, "step": 1820 }, { "epoch": 2.9371980676328504, "grad_norm": 0.33203125, "learning_rate": 0.00019018746669396464, "loss": 0.4208, "step": 1824 }, { "epoch": 2.943639291465378, "grad_norm": 0.30859375, "learning_rate": 0.00018976525416812358, "loss": 0.413, "step": 1828 }, { "epoch": 2.9500805152979064, "grad_norm": 0.298828125, "learning_rate": 0.00018934270250113135, "loss": 0.4122, "step": 1832 }, { "epoch": 2.9565217391304346, "grad_norm": 0.3359375, "learning_rate": 0.00018891981529675376, "loss": 0.3961, "step": 1836 }, { "epoch": 2.962962962962963, "grad_norm": 0.31640625, "learning_rate": 0.00018849659616161808, "loss": 0.4498, "step": 1840 }, { "epoch": 2.969404186795491, "grad_norm": 0.302734375, "learning_rate": 0.00018807304870518263, "loss": 0.3935, "step": 1844 }, { "epoch": 2.975845410628019, "grad_norm": 0.3125, "learning_rate": 0.00018764917653970567, "loss": 0.4183, "step": 1848 }, { "epoch": 2.9822866344605474, "grad_norm": 0.326171875, "learning_rate": 0.000187224983280215, "loss": 0.4101, "step": 1852 }, { "epoch": 2.9887278582930756, "grad_norm": 0.314453125, "learning_rate": 0.00018680047254447665, "loss": 0.4587, "step": 1856 }, { "epoch": 2.995169082125604, "grad_norm": 0.294921875, "learning_rate": 0.0001863756479529644, "loss": 0.4216, "step": 1860 }, { "epoch": 3.001610305958132, "grad_norm": 0.255859375, "learning_rate": 0.00018595051312882892, "loss": 0.3842, "step": 1864 }, { "epoch": 3.0080515297906603, "grad_norm": 0.28515625, "learning_rate": 0.00018552507169786634, "loss": 0.3189, "step": 1868 }, { "epoch": 3.0144927536231885, "grad_norm": 0.31640625, "learning_rate": 0.00018509932728848804, "loss": 0.3061, "step": 1872 }, { "epoch": 3.0209339774557167, "grad_norm": 0.3046875, "learning_rate": 0.00018467328353168934, "loss": 0.3166, "step": 1876 }, { "epoch": 3.027375201288245, "grad_norm": 0.28125, "learning_rate": 0.00018424694406101838, "loss": 0.3081, "step": 1880 }, { "epoch": 3.033816425120773, "grad_norm": 0.28125, "learning_rate": 0.0001838203125125455, "loss": 0.2944, "step": 1884 }, { "epoch": 3.0402576489533013, "grad_norm": 0.294921875, "learning_rate": 0.00018339339252483196, "loss": 0.285, "step": 1888 }, { "epoch": 3.0466988727858295, "grad_norm": 0.283203125, "learning_rate": 0.00018296618773889912, "loss": 0.2926, "step": 1892 }, { "epoch": 3.0531400966183573, "grad_norm": 0.3125, "learning_rate": 0.000182538701798197, "loss": 0.3019, "step": 1896 }, { "epoch": 3.0595813204508855, "grad_norm": 0.314453125, "learning_rate": 0.00018211093834857379, "loss": 0.2984, "step": 1900 }, { "epoch": 3.0660225442834137, "grad_norm": 0.30859375, "learning_rate": 0.00018168290103824422, "loss": 0.3185, "step": 1904 }, { "epoch": 3.072463768115942, "grad_norm": 0.30859375, "learning_rate": 0.00018125459351775873, "loss": 0.3192, "step": 1908 }, { "epoch": 3.07890499194847, "grad_norm": 0.3125, "learning_rate": 0.00018082601943997232, "loss": 0.3459, "step": 1912 }, { "epoch": 3.0853462157809983, "grad_norm": 0.3203125, "learning_rate": 0.00018039718246001325, "loss": 0.2837, "step": 1916 }, { "epoch": 3.0917874396135265, "grad_norm": 0.32421875, "learning_rate": 0.000179968086235252, "loss": 0.3134, "step": 1920 }, { "epoch": 3.0982286634460547, "grad_norm": 0.30859375, "learning_rate": 0.00017953873442527008, "loss": 0.2907, "step": 1924 }, { "epoch": 3.104669887278583, "grad_norm": 0.314453125, "learning_rate": 0.00017910913069182872, "loss": 0.3076, "step": 1928 }, { "epoch": 3.111111111111111, "grad_norm": 0.302734375, "learning_rate": 0.00017867927869883775, "loss": 0.293, "step": 1932 }, { "epoch": 3.1175523349436394, "grad_norm": 0.302734375, "learning_rate": 0.00017824918211232422, "loss": 0.2775, "step": 1936 }, { "epoch": 3.1239935587761676, "grad_norm": 0.326171875, "learning_rate": 0.00017781884460040136, "loss": 0.3037, "step": 1940 }, { "epoch": 3.130434782608696, "grad_norm": 0.34375, "learning_rate": 0.00017738826983323703, "loss": 0.3139, "step": 1944 }, { "epoch": 3.136876006441224, "grad_norm": 0.328125, "learning_rate": 0.00017695746148302252, "loss": 0.3081, "step": 1948 }, { "epoch": 3.143317230273752, "grad_norm": 0.302734375, "learning_rate": 0.00017652642322394142, "loss": 0.344, "step": 1952 }, { "epoch": 3.14975845410628, "grad_norm": 0.3125, "learning_rate": 0.00017609515873213787, "loss": 0.3006, "step": 1956 }, { "epoch": 3.156199677938808, "grad_norm": 0.326171875, "learning_rate": 0.00017566367168568572, "loss": 0.2933, "step": 1960 }, { "epoch": 3.1626409017713364, "grad_norm": 0.30078125, "learning_rate": 0.00017523196576455663, "loss": 0.2869, "step": 1964 }, { "epoch": 3.1690821256038646, "grad_norm": 0.322265625, "learning_rate": 0.00017480004465058918, "loss": 0.2935, "step": 1968 }, { "epoch": 3.175523349436393, "grad_norm": 0.291015625, "learning_rate": 0.00017436791202745706, "loss": 0.3451, "step": 1972 }, { "epoch": 3.181964573268921, "grad_norm": 0.3125, "learning_rate": 0.00017393557158063803, "loss": 0.3047, "step": 1976 }, { "epoch": 3.1884057971014492, "grad_norm": 0.32421875, "learning_rate": 0.00017350302699738204, "loss": 0.327, "step": 1980 }, { "epoch": 3.1948470209339774, "grad_norm": 0.337890625, "learning_rate": 0.00017307028196668028, "loss": 0.3238, "step": 1984 }, { "epoch": 3.2012882447665056, "grad_norm": 0.326171875, "learning_rate": 0.0001726373401792333, "loss": 0.2957, "step": 1988 }, { "epoch": 3.207729468599034, "grad_norm": 0.296875, "learning_rate": 0.00017220420532741977, "loss": 0.3124, "step": 1992 }, { "epoch": 3.214170692431562, "grad_norm": 0.34375, "learning_rate": 0.00017177088110526486, "loss": 0.2852, "step": 1996 }, { "epoch": 3.2206119162640903, "grad_norm": 0.29296875, "learning_rate": 0.00017133737120840907, "loss": 0.3084, "step": 2000 }, { "epoch": 3.2270531400966185, "grad_norm": 0.310546875, "learning_rate": 0.000170903679334076, "loss": 0.2671, "step": 2004 }, { "epoch": 3.2334943639291467, "grad_norm": 0.291015625, "learning_rate": 0.00017046980918104164, "loss": 0.2851, "step": 2008 }, { "epoch": 3.239935587761675, "grad_norm": 0.314453125, "learning_rate": 0.0001700357644496022, "loss": 0.2921, "step": 2012 }, { "epoch": 3.246376811594203, "grad_norm": 0.3359375, "learning_rate": 0.00016960154884154298, "loss": 0.2898, "step": 2016 }, { "epoch": 3.2528180354267313, "grad_norm": 0.3203125, "learning_rate": 0.00016916716606010646, "loss": 0.3277, "step": 2020 }, { "epoch": 3.259259259259259, "grad_norm": 0.3046875, "learning_rate": 0.00016873261980996095, "loss": 0.3301, "step": 2024 }, { "epoch": 3.2657004830917873, "grad_norm": 0.306640625, "learning_rate": 0.00016829791379716896, "loss": 0.3639, "step": 2028 }, { "epoch": 3.2721417069243155, "grad_norm": 0.353515625, "learning_rate": 0.00016786305172915544, "loss": 0.3492, "step": 2032 }, { "epoch": 3.2785829307568437, "grad_norm": 0.3359375, "learning_rate": 0.0001674280373146763, "loss": 0.3233, "step": 2036 }, { "epoch": 3.285024154589372, "grad_norm": 0.32421875, "learning_rate": 0.00016699287426378683, "loss": 0.3232, "step": 2040 }, { "epoch": 3.2914653784219, "grad_norm": 0.31640625, "learning_rate": 0.0001665575662878099, "loss": 0.326, "step": 2044 }, { "epoch": 3.2979066022544283, "grad_norm": 0.337890625, "learning_rate": 0.00016612211709930442, "loss": 0.3169, "step": 2048 }, { "epoch": 3.3043478260869565, "grad_norm": 0.3046875, "learning_rate": 0.00016568653041203356, "loss": 0.3036, "step": 2052 }, { "epoch": 3.3107890499194848, "grad_norm": 0.33203125, "learning_rate": 0.00016525080994093328, "loss": 0.2987, "step": 2056 }, { "epoch": 3.317230273752013, "grad_norm": 0.30859375, "learning_rate": 0.00016481495940208046, "loss": 0.3108, "step": 2060 }, { "epoch": 3.323671497584541, "grad_norm": 0.330078125, "learning_rate": 0.0001643789825126613, "loss": 0.3119, "step": 2064 }, { "epoch": 3.3301127214170694, "grad_norm": 0.330078125, "learning_rate": 0.0001639428829909396, "loss": 0.342, "step": 2068 }, { "epoch": 3.3365539452495976, "grad_norm": 0.3046875, "learning_rate": 0.00016350666455622497, "loss": 0.3025, "step": 2072 }, { "epoch": 3.342995169082126, "grad_norm": 0.337890625, "learning_rate": 0.0001630703309288412, "loss": 0.3136, "step": 2076 }, { "epoch": 3.3494363929146536, "grad_norm": 0.365234375, "learning_rate": 0.00016263388583009463, "loss": 0.2957, "step": 2080 }, { "epoch": 3.3558776167471818, "grad_norm": 0.341796875, "learning_rate": 0.0001621973329822421, "loss": 0.2948, "step": 2084 }, { "epoch": 3.36231884057971, "grad_norm": 0.318359375, "learning_rate": 0.00016176067610845958, "loss": 0.3298, "step": 2088 }, { "epoch": 3.368760064412238, "grad_norm": 0.35546875, "learning_rate": 0.00016132391893281003, "loss": 0.327, "step": 2092 }, { "epoch": 3.3752012882447664, "grad_norm": 0.326171875, "learning_rate": 0.0001608870651802121, "loss": 0.3009, "step": 2096 }, { "epoch": 3.3816425120772946, "grad_norm": 0.3359375, "learning_rate": 0.00016045011857640783, "loss": 0.3148, "step": 2100 }, { "epoch": 3.388083735909823, "grad_norm": 0.298828125, "learning_rate": 0.0001600130828479314, "loss": 0.3282, "step": 2104 }, { "epoch": 3.394524959742351, "grad_norm": 0.314453125, "learning_rate": 0.0001595759617220769, "loss": 0.3203, "step": 2108 }, { "epoch": 3.4009661835748792, "grad_norm": 0.32421875, "learning_rate": 0.00015913875892686685, "loss": 0.2977, "step": 2112 }, { "epoch": 3.4074074074074074, "grad_norm": 0.34765625, "learning_rate": 0.00015870147819102025, "loss": 0.2806, "step": 2116 }, { "epoch": 3.4138486312399356, "grad_norm": 0.337890625, "learning_rate": 0.00015826412324392085, "loss": 0.3096, "step": 2120 }, { "epoch": 3.420289855072464, "grad_norm": 0.3203125, "learning_rate": 0.00015782669781558528, "loss": 0.301, "step": 2124 }, { "epoch": 3.426731078904992, "grad_norm": 0.34375, "learning_rate": 0.00015738920563663136, "loss": 0.3055, "step": 2128 }, { "epoch": 3.4331723027375203, "grad_norm": 0.349609375, "learning_rate": 0.00015695165043824605, "loss": 0.3187, "step": 2132 }, { "epoch": 3.4396135265700485, "grad_norm": 0.322265625, "learning_rate": 0.00015651403595215392, "loss": 0.308, "step": 2136 }, { "epoch": 3.4460547504025767, "grad_norm": 0.34375, "learning_rate": 0.00015607636591058506, "loss": 0.3033, "step": 2140 }, { "epoch": 3.452495974235105, "grad_norm": 0.32421875, "learning_rate": 0.0001556386440462435, "loss": 0.3313, "step": 2144 }, { "epoch": 3.4589371980676327, "grad_norm": 0.318359375, "learning_rate": 0.0001552008740922751, "loss": 0.2891, "step": 2148 }, { "epoch": 3.465378421900161, "grad_norm": 0.345703125, "learning_rate": 0.00015476305978223606, "loss": 0.3416, "step": 2152 }, { "epoch": 3.471819645732689, "grad_norm": 0.33203125, "learning_rate": 0.00015432520485006055, "loss": 0.2768, "step": 2156 }, { "epoch": 3.4782608695652173, "grad_norm": 0.345703125, "learning_rate": 0.00015388731303002954, "loss": 0.3216, "step": 2160 }, { "epoch": 3.4847020933977455, "grad_norm": 0.33984375, "learning_rate": 0.0001534493880567384, "loss": 0.3112, "step": 2164 }, { "epoch": 3.4911433172302737, "grad_norm": 0.314453125, "learning_rate": 0.00015301143366506527, "loss": 0.323, "step": 2168 }, { "epoch": 3.497584541062802, "grad_norm": 0.330078125, "learning_rate": 0.00015257345359013928, "loss": 0.3406, "step": 2172 }, { "epoch": 3.50402576489533, "grad_norm": 0.330078125, "learning_rate": 0.00015213545156730847, "loss": 0.2904, "step": 2176 }, { "epoch": 3.5104669887278583, "grad_norm": 0.333984375, "learning_rate": 0.00015169743133210814, "loss": 0.3107, "step": 2180 }, { "epoch": 3.5169082125603865, "grad_norm": 0.357421875, "learning_rate": 0.0001512593966202289, "loss": 0.3377, "step": 2184 }, { "epoch": 3.5233494363929148, "grad_norm": 0.390625, "learning_rate": 0.00015082135116748483, "loss": 0.3491, "step": 2188 }, { "epoch": 3.529790660225443, "grad_norm": 0.33984375, "learning_rate": 0.00015038329870978168, "loss": 0.2865, "step": 2192 }, { "epoch": 3.536231884057971, "grad_norm": 0.337890625, "learning_rate": 0.00014994524298308479, "loss": 0.2913, "step": 2196 }, { "epoch": 3.542673107890499, "grad_norm": 0.35546875, "learning_rate": 0.0001495071877233875, "loss": 0.3163, "step": 2200 }, { "epoch": 3.549114331723027, "grad_norm": 0.337890625, "learning_rate": 0.00014906913666667913, "loss": 0.2722, "step": 2204 }, { "epoch": 3.5555555555555554, "grad_norm": 0.302734375, "learning_rate": 0.00014863109354891317, "loss": 0.3163, "step": 2208 }, { "epoch": 3.5619967793880836, "grad_norm": 0.33203125, "learning_rate": 0.00014819306210597536, "loss": 0.3735, "step": 2212 }, { "epoch": 3.5684380032206118, "grad_norm": 0.35546875, "learning_rate": 0.00014775504607365196, "loss": 0.3303, "step": 2216 }, { "epoch": 3.57487922705314, "grad_norm": 0.3203125, "learning_rate": 0.00014731704918759765, "loss": 0.2946, "step": 2220 }, { "epoch": 3.581320450885668, "grad_norm": 0.318359375, "learning_rate": 0.000146879075183304, "loss": 0.3434, "step": 2224 }, { "epoch": 3.5877616747181964, "grad_norm": 0.36328125, "learning_rate": 0.00014644112779606727, "loss": 0.3063, "step": 2228 }, { "epoch": 3.5942028985507246, "grad_norm": 0.333984375, "learning_rate": 0.00014600321076095683, "loss": 0.2962, "step": 2232 }, { "epoch": 3.600644122383253, "grad_norm": 0.34765625, "learning_rate": 0.00014556532781278316, "loss": 0.3006, "step": 2236 }, { "epoch": 3.607085346215781, "grad_norm": 0.341796875, "learning_rate": 0.00014512748268606592, "loss": 0.3688, "step": 2240 }, { "epoch": 3.6135265700483092, "grad_norm": 0.33203125, "learning_rate": 0.00014468967911500242, "loss": 0.3348, "step": 2244 }, { "epoch": 3.6199677938808374, "grad_norm": 0.330078125, "learning_rate": 0.0001442519208334353, "loss": 0.3128, "step": 2248 }, { "epoch": 3.6264090177133657, "grad_norm": 0.341796875, "learning_rate": 0.00014381421157482125, "loss": 0.3488, "step": 2252 }, { "epoch": 3.632850241545894, "grad_norm": 0.31640625, "learning_rate": 0.0001433765550721985, "loss": 0.2614, "step": 2256 }, { "epoch": 3.639291465378422, "grad_norm": 0.34765625, "learning_rate": 0.00014293895505815575, "loss": 0.2984, "step": 2260 }, { "epoch": 3.6457326892109503, "grad_norm": 0.31640625, "learning_rate": 0.00014250141526479953, "loss": 0.3257, "step": 2264 }, { "epoch": 3.6521739130434785, "grad_norm": 0.330078125, "learning_rate": 0.00014206393942372314, "loss": 0.3235, "step": 2268 }, { "epoch": 3.6586151368760067, "grad_norm": 0.357421875, "learning_rate": 0.0001416265312659741, "loss": 0.3435, "step": 2272 }, { "epoch": 3.6650563607085345, "grad_norm": 0.32421875, "learning_rate": 0.00014118919452202306, "loss": 0.3191, "step": 2276 }, { "epoch": 3.6714975845410627, "grad_norm": 0.318359375, "learning_rate": 0.00014075193292173126, "loss": 0.2869, "step": 2280 }, { "epoch": 3.677938808373591, "grad_norm": 0.318359375, "learning_rate": 0.00014031475019431934, "loss": 0.3089, "step": 2284 }, { "epoch": 3.684380032206119, "grad_norm": 0.322265625, "learning_rate": 0.00013987765006833518, "loss": 0.3332, "step": 2288 }, { "epoch": 3.6908212560386473, "grad_norm": 0.30859375, "learning_rate": 0.0001394406362716221, "loss": 0.3127, "step": 2292 }, { "epoch": 3.6972624798711755, "grad_norm": 0.32421875, "learning_rate": 0.00013900371253128727, "loss": 0.3177, "step": 2296 }, { "epoch": 3.7037037037037037, "grad_norm": 0.337890625, "learning_rate": 0.0001385668825736697, "loss": 0.3324, "step": 2300 }, { "epoch": 3.710144927536232, "grad_norm": 0.32421875, "learning_rate": 0.0001381301501243087, "loss": 0.2785, "step": 2304 }, { "epoch": 3.71658615136876, "grad_norm": 0.353515625, "learning_rate": 0.00013769351890791185, "loss": 0.3274, "step": 2308 }, { "epoch": 3.7230273752012883, "grad_norm": 0.3359375, "learning_rate": 0.00013725699264832344, "loss": 0.3041, "step": 2312 }, { "epoch": 3.7294685990338166, "grad_norm": 0.326171875, "learning_rate": 0.00013682057506849256, "loss": 0.3343, "step": 2316 }, { "epoch": 3.7359098228663448, "grad_norm": 0.3125, "learning_rate": 0.00013638426989044148, "loss": 0.2785, "step": 2320 }, { "epoch": 3.7423510466988725, "grad_norm": 0.34375, "learning_rate": 0.00013594808083523376, "loss": 0.3454, "step": 2324 }, { "epoch": 3.7487922705314007, "grad_norm": 0.33203125, "learning_rate": 0.00013551201162294275, "loss": 0.312, "step": 2328 }, { "epoch": 3.755233494363929, "grad_norm": 0.3359375, "learning_rate": 0.00013507606597261946, "loss": 0.2885, "step": 2332 }, { "epoch": 3.761674718196457, "grad_norm": 0.337890625, "learning_rate": 0.00013464024760226142, "loss": 0.3328, "step": 2336 }, { "epoch": 3.7681159420289854, "grad_norm": 0.33203125, "learning_rate": 0.0001342045602287803, "loss": 0.3078, "step": 2340 }, { "epoch": 3.7745571658615136, "grad_norm": 0.326171875, "learning_rate": 0.00013376900756797085, "loss": 0.3126, "step": 2344 }, { "epoch": 3.780998389694042, "grad_norm": 0.3125, "learning_rate": 0.00013333359333447865, "loss": 0.2941, "step": 2348 }, { "epoch": 3.78743961352657, "grad_norm": 0.353515625, "learning_rate": 0.0001328983212417689, "loss": 0.3251, "step": 2352 }, { "epoch": 3.793880837359098, "grad_norm": 0.341796875, "learning_rate": 0.0001324631950020945, "loss": 0.3367, "step": 2356 }, { "epoch": 3.8003220611916264, "grad_norm": 0.365234375, "learning_rate": 0.0001320282183264643, "loss": 0.3164, "step": 2360 }, { "epoch": 3.8067632850241546, "grad_norm": 0.353515625, "learning_rate": 0.00013159339492461176, "loss": 0.3584, "step": 2364 }, { "epoch": 3.813204508856683, "grad_norm": 0.34375, "learning_rate": 0.00013115872850496293, "loss": 0.3307, "step": 2368 }, { "epoch": 3.819645732689211, "grad_norm": 0.33984375, "learning_rate": 0.0001307242227746053, "loss": 0.3475, "step": 2372 }, { "epoch": 3.8260869565217392, "grad_norm": 0.345703125, "learning_rate": 0.00013028988143925553, "loss": 0.3058, "step": 2376 }, { "epoch": 3.8325281803542675, "grad_norm": 0.345703125, "learning_rate": 0.00012985570820322868, "loss": 0.2718, "step": 2380 }, { "epoch": 3.8389694041867957, "grad_norm": 0.333984375, "learning_rate": 0.00012942170676940576, "loss": 0.3074, "step": 2384 }, { "epoch": 3.845410628019324, "grad_norm": 0.32421875, "learning_rate": 0.00012898788083920282, "loss": 0.3177, "step": 2388 }, { "epoch": 3.851851851851852, "grad_norm": 0.341796875, "learning_rate": 0.0001285542341125389, "loss": 0.3012, "step": 2392 }, { "epoch": 3.8582930756843803, "grad_norm": 0.30859375, "learning_rate": 0.0001281207702878049, "loss": 0.3024, "step": 2396 }, { "epoch": 3.864734299516908, "grad_norm": 0.328125, "learning_rate": 0.00012768749306183165, "loss": 0.3092, "step": 2400 }, { "epoch": 3.8711755233494363, "grad_norm": 0.34375, "learning_rate": 0.00012725440612985868, "loss": 0.2978, "step": 2404 }, { "epoch": 3.8776167471819645, "grad_norm": 0.337890625, "learning_rate": 0.0001268215131855025, "loss": 0.3337, "step": 2408 }, { "epoch": 3.8840579710144927, "grad_norm": 0.314453125, "learning_rate": 0.00012638881792072522, "loss": 0.3278, "step": 2412 }, { "epoch": 3.890499194847021, "grad_norm": 0.318359375, "learning_rate": 0.00012595632402580305, "loss": 0.3051, "step": 2416 }, { "epoch": 3.896940418679549, "grad_norm": 0.310546875, "learning_rate": 0.00012552403518929472, "loss": 0.2764, "step": 2420 }, { "epoch": 3.9033816425120773, "grad_norm": 0.322265625, "learning_rate": 0.0001250919550980102, "loss": 0.3124, "step": 2424 }, { "epoch": 3.9098228663446055, "grad_norm": 0.322265625, "learning_rate": 0.00012466008743697906, "loss": 0.3407, "step": 2428 }, { "epoch": 3.9162640901771337, "grad_norm": 0.34375, "learning_rate": 0.00012422843588941925, "loss": 0.3336, "step": 2432 }, { "epoch": 3.922705314009662, "grad_norm": 0.33203125, "learning_rate": 0.00012379700413670547, "loss": 0.2992, "step": 2436 }, { "epoch": 3.92914653784219, "grad_norm": 0.32421875, "learning_rate": 0.00012336579585833798, "loss": 0.3341, "step": 2440 }, { "epoch": 3.9355877616747184, "grad_norm": 0.345703125, "learning_rate": 0.00012293481473191103, "loss": 0.3153, "step": 2444 }, { "epoch": 3.942028985507246, "grad_norm": 0.322265625, "learning_rate": 0.00012250406443308168, "loss": 0.2993, "step": 2448 }, { "epoch": 3.9484702093397743, "grad_norm": 0.328125, "learning_rate": 0.00012207354863553825, "loss": 0.3144, "step": 2452 }, { "epoch": 3.9549114331723025, "grad_norm": 0.330078125, "learning_rate": 0.00012164327101096923, "loss": 0.3251, "step": 2456 }, { "epoch": 3.9613526570048307, "grad_norm": 0.3125, "learning_rate": 0.00012121323522903167, "loss": 0.2799, "step": 2460 }, { "epoch": 3.967793880837359, "grad_norm": 0.330078125, "learning_rate": 0.00012078344495732028, "loss": 0.3188, "step": 2464 }, { "epoch": 3.974235104669887, "grad_norm": 0.333984375, "learning_rate": 0.00012035390386133558, "loss": 0.3052, "step": 2468 }, { "epoch": 3.9806763285024154, "grad_norm": 0.3203125, "learning_rate": 0.00011992461560445337, "loss": 0.2771, "step": 2472 }, { "epoch": 3.9871175523349436, "grad_norm": 0.326171875, "learning_rate": 0.00011949558384789271, "loss": 0.3164, "step": 2476 }, { "epoch": 3.993558776167472, "grad_norm": 0.337890625, "learning_rate": 0.00011906681225068535, "loss": 0.2902, "step": 2480 }, { "epoch": 4.0, "grad_norm": 0.486328125, "learning_rate": 0.00011863830446964417, "loss": 0.3142, "step": 2484 }, { "epoch": 4.006441223832528, "grad_norm": 0.259765625, "learning_rate": 0.00011821006415933199, "loss": 0.2147, "step": 2488 }, { "epoch": 4.012882447665056, "grad_norm": 0.322265625, "learning_rate": 0.00011778209497203062, "loss": 0.2092, "step": 2492 }, { "epoch": 4.019323671497585, "grad_norm": 0.3203125, "learning_rate": 0.00011735440055770945, "loss": 0.2548, "step": 2496 }, { "epoch": 4.025764895330113, "grad_norm": 0.306640625, "learning_rate": 0.00011692698456399458, "loss": 0.2183, "step": 2500 }, { "epoch": 4.032206119162641, "grad_norm": 0.298828125, "learning_rate": 0.0001164998506361374, "loss": 0.2009, "step": 2504 }, { "epoch": 4.038647342995169, "grad_norm": 0.298828125, "learning_rate": 0.00011607300241698387, "loss": 0.218, "step": 2508 }, { "epoch": 4.0450885668276975, "grad_norm": 0.326171875, "learning_rate": 0.00011564644354694312, "loss": 0.2201, "step": 2512 }, { "epoch": 4.051529790660226, "grad_norm": 0.333984375, "learning_rate": 0.00011522017766395665, "loss": 0.2078, "step": 2516 }, { "epoch": 4.057971014492754, "grad_norm": 0.296875, "learning_rate": 0.00011479420840346706, "loss": 0.1932, "step": 2520 }, { "epoch": 4.064412238325282, "grad_norm": 0.30078125, "learning_rate": 0.00011436853939838734, "loss": 0.2217, "step": 2524 }, { "epoch": 4.07085346215781, "grad_norm": 0.30859375, "learning_rate": 0.0001139431742790696, "loss": 0.2448, "step": 2528 }, { "epoch": 4.0772946859903385, "grad_norm": 0.318359375, "learning_rate": 0.0001135181166732743, "loss": 0.2254, "step": 2532 }, { "epoch": 4.083735909822867, "grad_norm": 0.330078125, "learning_rate": 0.00011309337020613922, "loss": 0.2665, "step": 2536 }, { "epoch": 4.090177133655395, "grad_norm": 0.32421875, "learning_rate": 0.0001126689385001486, "loss": 0.2365, "step": 2540 }, { "epoch": 4.096618357487923, "grad_norm": 0.3125, "learning_rate": 0.00011224482517510224, "loss": 0.2341, "step": 2544 }, { "epoch": 4.1030595813204505, "grad_norm": 0.30078125, "learning_rate": 0.00011182103384808444, "loss": 0.2015, "step": 2548 }, { "epoch": 4.109500805152979, "grad_norm": 0.318359375, "learning_rate": 0.00011139756813343359, "loss": 0.2334, "step": 2552 }, { "epoch": 4.115942028985507, "grad_norm": 0.341796875, "learning_rate": 0.00011097443164271075, "loss": 0.246, "step": 2556 }, { "epoch": 4.122383252818035, "grad_norm": 0.34375, "learning_rate": 0.00011055162798466948, "loss": 0.2322, "step": 2560 }, { "epoch": 4.128824476650563, "grad_norm": 0.333984375, "learning_rate": 0.00011012916076522443, "loss": 0.2178, "step": 2564 }, { "epoch": 4.1352657004830915, "grad_norm": 0.291015625, "learning_rate": 0.00010970703358742127, "loss": 0.2147, "step": 2568 }, { "epoch": 4.14170692431562, "grad_norm": 0.333984375, "learning_rate": 0.00010928525005140521, "loss": 0.2315, "step": 2572 }, { "epoch": 4.148148148148148, "grad_norm": 0.33203125, "learning_rate": 0.00010886381375439105, "loss": 0.2284, "step": 2576 }, { "epoch": 4.154589371980676, "grad_norm": 0.3203125, "learning_rate": 0.0001084427282906318, "loss": 0.2568, "step": 2580 }, { "epoch": 4.161030595813204, "grad_norm": 0.314453125, "learning_rate": 0.00010802199725138869, "loss": 0.2163, "step": 2584 }, { "epoch": 4.1674718196457325, "grad_norm": 0.3203125, "learning_rate": 0.00010760162422489987, "loss": 0.2267, "step": 2588 }, { "epoch": 4.173913043478261, "grad_norm": 0.3359375, "learning_rate": 0.00010718161279635048, "loss": 0.2263, "step": 2592 }, { "epoch": 4.180354267310789, "grad_norm": 0.33984375, "learning_rate": 0.00010676196654784144, "loss": 0.2395, "step": 2596 }, { "epoch": 4.186795491143317, "grad_norm": 0.328125, "learning_rate": 0.00010634268905835949, "loss": 0.2454, "step": 2600 }, { "epoch": 4.193236714975845, "grad_norm": 0.310546875, "learning_rate": 0.00010592378390374612, "loss": 0.2186, "step": 2604 }, { "epoch": 4.199677938808374, "grad_norm": 0.33984375, "learning_rate": 0.00010550525465666751, "loss": 0.2302, "step": 2608 }, { "epoch": 4.206119162640902, "grad_norm": 0.328125, "learning_rate": 0.00010508710488658385, "loss": 0.2475, "step": 2612 }, { "epoch": 4.21256038647343, "grad_norm": 0.314453125, "learning_rate": 0.00010466933815971884, "loss": 0.1988, "step": 2616 }, { "epoch": 4.219001610305958, "grad_norm": 0.32421875, "learning_rate": 0.00010425195803902948, "loss": 0.2137, "step": 2620 }, { "epoch": 4.225442834138486, "grad_norm": 0.345703125, "learning_rate": 0.00010383496808417547, "loss": 0.2564, "step": 2624 }, { "epoch": 4.231884057971015, "grad_norm": 0.314453125, "learning_rate": 0.00010341837185148903, "loss": 0.2361, "step": 2628 }, { "epoch": 4.238325281803543, "grad_norm": 0.314453125, "learning_rate": 0.00010300217289394443, "loss": 0.2324, "step": 2632 }, { "epoch": 4.244766505636071, "grad_norm": 0.302734375, "learning_rate": 0.00010258637476112782, "loss": 0.2175, "step": 2636 }, { "epoch": 4.251207729468599, "grad_norm": 0.3203125, "learning_rate": 0.00010217098099920676, "loss": 0.2533, "step": 2640 }, { "epoch": 4.2576489533011275, "grad_norm": 0.31640625, "learning_rate": 0.00010175599515090026, "loss": 0.2155, "step": 2644 }, { "epoch": 4.264090177133656, "grad_norm": 0.3203125, "learning_rate": 0.00010134142075544824, "loss": 0.2299, "step": 2648 }, { "epoch": 4.270531400966184, "grad_norm": 0.3359375, "learning_rate": 0.00010092726134858168, "loss": 0.2776, "step": 2652 }, { "epoch": 4.276972624798712, "grad_norm": 0.345703125, "learning_rate": 0.00010051352046249213, "loss": 0.2079, "step": 2656 }, { "epoch": 4.28341384863124, "grad_norm": 0.328125, "learning_rate": 0.00010010020162580192, "loss": 0.198, "step": 2660 }, { "epoch": 4.2898550724637685, "grad_norm": 0.328125, "learning_rate": 9.96873083635337e-05, "loss": 0.223, "step": 2664 }, { "epoch": 4.296296296296296, "grad_norm": 0.33984375, "learning_rate": 9.927484419708076e-05, "loss": 0.187, "step": 2668 }, { "epoch": 4.302737520128824, "grad_norm": 0.33984375, "learning_rate": 9.88628126441768e-05, "loss": 0.2339, "step": 2672 }, { "epoch": 4.309178743961352, "grad_norm": 0.328125, "learning_rate": 9.84512172188657e-05, "loss": 0.2164, "step": 2676 }, { "epoch": 4.3156199677938805, "grad_norm": 0.30859375, "learning_rate": 9.804006143147212e-05, "loss": 0.2328, "step": 2680 }, { "epoch": 4.322061191626409, "grad_norm": 0.322265625, "learning_rate": 9.762934878857105e-05, "loss": 0.2577, "step": 2684 }, { "epoch": 4.328502415458937, "grad_norm": 0.3203125, "learning_rate": 9.721908279295812e-05, "loss": 0.2256, "step": 2688 }, { "epoch": 4.334943639291465, "grad_norm": 0.361328125, "learning_rate": 9.680926694361964e-05, "loss": 0.2344, "step": 2692 }, { "epoch": 4.341384863123993, "grad_norm": 0.31640625, "learning_rate": 9.639990473570294e-05, "loss": 0.2238, "step": 2696 }, { "epoch": 4.3478260869565215, "grad_norm": 0.30859375, "learning_rate": 9.599099966048627e-05, "loss": 0.1847, "step": 2700 }, { "epoch": 4.35426731078905, "grad_norm": 0.353515625, "learning_rate": 9.558255520534937e-05, "loss": 0.2451, "step": 2704 }, { "epoch": 4.360708534621578, "grad_norm": 0.328125, "learning_rate": 9.517457485374336e-05, "loss": 0.2112, "step": 2708 }, { "epoch": 4.367149758454106, "grad_norm": 0.30078125, "learning_rate": 9.476706208516138e-05, "loss": 0.2048, "step": 2712 }, { "epoch": 4.373590982286634, "grad_norm": 0.32421875, "learning_rate": 9.43600203751086e-05, "loss": 0.2036, "step": 2716 }, { "epoch": 4.3800322061191626, "grad_norm": 0.31640625, "learning_rate": 9.395345319507287e-05, "loss": 0.2125, "step": 2720 }, { "epoch": 4.386473429951691, "grad_norm": 0.328125, "learning_rate": 9.354736401249486e-05, "loss": 0.2199, "step": 2724 }, { "epoch": 4.392914653784219, "grad_norm": 0.341796875, "learning_rate": 9.31417562907387e-05, "loss": 0.2064, "step": 2728 }, { "epoch": 4.399355877616747, "grad_norm": 0.3046875, "learning_rate": 9.273663348906222e-05, "loss": 0.2183, "step": 2732 }, { "epoch": 4.405797101449275, "grad_norm": 0.318359375, "learning_rate": 9.233199906258766e-05, "loss": 0.2639, "step": 2736 }, { "epoch": 4.412238325281804, "grad_norm": 0.3515625, "learning_rate": 9.192785646227217e-05, "loss": 0.251, "step": 2740 }, { "epoch": 4.418679549114332, "grad_norm": 0.35546875, "learning_rate": 9.152420913487814e-05, "loss": 0.2386, "step": 2744 }, { "epoch": 4.42512077294686, "grad_norm": 0.33984375, "learning_rate": 9.112106052294418e-05, "loss": 0.217, "step": 2748 }, { "epoch": 4.431561996779388, "grad_norm": 0.337890625, "learning_rate": 9.071841406475539e-05, "loss": 0.2102, "step": 2752 }, { "epoch": 4.438003220611916, "grad_norm": 0.34765625, "learning_rate": 9.03162731943144e-05, "loss": 0.2282, "step": 2756 }, { "epoch": 4.444444444444445, "grad_norm": 0.302734375, "learning_rate": 8.991464134131166e-05, "loss": 0.2395, "step": 2760 }, { "epoch": 4.450885668276973, "grad_norm": 0.33203125, "learning_rate": 8.951352193109673e-05, "loss": 0.2379, "step": 2764 }, { "epoch": 4.457326892109501, "grad_norm": 0.318359375, "learning_rate": 8.911291838464838e-05, "loss": 0.2319, "step": 2768 }, { "epoch": 4.463768115942029, "grad_norm": 0.3359375, "learning_rate": 8.871283411854619e-05, "loss": 0.2066, "step": 2772 }, { "epoch": 4.4702093397745575, "grad_norm": 0.328125, "learning_rate": 8.831327254494066e-05, "loss": 0.2086, "step": 2776 }, { "epoch": 4.476650563607086, "grad_norm": 0.33984375, "learning_rate": 8.791423707152482e-05, "loss": 0.2454, "step": 2780 }, { "epoch": 4.483091787439614, "grad_norm": 0.31640625, "learning_rate": 8.751573110150443e-05, "loss": 0.2254, "step": 2784 }, { "epoch": 4.489533011272142, "grad_norm": 0.3359375, "learning_rate": 8.711775803356971e-05, "loss": 0.2446, "step": 2788 }, { "epoch": 4.49597423510467, "grad_norm": 0.3125, "learning_rate": 8.672032126186566e-05, "loss": 0.2154, "step": 2792 }, { "epoch": 4.5024154589371985, "grad_norm": 0.318359375, "learning_rate": 8.632342417596365e-05, "loss": 0.2269, "step": 2796 }, { "epoch": 4.508856682769727, "grad_norm": 0.326171875, "learning_rate": 8.592707016083221e-05, "loss": 0.2134, "step": 2800 }, { "epoch": 4.515297906602254, "grad_norm": 0.359375, "learning_rate": 8.553126259680828e-05, "loss": 0.25, "step": 2804 }, { "epoch": 4.521739130434782, "grad_norm": 0.3359375, "learning_rate": 8.513600485956835e-05, "loss": 0.2262, "step": 2808 }, { "epoch": 4.5281803542673105, "grad_norm": 0.322265625, "learning_rate": 8.474130032009951e-05, "loss": 0.2062, "step": 2812 }, { "epoch": 4.534621578099839, "grad_norm": 0.349609375, "learning_rate": 8.434715234467123e-05, "loss": 0.2293, "step": 2816 }, { "epoch": 4.541062801932367, "grad_norm": 0.337890625, "learning_rate": 8.395356429480587e-05, "loss": 0.204, "step": 2820 }, { "epoch": 4.547504025764895, "grad_norm": 0.326171875, "learning_rate": 8.356053952725072e-05, "loss": 0.2207, "step": 2824 }, { "epoch": 4.553945249597423, "grad_norm": 0.3359375, "learning_rate": 8.316808139394876e-05, "loss": 0.2231, "step": 2828 }, { "epoch": 4.5603864734299515, "grad_norm": 0.34765625, "learning_rate": 8.277619324201081e-05, "loss": 0.2321, "step": 2832 }, { "epoch": 4.56682769726248, "grad_norm": 0.330078125, "learning_rate": 8.238487841368617e-05, "loss": 0.2298, "step": 2836 }, { "epoch": 4.573268921095008, "grad_norm": 0.33203125, "learning_rate": 8.199414024633473e-05, "loss": 0.1997, "step": 2840 }, { "epoch": 4.579710144927536, "grad_norm": 0.3828125, "learning_rate": 8.160398207239805e-05, "loss": 0.2359, "step": 2844 }, { "epoch": 4.586151368760064, "grad_norm": 0.345703125, "learning_rate": 8.121440721937157e-05, "loss": 0.216, "step": 2848 }, { "epoch": 4.592592592592593, "grad_norm": 0.349609375, "learning_rate": 8.082541900977542e-05, "loss": 0.2374, "step": 2852 }, { "epoch": 4.599033816425121, "grad_norm": 0.328125, "learning_rate": 8.04370207611267e-05, "loss": 0.2363, "step": 2856 }, { "epoch": 4.605475040257649, "grad_norm": 0.361328125, "learning_rate": 8.004921578591091e-05, "loss": 0.214, "step": 2860 }, { "epoch": 4.611916264090177, "grad_norm": 0.30859375, "learning_rate": 7.966200739155389e-05, "loss": 0.2214, "step": 2864 }, { "epoch": 4.618357487922705, "grad_norm": 0.337890625, "learning_rate": 7.927539888039339e-05, "loss": 0.2431, "step": 2868 }, { "epoch": 4.624798711755234, "grad_norm": 0.31640625, "learning_rate": 7.888939354965093e-05, "loss": 0.2104, "step": 2872 }, { "epoch": 4.631239935587762, "grad_norm": 0.314453125, "learning_rate": 7.850399469140393e-05, "loss": 0.204, "step": 2876 }, { "epoch": 4.63768115942029, "grad_norm": 0.353515625, "learning_rate": 7.811920559255736e-05, "loss": 0.2263, "step": 2880 }, { "epoch": 4.644122383252818, "grad_norm": 0.3359375, "learning_rate": 7.773502953481585e-05, "loss": 0.2161, "step": 2884 }, { "epoch": 4.650563607085346, "grad_norm": 0.330078125, "learning_rate": 7.73514697946556e-05, "loss": 0.2279, "step": 2888 }, { "epoch": 4.657004830917875, "grad_norm": 0.361328125, "learning_rate": 7.696852964329655e-05, "loss": 0.2615, "step": 2892 }, { "epoch": 4.663446054750403, "grad_norm": 0.33203125, "learning_rate": 7.658621234667443e-05, "loss": 0.2407, "step": 2896 }, { "epoch": 4.669887278582931, "grad_norm": 0.326171875, "learning_rate": 7.620452116541291e-05, "loss": 0.2101, "step": 2900 }, { "epoch": 4.676328502415459, "grad_norm": 0.322265625, "learning_rate": 7.582345935479569e-05, "loss": 0.2191, "step": 2904 }, { "epoch": 4.6827697262479875, "grad_norm": 0.302734375, "learning_rate": 7.544303016473894e-05, "loss": 0.2159, "step": 2908 }, { "epoch": 4.689210950080515, "grad_norm": 0.32421875, "learning_rate": 7.506323683976344e-05, "loss": 0.2251, "step": 2912 }, { "epoch": 4.695652173913043, "grad_norm": 0.328125, "learning_rate": 7.468408261896701e-05, "loss": 0.1935, "step": 2916 }, { "epoch": 4.702093397745571, "grad_norm": 0.341796875, "learning_rate": 7.430557073599662e-05, "loss": 0.2123, "step": 2920 }, { "epoch": 4.708534621578099, "grad_norm": 0.384765625, "learning_rate": 7.392770441902116e-05, "loss": 0.2466, "step": 2924 }, { "epoch": 4.714975845410628, "grad_norm": 0.337890625, "learning_rate": 7.355048689070389e-05, "loss": 0.2332, "step": 2928 }, { "epoch": 4.721417069243156, "grad_norm": 0.349609375, "learning_rate": 7.317392136817453e-05, "loss": 0.2364, "step": 2932 }, { "epoch": 4.727858293075684, "grad_norm": 0.37109375, "learning_rate": 7.279801106300231e-05, "loss": 0.2662, "step": 2936 }, { "epoch": 4.734299516908212, "grad_norm": 0.31640625, "learning_rate": 7.242275918116832e-05, "loss": 0.2174, "step": 2940 }, { "epoch": 4.7407407407407405, "grad_norm": 0.34765625, "learning_rate": 7.204816892303833e-05, "loss": 0.2135, "step": 2944 }, { "epoch": 4.747181964573269, "grad_norm": 0.359375, "learning_rate": 7.16742434833352e-05, "loss": 0.231, "step": 2948 }, { "epoch": 4.753623188405797, "grad_norm": 0.34765625, "learning_rate": 7.1300986051112e-05, "loss": 0.2569, "step": 2952 }, { "epoch": 4.760064412238325, "grad_norm": 0.314453125, "learning_rate": 7.09283998097246e-05, "loss": 0.2072, "step": 2956 }, { "epoch": 4.766505636070853, "grad_norm": 0.32421875, "learning_rate": 7.055648793680466e-05, "loss": 0.2059, "step": 2960 }, { "epoch": 4.7729468599033815, "grad_norm": 0.34765625, "learning_rate": 7.018525360423217e-05, "loss": 0.2429, "step": 2964 }, { "epoch": 4.77938808373591, "grad_norm": 0.330078125, "learning_rate": 6.981469997810892e-05, "loss": 0.2203, "step": 2968 }, { "epoch": 4.785829307568438, "grad_norm": 0.3359375, "learning_rate": 6.944483021873115e-05, "loss": 0.232, "step": 2972 }, { "epoch": 4.792270531400966, "grad_norm": 0.318359375, "learning_rate": 6.907564748056273e-05, "loss": 0.2124, "step": 2976 }, { "epoch": 4.798711755233494, "grad_norm": 0.353515625, "learning_rate": 6.870715491220808e-05, "loss": 0.2184, "step": 2980 }, { "epoch": 4.805152979066023, "grad_norm": 0.31640625, "learning_rate": 6.833935565638559e-05, "loss": 0.238, "step": 2984 }, { "epoch": 4.811594202898551, "grad_norm": 0.333984375, "learning_rate": 6.797225284990064e-05, "loss": 0.2283, "step": 2988 }, { "epoch": 4.818035426731079, "grad_norm": 0.33203125, "learning_rate": 6.760584962361888e-05, "loss": 0.2351, "step": 2992 }, { "epoch": 4.824476650563607, "grad_norm": 0.298828125, "learning_rate": 6.72401491024396e-05, "loss": 0.2019, "step": 2996 }, { "epoch": 4.830917874396135, "grad_norm": 0.373046875, "learning_rate": 6.687515440526882e-05, "loss": 0.242, "step": 3000 }, { "epoch": 4.837359098228664, "grad_norm": 0.341796875, "learning_rate": 6.651086864499305e-05, "loss": 0.2196, "step": 3004 }, { "epoch": 4.843800322061192, "grad_norm": 0.365234375, "learning_rate": 6.614729492845258e-05, "loss": 0.2146, "step": 3008 }, { "epoch": 4.85024154589372, "grad_norm": 0.375, "learning_rate": 6.578443635641497e-05, "loss": 0.2232, "step": 3012 }, { "epoch": 4.856682769726248, "grad_norm": 0.35546875, "learning_rate": 6.542229602354847e-05, "loss": 0.2319, "step": 3016 }, { "epoch": 4.8631239935587764, "grad_norm": 0.353515625, "learning_rate": 6.506087701839593e-05, "loss": 0.2156, "step": 3020 }, { "epoch": 4.869565217391305, "grad_norm": 0.330078125, "learning_rate": 6.470018242334825e-05, "loss": 0.2372, "step": 3024 }, { "epoch": 4.876006441223833, "grad_norm": 0.3203125, "learning_rate": 6.434021531461818e-05, "loss": 0.2077, "step": 3028 }, { "epoch": 4.882447665056361, "grad_norm": 0.345703125, "learning_rate": 6.398097876221385e-05, "loss": 0.2183, "step": 3032 }, { "epoch": 4.888888888888889, "grad_norm": 0.33984375, "learning_rate": 6.362247582991317e-05, "loss": 0.2104, "step": 3036 }, { "epoch": 4.8953301127214175, "grad_norm": 0.33203125, "learning_rate": 6.326470957523686e-05, "loss": 0.2048, "step": 3040 }, { "epoch": 4.901771336553946, "grad_norm": 0.361328125, "learning_rate": 6.29076830494232e-05, "loss": 0.2346, "step": 3044 }, { "epoch": 4.908212560386474, "grad_norm": 0.34765625, "learning_rate": 6.255139929740129e-05, "loss": 0.2068, "step": 3048 }, { "epoch": 4.914653784219001, "grad_norm": 0.353515625, "learning_rate": 6.219586135776575e-05, "loss": 0.239, "step": 3052 }, { "epoch": 4.921095008051529, "grad_norm": 0.30078125, "learning_rate": 6.184107226275038e-05, "loss": 0.1814, "step": 3056 }, { "epoch": 4.927536231884058, "grad_norm": 0.34765625, "learning_rate": 6.148703503820224e-05, "loss": 0.2272, "step": 3060 }, { "epoch": 4.933977455716586, "grad_norm": 0.349609375, "learning_rate": 6.113375270355617e-05, "loss": 0.2418, "step": 3064 }, { "epoch": 4.940418679549114, "grad_norm": 0.357421875, "learning_rate": 6.078122827180879e-05, "loss": 0.2723, "step": 3068 }, { "epoch": 4.946859903381642, "grad_norm": 0.328125, "learning_rate": 6.042946474949302e-05, "loss": 0.2407, "step": 3072 }, { "epoch": 4.9533011272141705, "grad_norm": 0.3203125, "learning_rate": 6.007846513665207e-05, "loss": 0.2153, "step": 3076 }, { "epoch": 4.959742351046699, "grad_norm": 0.330078125, "learning_rate": 5.972823242681426e-05, "loss": 0.2206, "step": 3080 }, { "epoch": 4.966183574879227, "grad_norm": 0.361328125, "learning_rate": 5.937876960696727e-05, "loss": 0.2105, "step": 3084 }, { "epoch": 4.972624798711755, "grad_norm": 0.365234375, "learning_rate": 5.903007965753279e-05, "loss": 0.2526, "step": 3088 }, { "epoch": 4.979066022544283, "grad_norm": 0.310546875, "learning_rate": 5.868216555234081e-05, "loss": 0.2168, "step": 3092 }, { "epoch": 4.9855072463768115, "grad_norm": 0.349609375, "learning_rate": 5.833503025860469e-05, "loss": 0.2174, "step": 3096 }, { "epoch": 4.99194847020934, "grad_norm": 0.322265625, "learning_rate": 5.798867673689553e-05, "loss": 0.2365, "step": 3100 }, { "epoch": 4.998389694041868, "grad_norm": 0.328125, "learning_rate": 5.764310794111711e-05, "loss": 0.1766, "step": 3104 }, { "epoch": 5.004830917874396, "grad_norm": 0.28125, "learning_rate": 5.7298326818480427e-05, "loss": 0.1998, "step": 3108 }, { "epoch": 5.011272141706924, "grad_norm": 0.267578125, "learning_rate": 5.695433630947894e-05, "loss": 0.1838, "step": 3112 }, { "epoch": 5.017713365539453, "grad_norm": 0.328125, "learning_rate": 5.661113934786321e-05, "loss": 0.2045, "step": 3116 }, { "epoch": 5.024154589371981, "grad_norm": 0.28125, "learning_rate": 5.626873886061597e-05, "loss": 0.1917, "step": 3120 }, { "epoch": 5.030595813204509, "grad_norm": 0.314453125, "learning_rate": 5.592713776792723e-05, "loss": 0.204, "step": 3124 }, { "epoch": 5.037037037037037, "grad_norm": 0.2734375, "learning_rate": 5.5586338983169076e-05, "loss": 0.1471, "step": 3128 }, { "epoch": 5.043478260869565, "grad_norm": 0.318359375, "learning_rate": 5.52463454128714e-05, "loss": 0.1966, "step": 3132 }, { "epoch": 5.049919484702094, "grad_norm": 0.2890625, "learning_rate": 5.490715995669641e-05, "loss": 0.1782, "step": 3136 }, { "epoch": 5.056360708534622, "grad_norm": 0.318359375, "learning_rate": 5.456878550741453e-05, "loss": 0.1877, "step": 3140 }, { "epoch": 5.06280193236715, "grad_norm": 0.291015625, "learning_rate": 5.423122495087915e-05, "loss": 0.1643, "step": 3144 }, { "epoch": 5.069243156199678, "grad_norm": 0.298828125, "learning_rate": 5.3894481166002674e-05, "loss": 0.1792, "step": 3148 }, { "epoch": 5.0756843800322065, "grad_norm": 0.310546875, "learning_rate": 5.355855702473125e-05, "loss": 0.1567, "step": 3152 }, { "epoch": 5.082125603864735, "grad_norm": 0.341796875, "learning_rate": 5.322345539202086e-05, "loss": 0.2051, "step": 3156 }, { "epoch": 5.088566827697263, "grad_norm": 0.326171875, "learning_rate": 5.288917912581257e-05, "loss": 0.1754, "step": 3160 }, { "epoch": 5.095008051529791, "grad_norm": 0.2890625, "learning_rate": 5.255573107700832e-05, "loss": 0.1824, "step": 3164 }, { "epoch": 5.101449275362318, "grad_norm": 0.34765625, "learning_rate": 5.222311408944635e-05, "loss": 0.2092, "step": 3168 }, { "epoch": 5.107890499194847, "grad_norm": 0.28515625, "learning_rate": 5.189133099987731e-05, "loss": 0.146, "step": 3172 }, { "epoch": 5.114331723027375, "grad_norm": 0.28515625, "learning_rate": 5.156038463793981e-05, "loss": 0.1692, "step": 3176 }, { "epoch": 5.120772946859903, "grad_norm": 0.291015625, "learning_rate": 5.123027782613636e-05, "loss": 0.1877, "step": 3180 }, { "epoch": 5.127214170692431, "grad_norm": 0.30078125, "learning_rate": 5.09010133798094e-05, "loss": 0.154, "step": 3184 }, { "epoch": 5.1336553945249594, "grad_norm": 0.310546875, "learning_rate": 5.0572594107116974e-05, "loss": 0.1559, "step": 3188 }, { "epoch": 5.140096618357488, "grad_norm": 0.318359375, "learning_rate": 5.0245022809009155e-05, "loss": 0.171, "step": 3192 }, { "epoch": 5.146537842190016, "grad_norm": 0.294921875, "learning_rate": 4.991830227920398e-05, "loss": 0.1774, "step": 3196 }, { "epoch": 5.152979066022544, "grad_norm": 0.283203125, "learning_rate": 4.9592435304163675e-05, "loss": 0.1813, "step": 3200 }, { "epoch": 5.159420289855072, "grad_norm": 0.302734375, "learning_rate": 4.926742466307069e-05, "loss": 0.1557, "step": 3204 }, { "epoch": 5.1658615136876005, "grad_norm": 0.283203125, "learning_rate": 4.8943273127804345e-05, "loss": 0.1574, "step": 3208 }, { "epoch": 5.172302737520129, "grad_norm": 0.306640625, "learning_rate": 4.8619983462916935e-05, "loss": 0.1548, "step": 3212 }, { "epoch": 5.178743961352657, "grad_norm": 0.318359375, "learning_rate": 4.829755842561025e-05, "loss": 0.1888, "step": 3216 }, { "epoch": 5.185185185185185, "grad_norm": 0.2734375, "learning_rate": 4.797600076571194e-05, "loss": 0.2004, "step": 3220 }, { "epoch": 5.191626409017713, "grad_norm": 0.2890625, "learning_rate": 4.7655313225652294e-05, "loss": 0.1587, "step": 3224 }, { "epoch": 5.1980676328502415, "grad_norm": 0.296875, "learning_rate": 4.7335498540440606e-05, "loss": 0.1669, "step": 3228 }, { "epoch": 5.20450885668277, "grad_norm": 0.3359375, "learning_rate": 4.7016559437642084e-05, "loss": 0.171, "step": 3232 }, { "epoch": 5.210950080515298, "grad_norm": 0.30859375, "learning_rate": 4.6698498637354225e-05, "loss": 0.1566, "step": 3236 }, { "epoch": 5.217391304347826, "grad_norm": 0.294921875, "learning_rate": 4.6381318852184194e-05, "loss": 0.1936, "step": 3240 }, { "epoch": 5.223832528180354, "grad_norm": 0.3359375, "learning_rate": 4.606502278722503e-05, "loss": 0.1897, "step": 3244 }, { "epoch": 5.230273752012883, "grad_norm": 0.34375, "learning_rate": 4.574961314003304e-05, "loss": 0.1935, "step": 3248 }, { "epoch": 5.236714975845411, "grad_norm": 0.291015625, "learning_rate": 4.5435092600604676e-05, "loss": 0.159, "step": 3252 }, { "epoch": 5.243156199677939, "grad_norm": 0.296875, "learning_rate": 4.5121463851353476e-05, "loss": 0.2065, "step": 3256 }, { "epoch": 5.249597423510467, "grad_norm": 0.322265625, "learning_rate": 4.48087295670874e-05, "loss": 0.1993, "step": 3260 }, { "epoch": 5.256038647342995, "grad_norm": 0.326171875, "learning_rate": 4.449689241498569e-05, "loss": 0.1717, "step": 3264 }, { "epoch": 5.262479871175524, "grad_norm": 0.306640625, "learning_rate": 4.41859550545765e-05, "loss": 0.1907, "step": 3268 }, { "epoch": 5.268921095008052, "grad_norm": 0.287109375, "learning_rate": 4.387592013771396e-05, "loss": 0.1691, "step": 3272 }, { "epoch": 5.27536231884058, "grad_norm": 0.298828125, "learning_rate": 4.356679030855573e-05, "loss": 0.198, "step": 3276 }, { "epoch": 5.281803542673108, "grad_norm": 0.314453125, "learning_rate": 4.32585682035402e-05, "loss": 0.2026, "step": 3280 }, { "epoch": 5.2882447665056365, "grad_norm": 0.326171875, "learning_rate": 4.2951256451364264e-05, "loss": 0.1975, "step": 3284 }, { "epoch": 5.294685990338165, "grad_norm": 0.283203125, "learning_rate": 4.264485767296081e-05, "loss": 0.1686, "step": 3288 }, { "epoch": 5.301127214170693, "grad_norm": 0.310546875, "learning_rate": 4.233937448147635e-05, "loss": 0.1583, "step": 3292 }, { "epoch": 5.30756843800322, "grad_norm": 0.296875, "learning_rate": 4.203480948224866e-05, "loss": 0.1777, "step": 3296 }, { "epoch": 5.314009661835748, "grad_norm": 0.322265625, "learning_rate": 4.173116527278471e-05, "loss": 0.1616, "step": 3300 }, { "epoch": 5.320450885668277, "grad_norm": 0.31640625, "learning_rate": 4.142844444273845e-05, "loss": 0.1731, "step": 3304 }, { "epoch": 5.326892109500805, "grad_norm": 0.349609375, "learning_rate": 4.1126649573888696e-05, "loss": 0.2219, "step": 3308 }, { "epoch": 5.333333333333333, "grad_norm": 0.298828125, "learning_rate": 4.082578324011716e-05, "loss": 0.1937, "step": 3312 }, { "epoch": 5.339774557165861, "grad_norm": 0.33203125, "learning_rate": 4.052584800738636e-05, "loss": 0.1891, "step": 3316 }, { "epoch": 5.3462157809983895, "grad_norm": 0.306640625, "learning_rate": 4.0226846433717954e-05, "loss": 0.1811, "step": 3320 }, { "epoch": 5.352657004830918, "grad_norm": 0.279296875, "learning_rate": 3.992878106917079e-05, "loss": 0.1768, "step": 3324 }, { "epoch": 5.359098228663446, "grad_norm": 0.318359375, "learning_rate": 3.963165445581922e-05, "loss": 0.1985, "step": 3328 }, { "epoch": 5.365539452495974, "grad_norm": 0.318359375, "learning_rate": 3.933546912773119e-05, "loss": 0.154, "step": 3332 }, { "epoch": 5.371980676328502, "grad_norm": 0.2734375, "learning_rate": 3.904022761094715e-05, "loss": 0.1788, "step": 3336 }, { "epoch": 5.3784219001610305, "grad_norm": 0.3125, "learning_rate": 3.874593242345785e-05, "loss": 0.1964, "step": 3340 }, { "epoch": 5.384863123993559, "grad_norm": 0.306640625, "learning_rate": 3.845258607518344e-05, "loss": 0.1822, "step": 3344 }, { "epoch": 5.391304347826087, "grad_norm": 0.306640625, "learning_rate": 3.816019106795157e-05, "loss": 0.1711, "step": 3348 }, { "epoch": 5.397745571658615, "grad_norm": 0.283203125, "learning_rate": 3.7868749895476624e-05, "loss": 0.1785, "step": 3352 }, { "epoch": 5.404186795491143, "grad_norm": 0.361328125, "learning_rate": 3.7578265043337834e-05, "loss": 0.1891, "step": 3356 }, { "epoch": 5.4106280193236715, "grad_norm": 0.29296875, "learning_rate": 3.72887389889586e-05, "loss": 0.1766, "step": 3360 }, { "epoch": 5.4170692431562, "grad_norm": 0.322265625, "learning_rate": 3.700017420158486e-05, "loss": 0.1733, "step": 3364 }, { "epoch": 5.423510466988728, "grad_norm": 0.287109375, "learning_rate": 3.671257314226471e-05, "loss": 0.1895, "step": 3368 }, { "epoch": 5.429951690821256, "grad_norm": 0.287109375, "learning_rate": 3.642593826382663e-05, "loss": 0.1867, "step": 3372 }, { "epoch": 5.436392914653784, "grad_norm": 0.33984375, "learning_rate": 3.6140272010859166e-05, "loss": 0.1946, "step": 3376 }, { "epoch": 5.442834138486313, "grad_norm": 0.314453125, "learning_rate": 3.585557681968979e-05, "loss": 0.1684, "step": 3380 }, { "epoch": 5.449275362318841, "grad_norm": 0.298828125, "learning_rate": 3.5571855118364236e-05, "loss": 0.1886, "step": 3384 }, { "epoch": 5.455716586151369, "grad_norm": 0.314453125, "learning_rate": 3.528910932662577e-05, "loss": 0.199, "step": 3388 }, { "epoch": 5.462157809983897, "grad_norm": 0.291015625, "learning_rate": 3.5007341855894394e-05, "loss": 0.1877, "step": 3392 }, { "epoch": 5.468599033816425, "grad_norm": 0.337890625, "learning_rate": 3.472655510924656e-05, "loss": 0.1856, "step": 3396 }, { "epoch": 5.475040257648954, "grad_norm": 0.3203125, "learning_rate": 3.4446751481394516e-05, "loss": 0.1818, "step": 3400 }, { "epoch": 5.481481481481482, "grad_norm": 0.34375, "learning_rate": 3.4167933358665936e-05, "loss": 0.2009, "step": 3404 }, { "epoch": 5.48792270531401, "grad_norm": 0.306640625, "learning_rate": 3.3890103118983366e-05, "loss": 0.1824, "step": 3408 }, { "epoch": 5.494363929146537, "grad_norm": 0.34375, "learning_rate": 3.3613263131844294e-05, "loss": 0.1746, "step": 3412 }, { "epoch": 5.500805152979066, "grad_norm": 0.30078125, "learning_rate": 3.333741575830069e-05, "loss": 0.1769, "step": 3416 }, { "epoch": 5.507246376811594, "grad_norm": 0.341796875, "learning_rate": 3.306256335093898e-05, "loss": 0.1822, "step": 3420 }, { "epoch": 5.513687600644122, "grad_norm": 0.302734375, "learning_rate": 3.278870825385983e-05, "loss": 0.1925, "step": 3424 }, { "epoch": 5.52012882447665, "grad_norm": 0.328125, "learning_rate": 3.251585280265839e-05, "loss": 0.1923, "step": 3428 }, { "epoch": 5.526570048309178, "grad_norm": 0.3125, "learning_rate": 3.224399932440419e-05, "loss": 0.1815, "step": 3432 }, { "epoch": 5.533011272141707, "grad_norm": 0.318359375, "learning_rate": 3.1973150137621364e-05, "loss": 0.1738, "step": 3436 }, { "epoch": 5.539452495974235, "grad_norm": 0.296875, "learning_rate": 3.170330755226893e-05, "loss": 0.191, "step": 3440 }, { "epoch": 5.545893719806763, "grad_norm": 0.30078125, "learning_rate": 3.1434473869720804e-05, "loss": 0.1538, "step": 3444 }, { "epoch": 5.552334943639291, "grad_norm": 0.306640625, "learning_rate": 3.116665138274676e-05, "loss": 0.1748, "step": 3448 }, { "epoch": 5.5587761674718195, "grad_norm": 0.330078125, "learning_rate": 3.0899842375492145e-05, "loss": 0.1893, "step": 3452 }, { "epoch": 5.565217391304348, "grad_norm": 0.328125, "learning_rate": 3.063404912345897e-05, "loss": 0.1727, "step": 3456 }, { "epoch": 5.571658615136876, "grad_norm": 0.31640625, "learning_rate": 3.036927389348625e-05, "loss": 0.1804, "step": 3460 }, { "epoch": 5.578099838969404, "grad_norm": 0.283203125, "learning_rate": 3.010551894373075e-05, "loss": 0.1778, "step": 3464 }, { "epoch": 5.584541062801932, "grad_norm": 0.3125, "learning_rate": 2.9842786523647582e-05, "loss": 0.1679, "step": 3468 }, { "epoch": 5.5909822866344605, "grad_norm": 0.291015625, "learning_rate": 2.9581078873971248e-05, "loss": 0.1812, "step": 3472 }, { "epoch": 5.597423510466989, "grad_norm": 0.31640625, "learning_rate": 2.9320398226696367e-05, "loss": 0.188, "step": 3476 }, { "epoch": 5.603864734299517, "grad_norm": 0.31640625, "learning_rate": 2.9060746805058738e-05, "loss": 0.1541, "step": 3480 }, { "epoch": 5.610305958132045, "grad_norm": 0.34375, "learning_rate": 2.8802126823516193e-05, "loss": 0.1671, "step": 3484 }, { "epoch": 5.616747181964573, "grad_norm": 0.30078125, "learning_rate": 2.8544540487729984e-05, "loss": 0.1609, "step": 3488 }, { "epoch": 5.6231884057971016, "grad_norm": 0.326171875, "learning_rate": 2.828798999454577e-05, "loss": 0.1488, "step": 3492 }, { "epoch": 5.62962962962963, "grad_norm": 0.326171875, "learning_rate": 2.8032477531974984e-05, "loss": 0.2012, "step": 3496 }, { "epoch": 5.636070853462158, "grad_norm": 0.3203125, "learning_rate": 2.7778005279176053e-05, "loss": 0.208, "step": 3500 }, { "epoch": 5.642512077294686, "grad_norm": 0.328125, "learning_rate": 2.7524575406435955e-05, "loss": 0.192, "step": 3504 }, { "epoch": 5.648953301127214, "grad_norm": 0.302734375, "learning_rate": 2.7272190075151655e-05, "loss": 0.1582, "step": 3508 }, { "epoch": 5.655394524959743, "grad_norm": 0.3203125, "learning_rate": 2.7020851437811608e-05, "loss": 0.1762, "step": 3512 }, { "epoch": 5.661835748792271, "grad_norm": 0.30859375, "learning_rate": 2.6770561637977556e-05, "loss": 0.1678, "step": 3516 }, { "epoch": 5.668276972624799, "grad_norm": 0.294921875, "learning_rate": 2.652132281026598e-05, "loss": 0.1822, "step": 3520 }, { "epoch": 5.674718196457327, "grad_norm": 0.30859375, "learning_rate": 2.6273137080330225e-05, "loss": 0.183, "step": 3524 }, { "epoch": 5.681159420289855, "grad_norm": 0.3125, "learning_rate": 2.6026006564842106e-05, "loss": 0.2009, "step": 3528 }, { "epoch": 5.687600644122384, "grad_norm": 0.31640625, "learning_rate": 2.577993337147406e-05, "loss": 0.1858, "step": 3532 }, { "epoch": 5.694041867954912, "grad_norm": 0.337890625, "learning_rate": 2.5534919598880887e-05, "loss": 0.203, "step": 3536 }, { "epoch": 5.70048309178744, "grad_norm": 0.294921875, "learning_rate": 2.5290967336682266e-05, "loss": 0.1588, "step": 3540 }, { "epoch": 5.706924315619968, "grad_norm": 0.73828125, "learning_rate": 2.5048078665444497e-05, "loss": 0.1622, "step": 3544 }, { "epoch": 5.713365539452496, "grad_norm": 0.33203125, "learning_rate": 2.4806255656663092e-05, "loss": 0.185, "step": 3548 }, { "epoch": 5.719806763285024, "grad_norm": 0.30078125, "learning_rate": 2.4565500372744845e-05, "loss": 0.1904, "step": 3552 }, { "epoch": 5.726247987117552, "grad_norm": 0.314453125, "learning_rate": 2.4325814866990583e-05, "loss": 0.175, "step": 3556 }, { "epoch": 5.73268921095008, "grad_norm": 0.30859375, "learning_rate": 2.4087201183577205e-05, "loss": 0.1699, "step": 3560 }, { "epoch": 5.739130434782608, "grad_norm": 0.32421875, "learning_rate": 2.384966135754063e-05, "loss": 0.1823, "step": 3564 }, { "epoch": 5.745571658615137, "grad_norm": 0.33203125, "learning_rate": 2.3613197414758273e-05, "loss": 0.1788, "step": 3568 }, { "epoch": 5.752012882447665, "grad_norm": 0.3125, "learning_rate": 2.3377811371931793e-05, "loss": 0.1794, "step": 3572 }, { "epoch": 5.758454106280193, "grad_norm": 0.3125, "learning_rate": 2.3143505236569915e-05, "loss": 0.1684, "step": 3576 }, { "epoch": 5.764895330112721, "grad_norm": 0.2890625, "learning_rate": 2.2910281006971164e-05, "loss": 0.157, "step": 3580 }, { "epoch": 5.7713365539452495, "grad_norm": 0.3359375, "learning_rate": 2.26781406722071e-05, "loss": 0.1833, "step": 3584 }, { "epoch": 5.777777777777778, "grad_norm": 0.330078125, "learning_rate": 2.2447086212105143e-05, "loss": 0.1945, "step": 3588 }, { "epoch": 5.784219001610306, "grad_norm": 0.310546875, "learning_rate": 2.2217119597231747e-05, "loss": 0.1801, "step": 3592 }, { "epoch": 5.790660225442834, "grad_norm": 0.30859375, "learning_rate": 2.1988242788875532e-05, "loss": 0.1735, "step": 3596 }, { "epoch": 5.797101449275362, "grad_norm": 0.302734375, "learning_rate": 2.1760457739030695e-05, "loss": 0.1755, "step": 3600 }, { "epoch": 5.8035426731078905, "grad_norm": 0.318359375, "learning_rate": 2.1533766390380254e-05, "loss": 0.1674, "step": 3604 }, { "epoch": 5.809983896940419, "grad_norm": 0.302734375, "learning_rate": 2.1308170676279547e-05, "loss": 0.138, "step": 3608 }, { "epoch": 5.816425120772947, "grad_norm": 0.3046875, "learning_rate": 2.108367252073961e-05, "loss": 0.1521, "step": 3612 }, { "epoch": 5.822866344605475, "grad_norm": 0.310546875, "learning_rate": 2.0860273838410928e-05, "loss": 0.1771, "step": 3616 }, { "epoch": 5.829307568438003, "grad_norm": 0.328125, "learning_rate": 2.0637976534567046e-05, "loss": 0.1628, "step": 3620 }, { "epoch": 5.835748792270532, "grad_norm": 0.298828125, "learning_rate": 2.0416782505088347e-05, "loss": 0.1631, "step": 3624 }, { "epoch": 5.84219001610306, "grad_norm": 0.271484375, "learning_rate": 2.0196693636445727e-05, "loss": 0.1731, "step": 3628 }, { "epoch": 5.848631239935588, "grad_norm": 0.310546875, "learning_rate": 1.9977711805684706e-05, "loss": 0.1748, "step": 3632 }, { "epoch": 5.855072463768116, "grad_norm": 0.34375, "learning_rate": 1.975983888040945e-05, "loss": 0.1872, "step": 3636 }, { "epoch": 5.861513687600644, "grad_norm": 0.328125, "learning_rate": 1.9543076718766538e-05, "loss": 0.1883, "step": 3640 }, { "epoch": 5.867954911433173, "grad_norm": 0.322265625, "learning_rate": 1.932742716942946e-05, "loss": 0.1543, "step": 3644 }, { "epoch": 5.874396135265701, "grad_norm": 0.314453125, "learning_rate": 1.911289207158254e-05, "loss": 0.1807, "step": 3648 }, { "epoch": 5.880837359098229, "grad_norm": 0.333984375, "learning_rate": 1.8899473254905672e-05, "loss": 0.1775, "step": 3652 }, { "epoch": 5.887278582930757, "grad_norm": 0.314453125, "learning_rate": 1.8687172539558208e-05, "loss": 0.1767, "step": 3656 }, { "epoch": 5.8937198067632846, "grad_norm": 0.306640625, "learning_rate": 1.8475991736163835e-05, "loss": 0.1662, "step": 3660 }, { "epoch": 5.900161030595813, "grad_norm": 0.294921875, "learning_rate": 1.8265932645794827e-05, "loss": 0.1575, "step": 3664 }, { "epoch": 5.906602254428341, "grad_norm": 0.359375, "learning_rate": 1.805699705995708e-05, "loss": 0.1778, "step": 3668 }, { "epoch": 5.913043478260869, "grad_norm": 0.322265625, "learning_rate": 1.7849186760574346e-05, "loss": 0.1661, "step": 3672 }, { "epoch": 5.919484702093397, "grad_norm": 0.30078125, "learning_rate": 1.7642503519973432e-05, "loss": 0.1603, "step": 3676 }, { "epoch": 5.925925925925926, "grad_norm": 0.302734375, "learning_rate": 1.7436949100868864e-05, "loss": 0.1603, "step": 3680 }, { "epoch": 5.932367149758454, "grad_norm": 0.326171875, "learning_rate": 1.7232525256348013e-05, "loss": 0.1907, "step": 3684 }, { "epoch": 5.938808373590982, "grad_norm": 0.333984375, "learning_rate": 1.7029233729855883e-05, "loss": 0.1848, "step": 3688 }, { "epoch": 5.94524959742351, "grad_norm": 0.30859375, "learning_rate": 1.6827076255180593e-05, "loss": 0.1719, "step": 3692 }, { "epoch": 5.951690821256038, "grad_norm": 0.296875, "learning_rate": 1.6626054556438322e-05, "loss": 0.1819, "step": 3696 }, { "epoch": 5.958132045088567, "grad_norm": 0.31640625, "learning_rate": 1.6426170348058703e-05, "loss": 0.1669, "step": 3700 }, { "epoch": 5.964573268921095, "grad_norm": 0.31640625, "learning_rate": 1.6227425334770245e-05, "loss": 0.169, "step": 3704 }, { "epoch": 5.971014492753623, "grad_norm": 0.333984375, "learning_rate": 1.6029821211585592e-05, "loss": 0.1723, "step": 3708 }, { "epoch": 5.977455716586151, "grad_norm": 0.337890625, "learning_rate": 1.5833359663787392e-05, "loss": 0.2008, "step": 3712 }, { "epoch": 5.9838969404186795, "grad_norm": 0.275390625, "learning_rate": 1.563804236691364e-05, "loss": 0.1523, "step": 3716 }, { "epoch": 5.990338164251208, "grad_norm": 0.271484375, "learning_rate": 1.5443870986743562e-05, "loss": 0.1592, "step": 3720 }, { "epoch": 5.996779388083736, "grad_norm": 0.337890625, "learning_rate": 1.5250847179283243e-05, "loss": 0.2154, "step": 3724 }, { "epoch": 6.003220611916264, "grad_norm": 0.30859375, "learning_rate": 1.505897259075171e-05, "loss": 0.1917, "step": 3728 }, { "epoch": 6.009661835748792, "grad_norm": 0.287109375, "learning_rate": 1.4868248857566734e-05, "loss": 0.1512, "step": 3732 }, { "epoch": 6.0161030595813205, "grad_norm": 0.310546875, "learning_rate": 1.4678677606330964e-05, "loss": 0.1889, "step": 3736 }, { "epoch": 6.022544283413849, "grad_norm": 0.2890625, "learning_rate": 1.4490260453817898e-05, "loss": 0.1694, "step": 3740 }, { "epoch": 6.028985507246377, "grad_norm": 0.283203125, "learning_rate": 1.4302999006958342e-05, "loss": 0.1365, "step": 3744 }, { "epoch": 6.035426731078905, "grad_norm": 0.267578125, "learning_rate": 1.411689486282654e-05, "loss": 0.148, "step": 3748 }, { "epoch": 6.041867954911433, "grad_norm": 0.30859375, "learning_rate": 1.393194960862657e-05, "loss": 0.1744, "step": 3752 }, { "epoch": 6.048309178743962, "grad_norm": 0.28515625, "learning_rate": 1.3748164821678759e-05, "loss": 0.1642, "step": 3756 }, { "epoch": 6.05475040257649, "grad_norm": 0.310546875, "learning_rate": 1.3565542069406433e-05, "loss": 0.1826, "step": 3760 }, { "epoch": 6.061191626409018, "grad_norm": 0.314453125, "learning_rate": 1.3384082909322375e-05, "loss": 0.1911, "step": 3764 }, { "epoch": 6.067632850241546, "grad_norm": 0.283203125, "learning_rate": 1.320378888901546e-05, "loss": 0.134, "step": 3768 }, { "epoch": 6.074074074074074, "grad_norm": 0.3125, "learning_rate": 1.3024661546137694e-05, "loss": 0.1778, "step": 3772 }, { "epoch": 6.080515297906603, "grad_norm": 0.271484375, "learning_rate": 1.2846702408390975e-05, "loss": 0.1542, "step": 3776 }, { "epoch": 6.086956521739131, "grad_norm": 0.275390625, "learning_rate": 1.2669912993514036e-05, "loss": 0.185, "step": 3780 }, { "epoch": 6.093397745571659, "grad_norm": 0.326171875, "learning_rate": 1.2494294809269512e-05, "loss": 0.1937, "step": 3784 }, { "epoch": 6.099838969404187, "grad_norm": 0.296875, "learning_rate": 1.2319849353431154e-05, "loss": 0.1642, "step": 3788 }, { "epoch": 6.106280193236715, "grad_norm": 0.28515625, "learning_rate": 1.2146578113771005e-05, "loss": 0.156, "step": 3792 }, { "epoch": 6.112721417069243, "grad_norm": 0.28515625, "learning_rate": 1.1974482568046694e-05, "loss": 0.172, "step": 3796 }, { "epoch": 6.119162640901771, "grad_norm": 0.302734375, "learning_rate": 1.1803564183988812e-05, "loss": 0.1655, "step": 3800 }, { "epoch": 6.125603864734299, "grad_norm": 0.294921875, "learning_rate": 1.1633824419288474e-05, "loss": 0.1741, "step": 3804 }, { "epoch": 6.132045088566827, "grad_norm": 0.3125, "learning_rate": 1.146526472158487e-05, "loss": 0.1805, "step": 3808 }, { "epoch": 6.138486312399356, "grad_norm": 0.298828125, "learning_rate": 1.1297886528452882e-05, "loss": 0.1617, "step": 3812 }, { "epoch": 6.144927536231884, "grad_norm": 0.291015625, "learning_rate": 1.1131691267390757e-05, "loss": 0.1863, "step": 3816 }, { "epoch": 6.151368760064412, "grad_norm": 0.33984375, "learning_rate": 1.0966680355808122e-05, "loss": 0.2013, "step": 3820 }, { "epoch": 6.15780998389694, "grad_norm": 0.2734375, "learning_rate": 1.080285520101371e-05, "loss": 0.1683, "step": 3824 }, { "epoch": 6.164251207729468, "grad_norm": 0.287109375, "learning_rate": 1.0640217200203466e-05, "loss": 0.1729, "step": 3828 }, { "epoch": 6.170692431561997, "grad_norm": 0.30859375, "learning_rate": 1.047876774044863e-05, "loss": 0.1736, "step": 3832 }, { "epoch": 6.177133655394525, "grad_norm": 0.302734375, "learning_rate": 1.0318508198683734e-05, "loss": 0.1757, "step": 3836 }, { "epoch": 6.183574879227053, "grad_norm": 0.275390625, "learning_rate": 1.015943994169523e-05, "loss": 0.1824, "step": 3840 }, { "epoch": 6.190016103059581, "grad_norm": 0.28125, "learning_rate": 1.0001564326109363e-05, "loss": 0.1536, "step": 3844 }, { "epoch": 6.1964573268921095, "grad_norm": 0.314453125, "learning_rate": 9.844882698381013e-06, "loss": 0.2085, "step": 3848 }, { "epoch": 6.202898550724638, "grad_norm": 0.2890625, "learning_rate": 9.689396394781923e-06, "loss": 0.1665, "step": 3852 }, { "epoch": 6.209339774557166, "grad_norm": 0.322265625, "learning_rate": 9.535106741389542e-06, "loss": 0.1714, "step": 3856 }, { "epoch": 6.215780998389694, "grad_norm": 0.3046875, "learning_rate": 9.382015054075465e-06, "loss": 0.1639, "step": 3860 }, { "epoch": 6.222222222222222, "grad_norm": 0.265625, "learning_rate": 9.230122638494408e-06, "loss": 0.1391, "step": 3864 }, { "epoch": 6.2286634460547505, "grad_norm": 0.30078125, "learning_rate": 9.079430790072972e-06, "loss": 0.1514, "step": 3868 }, { "epoch": 6.235104669887279, "grad_norm": 0.26953125, "learning_rate": 8.92994079399868e-06, "loss": 0.1324, "step": 3872 }, { "epoch": 6.241545893719807, "grad_norm": 0.306640625, "learning_rate": 8.781653925208887e-06, "loss": 0.1629, "step": 3876 }, { "epoch": 6.247987117552335, "grad_norm": 0.330078125, "learning_rate": 8.634571448380056e-06, "loss": 0.1899, "step": 3880 }, { "epoch": 6.254428341384863, "grad_norm": 0.3046875, "learning_rate": 8.488694617916785e-06, "loss": 0.1696, "step": 3884 }, { "epoch": 6.260869565217392, "grad_norm": 0.296875, "learning_rate": 8.344024677941346e-06, "loss": 0.1652, "step": 3888 }, { "epoch": 6.26731078904992, "grad_norm": 0.28515625, "learning_rate": 8.200562862282912e-06, "loss": 0.1642, "step": 3892 }, { "epoch": 6.273752012882448, "grad_norm": 0.30078125, "learning_rate": 8.058310394466994e-06, "loss": 0.1458, "step": 3896 }, { "epoch": 6.280193236714976, "grad_norm": 0.3203125, "learning_rate": 7.917268487705175e-06, "loss": 0.1519, "step": 3900 }, { "epoch": 6.286634460547504, "grad_norm": 0.287109375, "learning_rate": 7.777438344884645e-06, "loss": 0.1745, "step": 3904 }, { "epoch": 6.293075684380033, "grad_norm": 0.265625, "learning_rate": 7.638821158557962e-06, "loss": 0.1696, "step": 3908 }, { "epoch": 6.29951690821256, "grad_norm": 0.255859375, "learning_rate": 7.501418110932872e-06, "loss": 0.1634, "step": 3912 }, { "epoch": 6.305958132045088, "grad_norm": 0.279296875, "learning_rate": 7.365230373862274e-06, "loss": 0.1589, "step": 3916 }, { "epoch": 6.312399355877616, "grad_norm": 0.3046875, "learning_rate": 7.2302591088341576e-06, "loss": 0.1675, "step": 3920 }, { "epoch": 6.318840579710145, "grad_norm": 0.298828125, "learning_rate": 7.096505466961794e-06, "loss": 0.1718, "step": 3924 }, { "epoch": 6.325281803542673, "grad_norm": 0.271484375, "learning_rate": 6.963970588973761e-06, "loss": 0.1412, "step": 3928 }, { "epoch": 6.331723027375201, "grad_norm": 0.302734375, "learning_rate": 6.832655605204401e-06, "loss": 0.165, "step": 3932 }, { "epoch": 6.338164251207729, "grad_norm": 0.314453125, "learning_rate": 6.702561635584047e-06, "loss": 0.1496, "step": 3936 }, { "epoch": 6.344605475040257, "grad_norm": 0.30859375, "learning_rate": 6.57368978962956e-06, "loss": 0.1829, "step": 3940 }, { "epoch": 6.351046698872786, "grad_norm": 0.306640625, "learning_rate": 6.44604116643474e-06, "loss": 0.182, "step": 3944 }, { "epoch": 6.357487922705314, "grad_norm": 0.279296875, "learning_rate": 6.3196168546610634e-06, "loss": 0.1714, "step": 3948 }, { "epoch": 6.363929146537842, "grad_norm": 0.35546875, "learning_rate": 6.194417932528478e-06, "loss": 0.2102, "step": 3952 }, { "epoch": 6.37037037037037, "grad_norm": 0.306640625, "learning_rate": 6.070445467805923e-06, "loss": 0.1761, "step": 3956 }, { "epoch": 6.3768115942028984, "grad_norm": 0.30859375, "learning_rate": 5.947700517802523e-06, "loss": 0.1816, "step": 3960 }, { "epoch": 6.383252818035427, "grad_norm": 0.287109375, "learning_rate": 5.826184129358358e-06, "loss": 0.2008, "step": 3964 }, { "epoch": 6.389694041867955, "grad_norm": 0.296875, "learning_rate": 5.705897338835724e-06, "loss": 0.1857, "step": 3968 }, { "epoch": 6.396135265700483, "grad_norm": 0.2890625, "learning_rate": 5.58684117211009e-06, "loss": 0.1681, "step": 3972 }, { "epoch": 6.402576489533011, "grad_norm": 0.314453125, "learning_rate": 5.469016644561519e-06, "loss": 0.1409, "step": 3976 }, { "epoch": 6.4090177133655395, "grad_norm": 0.3046875, "learning_rate": 5.352424761065926e-06, "loss": 0.1647, "step": 3980 }, { "epoch": 6.415458937198068, "grad_norm": 0.310546875, "learning_rate": 5.2370665159865045e-06, "loss": 0.1556, "step": 3984 }, { "epoch": 6.421900161030596, "grad_norm": 0.33203125, "learning_rate": 5.1229428931652775e-06, "loss": 0.1912, "step": 3988 }, { "epoch": 6.428341384863124, "grad_norm": 0.275390625, "learning_rate": 5.010054865914676e-06, "loss": 0.1521, "step": 3992 }, { "epoch": 6.434782608695652, "grad_norm": 0.29296875, "learning_rate": 4.898403397009293e-06, "loss": 0.192, "step": 3996 }, { "epoch": 6.4412238325281805, "grad_norm": 0.28515625, "learning_rate": 4.787989438677625e-06, "loss": 0.1464, "step": 4000 }, { "epoch": 6.447665056360709, "grad_norm": 0.251953125, "learning_rate": 4.678813932593911e-06, "loss": 0.1608, "step": 4004 }, { "epoch": 6.454106280193237, "grad_norm": 0.29296875, "learning_rate": 4.570877809870188e-06, "loss": 0.1778, "step": 4008 }, { "epoch": 6.460547504025765, "grad_norm": 0.30078125, "learning_rate": 4.464181991048349e-06, "loss": 0.1657, "step": 4012 }, { "epoch": 6.466988727858293, "grad_norm": 0.30078125, "learning_rate": 4.358727386092198e-06, "loss": 0.1707, "step": 4016 }, { "epoch": 6.473429951690822, "grad_norm": 0.26953125, "learning_rate": 4.254514894379774e-06, "loss": 0.1641, "step": 4020 }, { "epoch": 6.47987117552335, "grad_norm": 0.3046875, "learning_rate": 4.1515454046956384e-06, "loss": 0.1767, "step": 4024 }, { "epoch": 6.486312399355878, "grad_norm": 0.28515625, "learning_rate": 4.049819795223336e-06, "loss": 0.1914, "step": 4028 }, { "epoch": 6.492753623188406, "grad_norm": 0.30078125, "learning_rate": 3.949338933537843e-06, "loss": 0.1588, "step": 4032 }, { "epoch": 6.499194847020934, "grad_norm": 0.314453125, "learning_rate": 3.850103676598265e-06, "loss": 0.1517, "step": 4036 }, { "epoch": 6.505636070853463, "grad_norm": 0.30859375, "learning_rate": 3.752114870740386e-06, "loss": 0.1713, "step": 4040 }, { "epoch": 6.512077294685991, "grad_norm": 0.2734375, "learning_rate": 3.6553733516695937e-06, "loss": 0.1704, "step": 4044 }, { "epoch": 6.518518518518518, "grad_norm": 0.32421875, "learning_rate": 3.5598799444536697e-06, "loss": 0.1545, "step": 4048 }, { "epoch": 6.524959742351046, "grad_norm": 0.31640625, "learning_rate": 3.465635463515792e-06, "loss": 0.1684, "step": 4052 }, { "epoch": 6.531400966183575, "grad_norm": 0.330078125, "learning_rate": 3.3726407126275112e-06, "loss": 0.1472, "step": 4056 }, { "epoch": 6.537842190016103, "grad_norm": 0.2890625, "learning_rate": 3.2808964849020513e-06, "loss": 0.1268, "step": 4060 }, { "epoch": 6.544283413848631, "grad_norm": 0.3046875, "learning_rate": 3.190403562787369e-06, "loss": 0.1694, "step": 4064 }, { "epoch": 6.550724637681159, "grad_norm": 0.3125, "learning_rate": 3.1011627180596075e-06, "loss": 0.1688, "step": 4068 }, { "epoch": 6.557165861513687, "grad_norm": 0.314453125, "learning_rate": 3.0131747118164018e-06, "loss": 0.1946, "step": 4072 }, { "epoch": 6.563607085346216, "grad_norm": 0.310546875, "learning_rate": 2.9264402944705665e-06, "loss": 0.1714, "step": 4076 }, { "epoch": 6.570048309178744, "grad_norm": 0.28515625, "learning_rate": 2.8409602057434865e-06, "loss": 0.1467, "step": 4080 }, { "epoch": 6.576489533011272, "grad_norm": 0.298828125, "learning_rate": 2.7567351746589363e-06, "loss": 0.1615, "step": 4084 }, { "epoch": 6.5829307568438, "grad_norm": 0.322265625, "learning_rate": 2.6737659195368354e-06, "loss": 0.1674, "step": 4088 }, { "epoch": 6.5893719806763285, "grad_norm": 0.287109375, "learning_rate": 2.592053147987105e-06, "loss": 0.1651, "step": 4092 }, { "epoch": 6.595813204508857, "grad_norm": 0.265625, "learning_rate": 2.5115975569036718e-06, "loss": 0.1263, "step": 4096 }, { "epoch": 6.602254428341385, "grad_norm": 0.26953125, "learning_rate": 2.4323998324584536e-06, "loss": 0.148, "step": 4100 }, { "epoch": 6.608695652173913, "grad_norm": 0.296875, "learning_rate": 2.354460650095602e-06, "loss": 0.1537, "step": 4104 }, { "epoch": 6.615136876006441, "grad_norm": 0.3046875, "learning_rate": 2.2777806745256534e-06, "loss": 0.1553, "step": 4108 }, { "epoch": 6.6215780998389695, "grad_norm": 0.32421875, "learning_rate": 2.202360559719918e-06, "loss": 0.1763, "step": 4112 }, { "epoch": 6.628019323671498, "grad_norm": 0.27734375, "learning_rate": 2.1282009489048847e-06, "loss": 0.1717, "step": 4116 }, { "epoch": 6.634460547504026, "grad_norm": 0.279296875, "learning_rate": 2.055302474556708e-06, "loss": 0.1761, "step": 4120 }, { "epoch": 6.640901771336554, "grad_norm": 0.30859375, "learning_rate": 1.9836657583958806e-06, "loss": 0.1569, "step": 4124 }, { "epoch": 6.647342995169082, "grad_norm": 0.353515625, "learning_rate": 1.9132914113818677e-06, "loss": 0.18, "step": 4128 }, { "epoch": 6.6537842190016105, "grad_norm": 0.27734375, "learning_rate": 1.8441800337078982e-06, "loss": 0.1563, "step": 4132 }, { "epoch": 6.660225442834139, "grad_norm": 0.28515625, "learning_rate": 1.7763322147958836e-06, "loss": 0.1637, "step": 4136 }, { "epoch": 6.666666666666667, "grad_norm": 0.296875, "learning_rate": 1.7097485332913885e-06, "loss": 0.173, "step": 4140 }, { "epoch": 6.673107890499195, "grad_norm": 0.322265625, "learning_rate": 1.6444295570586518e-06, "loss": 0.1901, "step": 4144 }, { "epoch": 6.679549114331723, "grad_norm": 0.28515625, "learning_rate": 1.580375843175824e-06, "loss": 0.1777, "step": 4148 }, { "epoch": 6.685990338164252, "grad_norm": 0.26171875, "learning_rate": 1.5175879379300704e-06, "loss": 0.1655, "step": 4152 }, { "epoch": 6.692431561996779, "grad_norm": 0.31640625, "learning_rate": 1.4560663768131253e-06, "loss": 0.1555, "step": 4156 }, { "epoch": 6.698872785829307, "grad_norm": 0.279296875, "learning_rate": 1.3958116845164958e-06, "loss": 0.1506, "step": 4160 }, { "epoch": 6.705314009661835, "grad_norm": 0.37109375, "learning_rate": 1.3368243749271813e-06, "loss": 0.1987, "step": 4164 }, { "epoch": 6.7117552334943635, "grad_norm": 0.296875, "learning_rate": 1.2791049511231277e-06, "loss": 0.1656, "step": 4168 }, { "epoch": 6.718196457326892, "grad_norm": 0.3046875, "learning_rate": 1.2226539053690975e-06, "loss": 0.193, "step": 4172 }, { "epoch": 6.72463768115942, "grad_norm": 0.296875, "learning_rate": 1.1674717191123228e-06, "loss": 0.164, "step": 4176 }, { "epoch": 6.731078904991948, "grad_norm": 0.28515625, "learning_rate": 1.1135588629785252e-06, "loss": 0.1586, "step": 4180 }, { "epoch": 6.737520128824476, "grad_norm": 0.298828125, "learning_rate": 1.0609157967677695e-06, "loss": 0.1586, "step": 4184 }, { "epoch": 6.743961352657005, "grad_norm": 0.298828125, "learning_rate": 1.0095429694506829e-06, "loss": 0.1947, "step": 4188 }, { "epoch": 6.750402576489533, "grad_norm": 0.3359375, "learning_rate": 9.594408191645254e-07, "loss": 0.1751, "step": 4192 }, { "epoch": 6.756843800322061, "grad_norm": 0.291015625, "learning_rate": 9.106097732095085e-07, "loss": 0.1543, "step": 4196 }, { "epoch": 6.763285024154589, "grad_norm": 0.322265625, "learning_rate": 8.630502480450996e-07, "loss": 0.195, "step": 4200 }, { "epoch": 6.769726247987117, "grad_norm": 0.296875, "learning_rate": 8.167626492865064e-07, "loss": 0.1806, "step": 4204 }, { "epoch": 6.776167471819646, "grad_norm": 0.30859375, "learning_rate": 7.717473717012312e-07, "loss": 0.1726, "step": 4208 }, { "epoch": 6.782608695652174, "grad_norm": 0.298828125, "learning_rate": 7.280047992056725e-07, "loss": 0.1747, "step": 4212 }, { "epoch": 6.789049919484702, "grad_norm": 0.3125, "learning_rate": 6.855353048618284e-07, "loss": 0.1681, "step": 4216 }, { "epoch": 6.79549114331723, "grad_norm": 0.298828125, "learning_rate": 6.443392508742151e-07, "loss": 0.1712, "step": 4220 }, { "epoch": 6.8019323671497585, "grad_norm": 0.28515625, "learning_rate": 6.0441698858667e-07, "loss": 0.1498, "step": 4224 }, { "epoch": 6.808373590982287, "grad_norm": 0.29296875, "learning_rate": 5.657688584793874e-07, "loss": 0.1483, "step": 4228 }, { "epoch": 6.814814814814815, "grad_norm": 0.29296875, "learning_rate": 5.283951901660366e-07, "loss": 0.1557, "step": 4232 }, { "epoch": 6.821256038647343, "grad_norm": 0.3125, "learning_rate": 4.922963023909321e-07, "loss": 0.1984, "step": 4236 }, { "epoch": 6.827697262479871, "grad_norm": 0.298828125, "learning_rate": 4.574725030263515e-07, "loss": 0.1322, "step": 4240 }, { "epoch": 6.8341384863123995, "grad_norm": 0.30859375, "learning_rate": 4.239240890698381e-07, "loss": 0.1781, "step": 4244 }, { "epoch": 6.840579710144928, "grad_norm": 0.306640625, "learning_rate": 3.9165134664170263e-07, "loss": 0.1824, "step": 4248 }, { "epoch": 6.847020933977456, "grad_norm": 0.306640625, "learning_rate": 3.6065455098264195e-07, "loss": 0.1787, "step": 4252 }, { "epoch": 6.853462157809984, "grad_norm": 0.3203125, "learning_rate": 3.309339664513078e-07, "loss": 0.1622, "step": 4256 }, { "epoch": 6.859903381642512, "grad_norm": 0.279296875, "learning_rate": 3.024898465220582e-07, "loss": 0.1756, "step": 4260 }, { "epoch": 6.8663446054750406, "grad_norm": 0.291015625, "learning_rate": 2.7532243378285966e-07, "loss": 0.1465, "step": 4264 }, { "epoch": 6.872785829307569, "grad_norm": 0.3203125, "learning_rate": 2.494319599331718e-07, "loss": 0.1519, "step": 4268 }, { "epoch": 6.879227053140097, "grad_norm": 0.314453125, "learning_rate": 2.2481864578194898e-07, "loss": 0.175, "step": 4272 }, { "epoch": 6.885668276972625, "grad_norm": 0.283203125, "learning_rate": 2.0148270124582533e-07, "loss": 0.161, "step": 4276 }, { "epoch": 6.892109500805153, "grad_norm": 0.26171875, "learning_rate": 1.7942432534728268e-07, "loss": 0.1582, "step": 4280 }, { "epoch": 6.898550724637682, "grad_norm": 0.30859375, "learning_rate": 1.5864370621293531e-07, "loss": 0.1852, "step": 4284 }, { "epoch": 6.90499194847021, "grad_norm": 0.33984375, "learning_rate": 1.3914102107193127e-07, "loss": 0.1738, "step": 4288 }, { "epoch": 6.911433172302738, "grad_norm": 0.283203125, "learning_rate": 1.2091643625452008e-07, "loss": 0.149, "step": 4292 }, { "epoch": 6.917874396135265, "grad_norm": 0.328125, "learning_rate": 1.0397010719050414e-07, "loss": 0.1963, "step": 4296 }, { "epoch": 6.9243156199677935, "grad_norm": 0.345703125, "learning_rate": 8.830217840800624e-08, "loss": 0.182, "step": 4300 }, { "epoch": 6.930756843800322, "grad_norm": 0.3203125, "learning_rate": 7.391278353223728e-08, "loss": 0.1708, "step": 4304 }, { "epoch": 6.93719806763285, "grad_norm": 0.279296875, "learning_rate": 6.080204528426391e-08, "loss": 0.1593, "step": 4308 }, { "epoch": 6.943639291465378, "grad_norm": 0.326171875, "learning_rate": 4.897007548010923e-08, "loss": 0.1506, "step": 4312 }, { "epoch": 6.950080515297906, "grad_norm": 0.328125, "learning_rate": 3.841697502963703e-08, "loss": 0.152, "step": 4316 }, { "epoch": 6.956521739130435, "grad_norm": 0.3125, "learning_rate": 2.9142833935819065e-08, "loss": 0.168, "step": 4320 }, { "epoch": 6.962962962962963, "grad_norm": 0.283203125, "learning_rate": 2.1147731293919002e-08, "loss": 0.1745, "step": 4324 }, { "epoch": 6.969404186795491, "grad_norm": 0.322265625, "learning_rate": 1.4431735290809654e-08, "loss": 0.1848, "step": 4328 }, { "epoch": 6.975845410628019, "grad_norm": 0.296875, "learning_rate": 8.994903204390113e-09, "loss": 0.1714, "step": 4332 }, { "epoch": 6.982286634460547, "grad_norm": 0.302734375, "learning_rate": 4.837281403119453e-09, "loss": 0.2034, "step": 4336 }, { "epoch": 6.988727858293076, "grad_norm": 0.330078125, "learning_rate": 1.958905345600392e-09, "loss": 0.1877, "step": 4340 }, { "epoch": 6.995169082125604, "grad_norm": 0.287109375, "learning_rate": 3.597995803128473e-10, "loss": 0.1574, "step": 4344 }, { "epoch": 7.0, "step": 4347, "total_flos": 2.864063510520791e+18, "train_loss": 0.36376489625186625, "train_runtime": 17966.6164, "train_samples_per_second": 7.736, "train_steps_per_second": 0.242 } ], "logging_steps": 4, "max_steps": 4347, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.864063510520791e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }