diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16599 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 11838, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.7875872850418091, + "learning_rate": 0.0001999999119654754, + "loss": 1.7457, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.4465181827545166, + "learning_rate": 0.00019999964786205653, + "loss": 1.5438, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.5769105553627014, + "learning_rate": 0.00019999920769020845, + "loss": 1.4644, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 0.5974473357200623, + "learning_rate": 0.00019999859145070615, + "loss": 1.2951, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 0.5096644163131714, + "learning_rate": 0.00019999779914463462, + "loss": 1.1236, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 0.49290400743484497, + "learning_rate": 0.0001999968307733889, + "loss": 1.0382, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 0.39245930314064026, + "learning_rate": 0.000199995686338674, + "loss": 1.1353, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 0.6507896184921265, + "learning_rate": 0.00019999436584250483, + "loss": 1.1689, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 0.5615530014038086, + "learning_rate": 0.00019999286928720647, + "loss": 1.1778, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 0.5074531435966492, + "learning_rate": 0.00019999119667541386, + "loss": 1.1218, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 0.5020948648452759, + "learning_rate": 0.00019998934801007193, + "loss": 1.0081, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 0.5447856783866882, + "learning_rate": 0.00019998732329443562, + "loss": 1.0737, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 0.4655187129974365, + "learning_rate": 0.00019998512253206982, + "loss": 1.1562, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 0.5390958189964294, + "learning_rate": 0.0001999827457268494, + "loss": 1.0221, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 0.47219589352607727, + "learning_rate": 0.00019998019288295922, + "loss": 1.1061, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 0.5623837113380432, + "learning_rate": 0.00019997746400489397, + "loss": 1.0978, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 0.4483198821544647, + "learning_rate": 0.00019997455909745844, + "loss": 1.1252, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 0.5483498573303223, + "learning_rate": 0.00019997147816576717, + "loss": 1.1272, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 0.5134523510932922, + "learning_rate": 0.00019996822121524485, + "loss": 1.1525, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 0.51983642578125, + "learning_rate": 0.00019996478825162585, + "loss": 1.3265, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 0.5366414189338684, + "learning_rate": 0.00019996117928095463, + "loss": 1.0426, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 0.5997601747512817, + "learning_rate": 0.00019995739430958545, + "loss": 1.1507, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 0.4979299008846283, + "learning_rate": 0.00019995343334418245, + "loss": 1.048, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 0.4658752381801605, + "learning_rate": 0.0001999492963917197, + "loss": 1.1218, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 0.5081388354301453, + "learning_rate": 0.00019994498345948108, + "loss": 1.0929, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 0.5739812850952148, + "learning_rate": 0.00019994049455506033, + "loss": 1.1785, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 0.5141093134880066, + "learning_rate": 0.00019993582968636097, + "loss": 0.9686, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 0.4595348834991455, + "learning_rate": 0.0001999309888615965, + "loss": 1.1072, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 0.44480034708976746, + "learning_rate": 0.00019992597208929, + "loss": 1.0815, + "step": 145 + }, + { + "epoch": 0.01, + "grad_norm": 0.583330512046814, + "learning_rate": 0.00019992077937827456, + "loss": 1.1567, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 0.42396825551986694, + "learning_rate": 0.00019991541073769283, + "loss": 1.0406, + "step": 155 + }, + { + "epoch": 0.01, + "grad_norm": 0.46554988622665405, + "learning_rate": 0.0001999098661769974, + "loss": 1.0028, + "step": 160 + }, + { + "epoch": 0.01, + "grad_norm": 0.41337451338768005, + "learning_rate": 0.0001999041457059505, + "loss": 1.0591, + "step": 165 + }, + { + "epoch": 0.01, + "grad_norm": 0.47287315130233765, + "learning_rate": 0.0001998982493346241, + "loss": 1.0964, + "step": 170 + }, + { + "epoch": 0.01, + "grad_norm": 0.5164761543273926, + "learning_rate": 0.0001998921770733999, + "loss": 1.0439, + "step": 175 + }, + { + "epoch": 0.02, + "grad_norm": 0.7466127872467041, + "learning_rate": 0.00019988592893296927, + "loss": 0.9453, + "step": 180 + }, + { + "epoch": 0.02, + "grad_norm": 0.5365219712257385, + "learning_rate": 0.00019987950492433325, + "loss": 0.9862, + "step": 185 + }, + { + "epoch": 0.02, + "grad_norm": 0.43139711022377014, + "learning_rate": 0.0001998729050588025, + "loss": 1.0837, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 0.49741968512535095, + "learning_rate": 0.0001998661293479974, + "loss": 0.9904, + "step": 195 + }, + { + "epoch": 0.02, + "grad_norm": 0.5120337009429932, + "learning_rate": 0.00019985917780384786, + "loss": 1.0652, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 0.5979030728340149, + "learning_rate": 0.00019985205043859336, + "loss": 1.0907, + "step": 205 + }, + { + "epoch": 0.02, + "grad_norm": 0.7089561223983765, + "learning_rate": 0.00019984474726478303, + "loss": 1.1065, + "step": 210 + }, + { + "epoch": 0.02, + "grad_norm": 0.5356090068817139, + "learning_rate": 0.00019983726829527547, + "loss": 1.027, + "step": 215 + }, + { + "epoch": 0.02, + "grad_norm": 0.5716044902801514, + "learning_rate": 0.00019982961354323887, + "loss": 1.0789, + "step": 220 + }, + { + "epoch": 0.02, + "grad_norm": 0.4484700560569763, + "learning_rate": 0.00019982178302215082, + "loss": 1.0366, + "step": 225 + }, + { + "epoch": 0.02, + "grad_norm": 0.5676905512809753, + "learning_rate": 0.00019981377674579845, + "loss": 0.9822, + "step": 230 + }, + { + "epoch": 0.02, + "grad_norm": 0.6980101466178894, + "learning_rate": 0.00019980559472827843, + "loss": 1.143, + "step": 235 + }, + { + "epoch": 0.02, + "grad_norm": 0.5166971683502197, + "learning_rate": 0.00019979723698399665, + "loss": 1.0075, + "step": 240 + }, + { + "epoch": 0.02, + "grad_norm": 0.5025323629379272, + "learning_rate": 0.00019978870352766853, + "loss": 0.944, + "step": 245 + }, + { + "epoch": 0.02, + "grad_norm": 0.49045535922050476, + "learning_rate": 0.0001997799943743189, + "loss": 1.033, + "step": 250 + }, + { + "epoch": 0.02, + "grad_norm": 0.5475656986236572, + "learning_rate": 0.00019977110953928182, + "loss": 0.9334, + "step": 255 + }, + { + "epoch": 0.02, + "grad_norm": 0.5707181096076965, + "learning_rate": 0.0001997620490382008, + "loss": 0.9527, + "step": 260 + }, + { + "epoch": 0.02, + "grad_norm": 0.5227631330490112, + "learning_rate": 0.0001997528128870285, + "loss": 1.0197, + "step": 265 + }, + { + "epoch": 0.02, + "grad_norm": 0.5836647152900696, + "learning_rate": 0.00019974340110202697, + "loss": 1.0444, + "step": 270 + }, + { + "epoch": 0.02, + "grad_norm": 0.6727388501167297, + "learning_rate": 0.00019973381369976746, + "loss": 1.0267, + "step": 275 + }, + { + "epoch": 0.02, + "grad_norm": 0.4579056203365326, + "learning_rate": 0.00019972405069713041, + "loss": 0.9956, + "step": 280 + }, + { + "epoch": 0.02, + "grad_norm": 0.608708918094635, + "learning_rate": 0.00019971411211130543, + "loss": 1.1209, + "step": 285 + }, + { + "epoch": 0.02, + "grad_norm": 0.44869670271873474, + "learning_rate": 0.00019970399795979132, + "loss": 1.089, + "step": 290 + }, + { + "epoch": 0.02, + "grad_norm": 0.47517067193984985, + "learning_rate": 0.00019969370826039592, + "loss": 0.9034, + "step": 295 + }, + { + "epoch": 0.03, + "grad_norm": 0.509774386882782, + "learning_rate": 0.00019968324303123625, + "loss": 0.9518, + "step": 300 + }, + { + "epoch": 0.03, + "grad_norm": 0.5976845622062683, + "learning_rate": 0.00019967260229073836, + "loss": 0.9457, + "step": 305 + }, + { + "epoch": 0.03, + "grad_norm": 0.6182131171226501, + "learning_rate": 0.00019966178605763726, + "loss": 1.1329, + "step": 310 + }, + { + "epoch": 0.03, + "grad_norm": 0.5523433089256287, + "learning_rate": 0.00019965079435097698, + "loss": 1.0727, + "step": 315 + }, + { + "epoch": 0.03, + "grad_norm": 0.6128446459770203, + "learning_rate": 0.00019963962719011055, + "loss": 1.0578, + "step": 320 + }, + { + "epoch": 0.03, + "grad_norm": 0.5032752752304077, + "learning_rate": 0.00019962828459469984, + "loss": 0.9942, + "step": 325 + }, + { + "epoch": 0.03, + "grad_norm": 0.5268670916557312, + "learning_rate": 0.0001996167665847157, + "loss": 1.0418, + "step": 330 + }, + { + "epoch": 0.03, + "grad_norm": 0.5722517371177673, + "learning_rate": 0.00019960507318043775, + "loss": 1.0697, + "step": 335 + }, + { + "epoch": 0.03, + "grad_norm": 0.5031557083129883, + "learning_rate": 0.00019959320440245443, + "loss": 1.0137, + "step": 340 + }, + { + "epoch": 0.03, + "grad_norm": 0.5025811791419983, + "learning_rate": 0.00019958116027166307, + "loss": 1.0486, + "step": 345 + }, + { + "epoch": 0.03, + "grad_norm": 0.5244653224945068, + "learning_rate": 0.00019956894080926958, + "loss": 1.0232, + "step": 350 + }, + { + "epoch": 0.03, + "grad_norm": 0.6231172680854797, + "learning_rate": 0.00019955654603678866, + "loss": 1.0067, + "step": 355 + }, + { + "epoch": 0.03, + "grad_norm": 0.5865558385848999, + "learning_rate": 0.0001995439759760437, + "loss": 1.0994, + "step": 360 + }, + { + "epoch": 0.03, + "grad_norm": 0.5528464913368225, + "learning_rate": 0.00019953123064916665, + "loss": 1.0448, + "step": 365 + }, + { + "epoch": 0.03, + "grad_norm": 0.5767375230789185, + "learning_rate": 0.00019951831007859814, + "loss": 1.0204, + "step": 370 + }, + { + "epoch": 0.03, + "grad_norm": 0.4730927348136902, + "learning_rate": 0.00019950521428708723, + "loss": 0.9448, + "step": 375 + }, + { + "epoch": 0.03, + "grad_norm": 0.6184219121932983, + "learning_rate": 0.0001994919432976916, + "loss": 1.0811, + "step": 380 + }, + { + "epoch": 0.03, + "grad_norm": 0.451190322637558, + "learning_rate": 0.00019947849713377734, + "loss": 1.0114, + "step": 385 + }, + { + "epoch": 0.03, + "grad_norm": 0.5863661766052246, + "learning_rate": 0.00019946487581901895, + "loss": 0.9805, + "step": 390 + }, + { + "epoch": 0.03, + "grad_norm": 0.6007524132728577, + "learning_rate": 0.00019945107937739944, + "loss": 1.0675, + "step": 395 + }, + { + "epoch": 0.03, + "grad_norm": 0.5558364987373352, + "learning_rate": 0.00019943710783320998, + "loss": 0.9698, + "step": 400 + }, + { + "epoch": 0.03, + "grad_norm": 0.5028205513954163, + "learning_rate": 0.00019942296121105017, + "loss": 0.9619, + "step": 405 + }, + { + "epoch": 0.03, + "grad_norm": 0.5114047527313232, + "learning_rate": 0.00019940863953582787, + "loss": 0.963, + "step": 410 + }, + { + "epoch": 0.04, + "grad_norm": 0.4985787570476532, + "learning_rate": 0.00019939414283275906, + "loss": 0.972, + "step": 415 + }, + { + "epoch": 0.04, + "grad_norm": 0.5882110595703125, + "learning_rate": 0.00019937947112736796, + "loss": 1.1282, + "step": 420 + }, + { + "epoch": 0.04, + "grad_norm": 0.6093044877052307, + "learning_rate": 0.00019936462444548693, + "loss": 0.931, + "step": 425 + }, + { + "epoch": 0.04, + "grad_norm": 0.5328900814056396, + "learning_rate": 0.00019934960281325635, + "loss": 1.0776, + "step": 430 + }, + { + "epoch": 0.04, + "grad_norm": 0.6070789694786072, + "learning_rate": 0.0001993344062571247, + "loss": 0.9918, + "step": 435 + }, + { + "epoch": 0.04, + "grad_norm": 0.47850194573402405, + "learning_rate": 0.00019931903480384838, + "loss": 0.9318, + "step": 440 + }, + { + "epoch": 0.04, + "grad_norm": 0.47907599806785583, + "learning_rate": 0.00019930348848049177, + "loss": 0.9338, + "step": 445 + }, + { + "epoch": 0.04, + "grad_norm": 0.4980204999446869, + "learning_rate": 0.00019928776731442712, + "loss": 0.9255, + "step": 450 + }, + { + "epoch": 0.04, + "grad_norm": 0.5016769766807556, + "learning_rate": 0.00019927187133333456, + "loss": 0.9518, + "step": 455 + }, + { + "epoch": 0.04, + "grad_norm": 0.5496203899383545, + "learning_rate": 0.00019925580056520198, + "loss": 0.8733, + "step": 460 + }, + { + "epoch": 0.04, + "grad_norm": 0.4883849024772644, + "learning_rate": 0.00019923955503832504, + "loss": 0.9664, + "step": 465 + }, + { + "epoch": 0.04, + "grad_norm": 0.5501581430435181, + "learning_rate": 0.00019922313478130713, + "loss": 0.9563, + "step": 470 + }, + { + "epoch": 0.04, + "grad_norm": 0.5573701858520508, + "learning_rate": 0.00019920653982305911, + "loss": 1.1285, + "step": 475 + }, + { + "epoch": 0.04, + "grad_norm": 0.63383948802948, + "learning_rate": 0.0001991897701927997, + "loss": 1.0254, + "step": 480 + }, + { + "epoch": 0.04, + "grad_norm": 0.6332881450653076, + "learning_rate": 0.00019917282592005496, + "loss": 1.1536, + "step": 485 + }, + { + "epoch": 0.04, + "grad_norm": 0.8212701678276062, + "learning_rate": 0.0001991557070346585, + "loss": 0.89, + "step": 490 + }, + { + "epoch": 0.04, + "grad_norm": 0.5946714878082275, + "learning_rate": 0.00019913841356675142, + "loss": 1.0984, + "step": 495 + }, + { + "epoch": 0.04, + "grad_norm": 0.46487969160079956, + "learning_rate": 0.00019912094554678215, + "loss": 0.8758, + "step": 500 + }, + { + "epoch": 0.04, + "grad_norm": 0.5632352828979492, + "learning_rate": 0.00019910330300550646, + "loss": 1.0401, + "step": 505 + }, + { + "epoch": 0.04, + "grad_norm": 0.6485393047332764, + "learning_rate": 0.00019908548597398742, + "loss": 0.7612, + "step": 510 + }, + { + "epoch": 0.04, + "grad_norm": 0.45863208174705505, + "learning_rate": 0.0001990674944835953, + "loss": 0.9827, + "step": 515 + }, + { + "epoch": 0.04, + "grad_norm": 0.4923437833786011, + "learning_rate": 0.00019904932856600752, + "loss": 1.0867, + "step": 520 + }, + { + "epoch": 0.04, + "grad_norm": 0.5223394632339478, + "learning_rate": 0.00019903098825320867, + "loss": 1.0333, + "step": 525 + }, + { + "epoch": 0.04, + "grad_norm": 0.5169032216072083, + "learning_rate": 0.00019901247357749036, + "loss": 1.0153, + "step": 530 + }, + { + "epoch": 0.05, + "grad_norm": 0.7526834607124329, + "learning_rate": 0.0001989937845714512, + "loss": 1.0064, + "step": 535 + }, + { + "epoch": 0.05, + "grad_norm": 0.45596638321876526, + "learning_rate": 0.00019897492126799674, + "loss": 0.8658, + "step": 540 + }, + { + "epoch": 0.05, + "grad_norm": 0.5975187420845032, + "learning_rate": 0.00019895588370033942, + "loss": 0.9575, + "step": 545 + }, + { + "epoch": 0.05, + "grad_norm": 0.534826934337616, + "learning_rate": 0.00019893667190199848, + "loss": 1.0665, + "step": 550 + }, + { + "epoch": 0.05, + "grad_norm": 0.5490930676460266, + "learning_rate": 0.00019891728590680003, + "loss": 1.0273, + "step": 555 + }, + { + "epoch": 0.05, + "grad_norm": 0.5129736661911011, + "learning_rate": 0.00019889772574887673, + "loss": 1.0674, + "step": 560 + }, + { + "epoch": 0.05, + "grad_norm": 0.6879858374595642, + "learning_rate": 0.000198877991462668, + "loss": 1.0637, + "step": 565 + }, + { + "epoch": 0.05, + "grad_norm": 0.5870321393013, + "learning_rate": 0.00019885808308291977, + "loss": 0.9868, + "step": 570 + }, + { + "epoch": 0.05, + "grad_norm": 0.6196910738945007, + "learning_rate": 0.00019883800064468462, + "loss": 1.0895, + "step": 575 + }, + { + "epoch": 0.05, + "grad_norm": 0.529233992099762, + "learning_rate": 0.0001988177441833214, + "loss": 0.9474, + "step": 580 + }, + { + "epoch": 0.05, + "grad_norm": 0.5826104283332825, + "learning_rate": 0.00019879731373449554, + "loss": 1.1702, + "step": 585 + }, + { + "epoch": 0.05, + "grad_norm": 0.6717869639396667, + "learning_rate": 0.00019877670933417872, + "loss": 1.0355, + "step": 590 + }, + { + "epoch": 0.05, + "grad_norm": 0.5537705421447754, + "learning_rate": 0.0001987559310186489, + "loss": 0.9953, + "step": 595 + }, + { + "epoch": 0.05, + "grad_norm": 0.5874431729316711, + "learning_rate": 0.0001987349788244903, + "loss": 1.1586, + "step": 600 + }, + { + "epoch": 0.05, + "grad_norm": 0.5384119749069214, + "learning_rate": 0.0001987138527885932, + "loss": 1.0429, + "step": 605 + }, + { + "epoch": 0.05, + "grad_norm": 0.764053225517273, + "learning_rate": 0.00019869255294815402, + "loss": 1.0185, + "step": 610 + }, + { + "epoch": 0.05, + "grad_norm": 0.6233411431312561, + "learning_rate": 0.00019867107934067523, + "loss": 1.0977, + "step": 615 + }, + { + "epoch": 0.05, + "grad_norm": 0.5031682252883911, + "learning_rate": 0.00019864943200396517, + "loss": 0.9116, + "step": 620 + }, + { + "epoch": 0.05, + "grad_norm": 0.6120206117630005, + "learning_rate": 0.0001986276109761381, + "loss": 1.0817, + "step": 625 + }, + { + "epoch": 0.05, + "grad_norm": 0.542709231376648, + "learning_rate": 0.0001986056162956141, + "loss": 1.0411, + "step": 630 + }, + { + "epoch": 0.05, + "grad_norm": 0.4971280097961426, + "learning_rate": 0.00019858344800111898, + "loss": 1.1277, + "step": 635 + }, + { + "epoch": 0.05, + "grad_norm": 0.6049294471740723, + "learning_rate": 0.0001985611061316843, + "loss": 0.9203, + "step": 640 + }, + { + "epoch": 0.05, + "grad_norm": 0.6006289124488831, + "learning_rate": 0.0001985385907266471, + "loss": 1.1261, + "step": 645 + }, + { + "epoch": 0.05, + "grad_norm": 0.5616739392280579, + "learning_rate": 0.00019851590182565012, + "loss": 1.0205, + "step": 650 + }, + { + "epoch": 0.06, + "grad_norm": 0.5229505300521851, + "learning_rate": 0.0001984930394686414, + "loss": 0.8954, + "step": 655 + }, + { + "epoch": 0.06, + "grad_norm": 0.5182190537452698, + "learning_rate": 0.00019847000369587457, + "loss": 0.9065, + "step": 660 + }, + { + "epoch": 0.06, + "grad_norm": 0.6021913290023804, + "learning_rate": 0.00019844679454790844, + "loss": 1.1373, + "step": 665 + }, + { + "epoch": 0.06, + "grad_norm": 0.7754777669906616, + "learning_rate": 0.00019842341206560712, + "loss": 1.0343, + "step": 670 + }, + { + "epoch": 0.06, + "grad_norm": 0.5829971432685852, + "learning_rate": 0.00019839985629013999, + "loss": 0.9031, + "step": 675 + }, + { + "epoch": 0.06, + "grad_norm": 0.5609190464019775, + "learning_rate": 0.00019837612726298143, + "loss": 1.0791, + "step": 680 + }, + { + "epoch": 0.06, + "grad_norm": 0.5505595207214355, + "learning_rate": 0.0001983522250259109, + "loss": 1.0283, + "step": 685 + }, + { + "epoch": 0.06, + "grad_norm": 0.8407588601112366, + "learning_rate": 0.0001983281496210129, + "loss": 1.0979, + "step": 690 + }, + { + "epoch": 0.06, + "grad_norm": 0.6268724203109741, + "learning_rate": 0.00019830390109067673, + "loss": 1.0197, + "step": 695 + }, + { + "epoch": 0.06, + "grad_norm": 0.6348004341125488, + "learning_rate": 0.00019827947947759653, + "loss": 0.9533, + "step": 700 + }, + { + "epoch": 0.06, + "grad_norm": 0.7127957344055176, + "learning_rate": 0.0001982548848247712, + "loss": 1.0689, + "step": 705 + }, + { + "epoch": 0.06, + "grad_norm": 0.6464601159095764, + "learning_rate": 0.00019823011717550438, + "loss": 1.0081, + "step": 710 + }, + { + "epoch": 0.06, + "grad_norm": 0.5490895509719849, + "learning_rate": 0.0001982051765734042, + "loss": 0.9213, + "step": 715 + }, + { + "epoch": 0.06, + "grad_norm": 0.6216040253639221, + "learning_rate": 0.00019818006306238328, + "loss": 0.9828, + "step": 720 + }, + { + "epoch": 0.06, + "grad_norm": 0.5060167908668518, + "learning_rate": 0.0001981547766866588, + "loss": 1.0694, + "step": 725 + }, + { + "epoch": 0.06, + "grad_norm": 0.5995177030563354, + "learning_rate": 0.00019812931749075223, + "loss": 0.9466, + "step": 730 + }, + { + "epoch": 0.06, + "grad_norm": 0.6505825519561768, + "learning_rate": 0.00019810368551948936, + "loss": 0.7913, + "step": 735 + }, + { + "epoch": 0.06, + "grad_norm": 0.5346279144287109, + "learning_rate": 0.00019807788081800012, + "loss": 0.9923, + "step": 740 + }, + { + "epoch": 0.06, + "grad_norm": 0.5277034044265747, + "learning_rate": 0.00019805190343171857, + "loss": 1.0233, + "step": 745 + }, + { + "epoch": 0.06, + "grad_norm": 0.6092624068260193, + "learning_rate": 0.00019802575340638296, + "loss": 0.8504, + "step": 750 + }, + { + "epoch": 0.06, + "grad_norm": 0.5947295427322388, + "learning_rate": 0.0001979994307880353, + "loss": 1.0926, + "step": 755 + }, + { + "epoch": 0.06, + "grad_norm": 0.6714326739311218, + "learning_rate": 0.00019797293562302158, + "loss": 0.9823, + "step": 760 + }, + { + "epoch": 0.06, + "grad_norm": 0.569905698299408, + "learning_rate": 0.00019794626795799158, + "loss": 1.0132, + "step": 765 + }, + { + "epoch": 0.07, + "grad_norm": 0.5079753398895264, + "learning_rate": 0.00019791942783989889, + "loss": 0.9252, + "step": 770 + }, + { + "epoch": 0.07, + "grad_norm": 0.7192429900169373, + "learning_rate": 0.00019789241531600053, + "loss": 1.0005, + "step": 775 + }, + { + "epoch": 0.07, + "grad_norm": 0.5635434985160828, + "learning_rate": 0.00019786523043385727, + "loss": 1.0082, + "step": 780 + }, + { + "epoch": 0.07, + "grad_norm": 0.6172094941139221, + "learning_rate": 0.00019783787324133324, + "loss": 0.8479, + "step": 785 + }, + { + "epoch": 0.07, + "grad_norm": 0.5081483721733093, + "learning_rate": 0.00019781034378659604, + "loss": 0.8188, + "step": 790 + }, + { + "epoch": 0.07, + "grad_norm": 0.49087268114089966, + "learning_rate": 0.00019778264211811646, + "loss": 0.8061, + "step": 795 + }, + { + "epoch": 0.07, + "grad_norm": 0.5902777910232544, + "learning_rate": 0.0001977547682846686, + "loss": 1.0909, + "step": 800 + }, + { + "epoch": 0.07, + "grad_norm": 0.5288147926330566, + "learning_rate": 0.00019772672233532964, + "loss": 1.0024, + "step": 805 + }, + { + "epoch": 0.07, + "grad_norm": 0.46710124611854553, + "learning_rate": 0.0001976985043194798, + "loss": 0.9985, + "step": 810 + }, + { + "epoch": 0.07, + "grad_norm": 0.7614003419876099, + "learning_rate": 0.00019767011428680227, + "loss": 0.8788, + "step": 815 + }, + { + "epoch": 0.07, + "grad_norm": 0.6387910842895508, + "learning_rate": 0.00019764155228728315, + "loss": 0.8908, + "step": 820 + }, + { + "epoch": 0.07, + "grad_norm": 0.7001696228981018, + "learning_rate": 0.0001976128183712113, + "loss": 0.9774, + "step": 825 + }, + { + "epoch": 0.07, + "grad_norm": 0.6360479593276978, + "learning_rate": 0.00019758391258917814, + "loss": 1.001, + "step": 830 + }, + { + "epoch": 0.07, + "grad_norm": 0.5724257826805115, + "learning_rate": 0.0001975548349920779, + "loss": 1.0208, + "step": 835 + }, + { + "epoch": 0.07, + "grad_norm": 0.5296745300292969, + "learning_rate": 0.00019752558563110724, + "loss": 1.0957, + "step": 840 + }, + { + "epoch": 0.07, + "grad_norm": 0.6365566253662109, + "learning_rate": 0.0001974961645577652, + "loss": 1.0577, + "step": 845 + }, + { + "epoch": 0.07, + "grad_norm": 0.5774630308151245, + "learning_rate": 0.00019746657182385314, + "loss": 0.9223, + "step": 850 + }, + { + "epoch": 0.07, + "grad_norm": 0.7029066681861877, + "learning_rate": 0.00019743680748147478, + "loss": 0.8853, + "step": 855 + }, + { + "epoch": 0.07, + "grad_norm": 0.5141474604606628, + "learning_rate": 0.00019740687158303585, + "loss": 0.8765, + "step": 860 + }, + { + "epoch": 0.07, + "grad_norm": 0.7042635083198547, + "learning_rate": 0.0001973767641812443, + "loss": 1.0361, + "step": 865 + }, + { + "epoch": 0.07, + "grad_norm": 0.6724997162818909, + "learning_rate": 0.00019734648532910982, + "loss": 0.9077, + "step": 870 + }, + { + "epoch": 0.07, + "grad_norm": 0.5841765403747559, + "learning_rate": 0.00019731603507994416, + "loss": 1.0122, + "step": 875 + }, + { + "epoch": 0.07, + "grad_norm": 0.797713577747345, + "learning_rate": 0.00019728541348736084, + "loss": 0.9423, + "step": 880 + }, + { + "epoch": 0.07, + "grad_norm": 0.6127451062202454, + "learning_rate": 0.00019725462060527489, + "loss": 1.0396, + "step": 885 + }, + { + "epoch": 0.08, + "grad_norm": 0.5570659637451172, + "learning_rate": 0.00019722365648790313, + "loss": 0.88, + "step": 890 + }, + { + "epoch": 0.08, + "grad_norm": 0.5685396790504456, + "learning_rate": 0.00019719252118976374, + "loss": 0.9511, + "step": 895 + }, + { + "epoch": 0.08, + "grad_norm": 0.48790881037712097, + "learning_rate": 0.00019716121476567639, + "loss": 0.9494, + "step": 900 + }, + { + "epoch": 0.08, + "grad_norm": 0.7598410844802856, + "learning_rate": 0.00019712973727076195, + "loss": 1.0273, + "step": 905 + }, + { + "epoch": 0.08, + "grad_norm": 0.49358242750167847, + "learning_rate": 0.0001970980887604426, + "loss": 0.9141, + "step": 910 + }, + { + "epoch": 0.08, + "grad_norm": 0.5327233672142029, + "learning_rate": 0.0001970662692904415, + "loss": 0.9731, + "step": 915 + }, + { + "epoch": 0.08, + "grad_norm": 0.665021538734436, + "learning_rate": 0.000197034278916783, + "loss": 1.0352, + "step": 920 + }, + { + "epoch": 0.08, + "grad_norm": 0.6691067218780518, + "learning_rate": 0.00019700211769579213, + "loss": 0.9288, + "step": 925 + }, + { + "epoch": 0.08, + "grad_norm": 0.6341536641120911, + "learning_rate": 0.00019696978568409495, + "loss": 1.0483, + "step": 930 + }, + { + "epoch": 0.08, + "grad_norm": 0.6046646237373352, + "learning_rate": 0.000196937282938618, + "loss": 0.9216, + "step": 935 + }, + { + "epoch": 0.08, + "grad_norm": 0.6167765259742737, + "learning_rate": 0.0001969046095165887, + "loss": 1.036, + "step": 940 + }, + { + "epoch": 0.08, + "grad_norm": 0.5134046077728271, + "learning_rate": 0.0001968717654755347, + "loss": 0.9485, + "step": 945 + }, + { + "epoch": 0.08, + "grad_norm": 0.556530773639679, + "learning_rate": 0.00019683875087328427, + "loss": 0.9117, + "step": 950 + }, + { + "epoch": 0.08, + "grad_norm": 0.7202883958816528, + "learning_rate": 0.0001968055657679659, + "loss": 0.9137, + "step": 955 + }, + { + "epoch": 0.08, + "grad_norm": 0.6756083369255066, + "learning_rate": 0.00019677221021800824, + "loss": 0.9523, + "step": 960 + }, + { + "epoch": 0.08, + "grad_norm": 0.6849576830863953, + "learning_rate": 0.00019673868428214016, + "loss": 1.0056, + "step": 965 + }, + { + "epoch": 0.08, + "grad_norm": 0.5652838945388794, + "learning_rate": 0.00019670498801939044, + "loss": 0.9335, + "step": 970 + }, + { + "epoch": 0.08, + "grad_norm": 0.6338204741477966, + "learning_rate": 0.0001966711214890877, + "loss": 1.0252, + "step": 975 + }, + { + "epoch": 0.08, + "grad_norm": 0.5528327226638794, + "learning_rate": 0.0001966370847508605, + "loss": 1.0104, + "step": 980 + }, + { + "epoch": 0.08, + "grad_norm": 0.5702161192893982, + "learning_rate": 0.00019660287786463698, + "loss": 0.8076, + "step": 985 + }, + { + "epoch": 0.08, + "grad_norm": 0.5875357389450073, + "learning_rate": 0.00019656850089064484, + "loss": 1.1024, + "step": 990 + }, + { + "epoch": 0.08, + "grad_norm": 0.5919451117515564, + "learning_rate": 0.00019653395388941137, + "loss": 0.9424, + "step": 995 + }, + { + "epoch": 0.08, + "grad_norm": 0.5543453097343445, + "learning_rate": 0.00019649923692176304, + "loss": 0.9896, + "step": 1000 + }, + { + "epoch": 0.08, + "grad_norm": 0.5521298050880432, + "learning_rate": 0.00019646435004882576, + "loss": 1.0355, + "step": 1005 + }, + { + "epoch": 0.09, + "grad_norm": 0.5741308927536011, + "learning_rate": 0.00019642929333202452, + "loss": 0.989, + "step": 1010 + }, + { + "epoch": 0.09, + "grad_norm": 0.5588503479957581, + "learning_rate": 0.00019639406683308336, + "loss": 0.9998, + "step": 1015 + }, + { + "epoch": 0.09, + "grad_norm": 0.5201951861381531, + "learning_rate": 0.00019635867061402516, + "loss": 0.9533, + "step": 1020 + }, + { + "epoch": 0.09, + "grad_norm": 0.5763158202171326, + "learning_rate": 0.00019632310473717172, + "loss": 0.9033, + "step": 1025 + }, + { + "epoch": 0.09, + "grad_norm": 0.6396028399467468, + "learning_rate": 0.00019628736926514365, + "loss": 0.9325, + "step": 1030 + }, + { + "epoch": 0.09, + "grad_norm": 0.5742698907852173, + "learning_rate": 0.00019625146426085994, + "loss": 1.0342, + "step": 1035 + }, + { + "epoch": 0.09, + "grad_norm": 0.5504851937294006, + "learning_rate": 0.00019621538978753823, + "loss": 0.9261, + "step": 1040 + }, + { + "epoch": 0.09, + "grad_norm": 0.6293525099754333, + "learning_rate": 0.00019617914590869452, + "loss": 0.9576, + "step": 1045 + }, + { + "epoch": 0.09, + "grad_norm": 0.548366367816925, + "learning_rate": 0.00019614273268814305, + "loss": 0.8242, + "step": 1050 + }, + { + "epoch": 0.09, + "grad_norm": 0.564641535282135, + "learning_rate": 0.00019610615018999622, + "loss": 1.0472, + "step": 1055 + }, + { + "epoch": 0.09, + "grad_norm": 0.6370099782943726, + "learning_rate": 0.0001960693984786645, + "loss": 0.98, + "step": 1060 + }, + { + "epoch": 0.09, + "grad_norm": 0.5116264820098877, + "learning_rate": 0.00019603247761885629, + "loss": 1.0122, + "step": 1065 + }, + { + "epoch": 0.09, + "grad_norm": 0.529659628868103, + "learning_rate": 0.00019599538767557775, + "loss": 0.8735, + "step": 1070 + }, + { + "epoch": 0.09, + "grad_norm": 0.6025382280349731, + "learning_rate": 0.00019595812871413281, + "loss": 1.0549, + "step": 1075 + }, + { + "epoch": 0.09, + "grad_norm": 0.5760678648948669, + "learning_rate": 0.00019592070080012302, + "loss": 0.9554, + "step": 1080 + }, + { + "epoch": 0.09, + "grad_norm": 0.6323814988136292, + "learning_rate": 0.00019588310399944726, + "loss": 0.9444, + "step": 1085 + }, + { + "epoch": 0.09, + "grad_norm": 0.6250112652778625, + "learning_rate": 0.00019584533837830196, + "loss": 0.9244, + "step": 1090 + }, + { + "epoch": 0.09, + "grad_norm": 0.6357011198997498, + "learning_rate": 0.00019580740400318062, + "loss": 0.8238, + "step": 1095 + }, + { + "epoch": 0.09, + "grad_norm": 0.625298023223877, + "learning_rate": 0.00019576930094087396, + "loss": 1.0092, + "step": 1100 + }, + { + "epoch": 0.09, + "grad_norm": 0.568483829498291, + "learning_rate": 0.00019573102925846968, + "loss": 0.8903, + "step": 1105 + }, + { + "epoch": 0.09, + "grad_norm": 0.5354487895965576, + "learning_rate": 0.00019569258902335236, + "loss": 0.9904, + "step": 1110 + }, + { + "epoch": 0.09, + "grad_norm": 0.5910528302192688, + "learning_rate": 0.00019565398030320336, + "loss": 0.9412, + "step": 1115 + }, + { + "epoch": 0.09, + "grad_norm": 0.6233749389648438, + "learning_rate": 0.0001956152031660007, + "loss": 0.9314, + "step": 1120 + }, + { + "epoch": 0.1, + "grad_norm": 0.5822467803955078, + "learning_rate": 0.00019557625768001886, + "loss": 0.8303, + "step": 1125 + }, + { + "epoch": 0.1, + "grad_norm": 0.6500090956687927, + "learning_rate": 0.00019553714391382887, + "loss": 0.9304, + "step": 1130 + }, + { + "epoch": 0.1, + "grad_norm": 0.5523923635482788, + "learning_rate": 0.0001954978619362979, + "loss": 1.006, + "step": 1135 + }, + { + "epoch": 0.1, + "grad_norm": 0.5451334118843079, + "learning_rate": 0.00019545841181658943, + "loss": 1.0762, + "step": 1140 + }, + { + "epoch": 0.1, + "grad_norm": 0.5570092797279358, + "learning_rate": 0.0001954187936241628, + "loss": 0.9606, + "step": 1145 + }, + { + "epoch": 0.1, + "grad_norm": 0.688922643661499, + "learning_rate": 0.00019537900742877344, + "loss": 0.9992, + "step": 1150 + }, + { + "epoch": 0.1, + "grad_norm": 0.5290732383728027, + "learning_rate": 0.00019533905330047256, + "loss": 0.9519, + "step": 1155 + }, + { + "epoch": 0.1, + "grad_norm": 0.5557504296302795, + "learning_rate": 0.000195298931309607, + "loss": 1.0104, + "step": 1160 + }, + { + "epoch": 0.1, + "grad_norm": 0.6165986657142639, + "learning_rate": 0.00019525864152681913, + "loss": 0.8128, + "step": 1165 + }, + { + "epoch": 0.1, + "grad_norm": 0.6277154088020325, + "learning_rate": 0.00019521818402304681, + "loss": 0.9949, + "step": 1170 + }, + { + "epoch": 0.1, + "grad_norm": 0.5798670053482056, + "learning_rate": 0.0001951775588695232, + "loss": 1.0184, + "step": 1175 + }, + { + "epoch": 0.1, + "grad_norm": 0.5742266774177551, + "learning_rate": 0.0001951367661377766, + "loss": 1.0086, + "step": 1180 + }, + { + "epoch": 0.1, + "grad_norm": 0.5187317728996277, + "learning_rate": 0.00019509580589963034, + "loss": 0.9316, + "step": 1185 + }, + { + "epoch": 0.1, + "grad_norm": 0.5465469360351562, + "learning_rate": 0.0001950546782272028, + "loss": 0.9607, + "step": 1190 + }, + { + "epoch": 0.1, + "grad_norm": 0.5568990707397461, + "learning_rate": 0.00019501338319290708, + "loss": 0.8688, + "step": 1195 + }, + { + "epoch": 0.1, + "grad_norm": 0.6503220200538635, + "learning_rate": 0.00019497192086945093, + "loss": 1.0604, + "step": 1200 + }, + { + "epoch": 0.1, + "grad_norm": 0.5293739438056946, + "learning_rate": 0.00019493029132983662, + "loss": 1.0046, + "step": 1205 + }, + { + "epoch": 0.1, + "grad_norm": 0.4872817099094391, + "learning_rate": 0.00019488849464736096, + "loss": 1.0668, + "step": 1210 + }, + { + "epoch": 0.1, + "grad_norm": 0.5833202600479126, + "learning_rate": 0.00019484653089561494, + "loss": 1.0663, + "step": 1215 + }, + { + "epoch": 0.1, + "grad_norm": 0.6370205879211426, + "learning_rate": 0.00019480440014848377, + "loss": 0.8139, + "step": 1220 + }, + { + "epoch": 0.1, + "grad_norm": 0.52309250831604, + "learning_rate": 0.00019476210248014656, + "loss": 0.9474, + "step": 1225 + }, + { + "epoch": 0.1, + "grad_norm": 0.7012914419174194, + "learning_rate": 0.0001947196379650765, + "loss": 0.9654, + "step": 1230 + }, + { + "epoch": 0.1, + "grad_norm": 0.5470362305641174, + "learning_rate": 0.00019467700667804048, + "loss": 0.9098, + "step": 1235 + }, + { + "epoch": 0.1, + "grad_norm": 0.4781595766544342, + "learning_rate": 0.00019463420869409893, + "loss": 1.0829, + "step": 1240 + }, + { + "epoch": 0.11, + "grad_norm": 0.6375689506530762, + "learning_rate": 0.00019459124408860586, + "loss": 0.9418, + "step": 1245 + }, + { + "epoch": 0.11, + "grad_norm": 0.4911366403102875, + "learning_rate": 0.0001945481129372087, + "loss": 0.985, + "step": 1250 + }, + { + "epoch": 0.11, + "grad_norm": 0.5354143381118774, + "learning_rate": 0.000194504815315848, + "loss": 0.9878, + "step": 1255 + }, + { + "epoch": 0.11, + "grad_norm": 0.5379586815834045, + "learning_rate": 0.0001944613513007575, + "loss": 0.9631, + "step": 1260 + }, + { + "epoch": 0.11, + "grad_norm": 0.5477179884910583, + "learning_rate": 0.00019441772096846384, + "loss": 0.9537, + "step": 1265 + }, + { + "epoch": 0.11, + "grad_norm": 0.6477294564247131, + "learning_rate": 0.0001943739243957866, + "loss": 0.867, + "step": 1270 + }, + { + "epoch": 0.11, + "grad_norm": 0.4842207729816437, + "learning_rate": 0.00019432996165983797, + "loss": 0.9455, + "step": 1275 + }, + { + "epoch": 0.11, + "grad_norm": 0.6251997351646423, + "learning_rate": 0.00019428583283802265, + "loss": 1.0139, + "step": 1280 + }, + { + "epoch": 0.11, + "grad_norm": 0.5363724827766418, + "learning_rate": 0.0001942415380080379, + "loss": 0.955, + "step": 1285 + }, + { + "epoch": 0.11, + "grad_norm": 0.5772451758384705, + "learning_rate": 0.00019419707724787323, + "loss": 0.9396, + "step": 1290 + }, + { + "epoch": 0.11, + "grad_norm": 0.5976244211196899, + "learning_rate": 0.00019415245063581025, + "loss": 0.9639, + "step": 1295 + }, + { + "epoch": 0.11, + "grad_norm": 0.5342226624488831, + "learning_rate": 0.00019410765825042257, + "loss": 0.9478, + "step": 1300 + }, + { + "epoch": 0.11, + "grad_norm": 0.7162676453590393, + "learning_rate": 0.00019406270017057576, + "loss": 0.91, + "step": 1305 + }, + { + "epoch": 0.11, + "grad_norm": 0.543129026889801, + "learning_rate": 0.00019401757647542707, + "loss": 0.874, + "step": 1310 + }, + { + "epoch": 0.11, + "grad_norm": 0.6252813339233398, + "learning_rate": 0.00019397228724442537, + "loss": 0.8728, + "step": 1315 + }, + { + "epoch": 0.11, + "grad_norm": 0.5755243301391602, + "learning_rate": 0.00019392683255731096, + "loss": 1.0385, + "step": 1320 + }, + { + "epoch": 0.11, + "grad_norm": 0.7668480277061462, + "learning_rate": 0.00019388121249411553, + "loss": 1.0515, + "step": 1325 + }, + { + "epoch": 0.11, + "grad_norm": 0.5681257247924805, + "learning_rate": 0.0001938354271351618, + "loss": 0.9672, + "step": 1330 + }, + { + "epoch": 0.11, + "grad_norm": 0.6426078081130981, + "learning_rate": 0.00019378947656106373, + "loss": 0.9659, + "step": 1335 + }, + { + "epoch": 0.11, + "grad_norm": 0.596076250076294, + "learning_rate": 0.00019374336085272595, + "loss": 0.9441, + "step": 1340 + }, + { + "epoch": 0.11, + "grad_norm": 0.5348164439201355, + "learning_rate": 0.000193697080091344, + "loss": 1.008, + "step": 1345 + }, + { + "epoch": 0.11, + "grad_norm": 0.7003368139266968, + "learning_rate": 0.000193650634358404, + "loss": 1.0318, + "step": 1350 + }, + { + "epoch": 0.11, + "grad_norm": 0.5743355751037598, + "learning_rate": 0.00019360402373568247, + "loss": 1.0015, + "step": 1355 + }, + { + "epoch": 0.11, + "grad_norm": 0.6081267595291138, + "learning_rate": 0.0001935572483052463, + "loss": 1.037, + "step": 1360 + }, + { + "epoch": 0.12, + "grad_norm": 0.513016402721405, + "learning_rate": 0.00019351030814945255, + "loss": 0.9706, + "step": 1365 + }, + { + "epoch": 0.12, + "grad_norm": 0.5697603821754456, + "learning_rate": 0.0001934632033509483, + "loss": 0.7802, + "step": 1370 + }, + { + "epoch": 0.12, + "grad_norm": 0.6547832489013672, + "learning_rate": 0.00019341593399267053, + "loss": 0.8814, + "step": 1375 + }, + { + "epoch": 0.12, + "grad_norm": 0.656122088432312, + "learning_rate": 0.00019336850015784594, + "loss": 1.0709, + "step": 1380 + }, + { + "epoch": 0.12, + "grad_norm": 0.6056241989135742, + "learning_rate": 0.00019332090192999087, + "loss": 0.9867, + "step": 1385 + }, + { + "epoch": 0.12, + "grad_norm": 0.5010951161384583, + "learning_rate": 0.000193273139392911, + "loss": 0.8737, + "step": 1390 + }, + { + "epoch": 0.12, + "grad_norm": 0.5506241917610168, + "learning_rate": 0.0001932252126307014, + "loss": 0.9104, + "step": 1395 + }, + { + "epoch": 0.12, + "grad_norm": 0.5468207597732544, + "learning_rate": 0.00019317712172774632, + "loss": 0.8749, + "step": 1400 + }, + { + "epoch": 0.12, + "grad_norm": 0.5140394568443298, + "learning_rate": 0.00019312886676871888, + "loss": 0.8969, + "step": 1405 + }, + { + "epoch": 0.12, + "grad_norm": 0.5933979153633118, + "learning_rate": 0.00019308044783858115, + "loss": 0.9699, + "step": 1410 + }, + { + "epoch": 0.12, + "grad_norm": 0.6451619267463684, + "learning_rate": 0.0001930318650225839, + "loss": 1.0721, + "step": 1415 + }, + { + "epoch": 0.12, + "grad_norm": 0.6029821634292603, + "learning_rate": 0.0001929831184062664, + "loss": 1.0622, + "step": 1420 + }, + { + "epoch": 0.12, + "grad_norm": 0.5566766262054443, + "learning_rate": 0.0001929342080754564, + "loss": 1.0093, + "step": 1425 + }, + { + "epoch": 0.12, + "grad_norm": 0.5888077616691589, + "learning_rate": 0.00019288513411626983, + "loss": 0.972, + "step": 1430 + }, + { + "epoch": 0.12, + "grad_norm": 0.6099488735198975, + "learning_rate": 0.00019283589661511072, + "loss": 0.8556, + "step": 1435 + }, + { + "epoch": 0.12, + "grad_norm": 0.6205644011497498, + "learning_rate": 0.0001927864956586711, + "loss": 0.9261, + "step": 1440 + }, + { + "epoch": 0.12, + "grad_norm": 0.6054074764251709, + "learning_rate": 0.00019273693133393076, + "loss": 1.0162, + "step": 1445 + }, + { + "epoch": 0.12, + "grad_norm": 0.538075864315033, + "learning_rate": 0.00019268720372815713, + "loss": 0.9089, + "step": 1450 + }, + { + "epoch": 0.12, + "grad_norm": 0.5652662515640259, + "learning_rate": 0.00019263731292890515, + "loss": 0.959, + "step": 1455 + }, + { + "epoch": 0.12, + "grad_norm": 0.6320458650588989, + "learning_rate": 0.00019258725902401703, + "loss": 1.1716, + "step": 1460 + }, + { + "epoch": 0.12, + "grad_norm": 0.5600968599319458, + "learning_rate": 0.00019253704210162224, + "loss": 0.8629, + "step": 1465 + }, + { + "epoch": 0.12, + "grad_norm": 0.6069446206092834, + "learning_rate": 0.00019248666225013726, + "loss": 1.0024, + "step": 1470 + }, + { + "epoch": 0.12, + "grad_norm": 0.5121304988861084, + "learning_rate": 0.00019243611955826537, + "loss": 0.8568, + "step": 1475 + }, + { + "epoch": 0.13, + "grad_norm": 0.5882970094680786, + "learning_rate": 0.00019238541411499663, + "loss": 0.9581, + "step": 1480 + }, + { + "epoch": 0.13, + "grad_norm": 0.5393926501274109, + "learning_rate": 0.0001923345460096076, + "loss": 0.9318, + "step": 1485 + }, + { + "epoch": 0.13, + "grad_norm": 0.6565856337547302, + "learning_rate": 0.00019228351533166134, + "loss": 1.0293, + "step": 1490 + }, + { + "epoch": 0.13, + "grad_norm": 0.6271263360977173, + "learning_rate": 0.000192232322171007, + "loss": 0.9848, + "step": 1495 + }, + { + "epoch": 0.13, + "grad_norm": 0.5515784025192261, + "learning_rate": 0.00019218096661777992, + "loss": 0.9167, + "step": 1500 + }, + { + "epoch": 0.13, + "grad_norm": 0.6837130188941956, + "learning_rate": 0.00019212944876240137, + "loss": 0.9197, + "step": 1505 + }, + { + "epoch": 0.13, + "grad_norm": 0.5938873887062073, + "learning_rate": 0.00019207776869557833, + "loss": 0.9345, + "step": 1510 + }, + { + "epoch": 0.13, + "grad_norm": 0.7142723798751831, + "learning_rate": 0.00019202592650830337, + "loss": 0.9954, + "step": 1515 + }, + { + "epoch": 0.13, + "grad_norm": 0.500342071056366, + "learning_rate": 0.00019197392229185453, + "loss": 0.9401, + "step": 1520 + }, + { + "epoch": 0.13, + "grad_norm": 0.643747091293335, + "learning_rate": 0.0001919217561377952, + "loss": 0.8676, + "step": 1525 + }, + { + "epoch": 0.13, + "grad_norm": 0.6643776297569275, + "learning_rate": 0.0001918694281379738, + "loss": 1.0773, + "step": 1530 + }, + { + "epoch": 0.13, + "grad_norm": 0.6194174885749817, + "learning_rate": 0.0001918169383845237, + "loss": 1.0095, + "step": 1535 + }, + { + "epoch": 0.13, + "grad_norm": 0.5246191024780273, + "learning_rate": 0.0001917642869698632, + "loss": 0.9783, + "step": 1540 + }, + { + "epoch": 0.13, + "grad_norm": 0.5903621912002563, + "learning_rate": 0.0001917114739866951, + "loss": 1.0164, + "step": 1545 + }, + { + "epoch": 0.13, + "grad_norm": 0.5673597455024719, + "learning_rate": 0.00019165849952800667, + "loss": 0.7935, + "step": 1550 + }, + { + "epoch": 0.13, + "grad_norm": 0.6583690643310547, + "learning_rate": 0.0001916053636870696, + "loss": 1.0241, + "step": 1555 + }, + { + "epoch": 0.13, + "grad_norm": 0.629821240901947, + "learning_rate": 0.00019155206655743965, + "loss": 0.9434, + "step": 1560 + }, + { + "epoch": 0.13, + "grad_norm": 0.6511532068252563, + "learning_rate": 0.00019149860823295656, + "loss": 1.1081, + "step": 1565 + }, + { + "epoch": 0.13, + "grad_norm": 0.517203152179718, + "learning_rate": 0.00019144498880774386, + "loss": 0.8622, + "step": 1570 + }, + { + "epoch": 0.13, + "grad_norm": 0.6585999727249146, + "learning_rate": 0.00019139120837620882, + "loss": 1.0642, + "step": 1575 + }, + { + "epoch": 0.13, + "grad_norm": 0.7876483798027039, + "learning_rate": 0.00019133726703304208, + "loss": 0.9939, + "step": 1580 + }, + { + "epoch": 0.13, + "grad_norm": 0.608589231967926, + "learning_rate": 0.00019128316487321772, + "loss": 0.9917, + "step": 1585 + }, + { + "epoch": 0.13, + "grad_norm": 0.5026513934135437, + "learning_rate": 0.00019122890199199284, + "loss": 0.8672, + "step": 1590 + }, + { + "epoch": 0.13, + "grad_norm": 0.5934941172599792, + "learning_rate": 0.0001911744784849076, + "loss": 0.8947, + "step": 1595 + }, + { + "epoch": 0.14, + "grad_norm": 0.5116546154022217, + "learning_rate": 0.00019111989444778492, + "loss": 1.0184, + "step": 1600 + }, + { + "epoch": 0.14, + "grad_norm": 0.7126646637916565, + "learning_rate": 0.00019106514997673047, + "loss": 0.9383, + "step": 1605 + }, + { + "epoch": 0.14, + "grad_norm": 0.6720495223999023, + "learning_rate": 0.00019101024516813224, + "loss": 1.0264, + "step": 1610 + }, + { + "epoch": 0.14, + "grad_norm": 0.5357206463813782, + "learning_rate": 0.00019095518011866063, + "loss": 1.0058, + "step": 1615 + }, + { + "epoch": 0.14, + "grad_norm": 0.6155744791030884, + "learning_rate": 0.0001908999549252682, + "loss": 1.0121, + "step": 1620 + }, + { + "epoch": 0.14, + "grad_norm": 0.6566094160079956, + "learning_rate": 0.0001908445696851893, + "loss": 0.9829, + "step": 1625 + }, + { + "epoch": 0.14, + "grad_norm": 0.5650537610054016, + "learning_rate": 0.00019078902449594032, + "loss": 0.923, + "step": 1630 + }, + { + "epoch": 0.14, + "grad_norm": 0.5771694779396057, + "learning_rate": 0.00019073331945531908, + "loss": 0.9511, + "step": 1635 + }, + { + "epoch": 0.14, + "grad_norm": 0.6931601166725159, + "learning_rate": 0.00019067745466140495, + "loss": 0.8401, + "step": 1640 + }, + { + "epoch": 0.14, + "grad_norm": 0.5769656300544739, + "learning_rate": 0.0001906214302125586, + "loss": 0.9956, + "step": 1645 + }, + { + "epoch": 0.14, + "grad_norm": 0.5963305830955505, + "learning_rate": 0.00019056524620742157, + "loss": 0.9114, + "step": 1650 + }, + { + "epoch": 0.14, + "grad_norm": 0.4932982325553894, + "learning_rate": 0.00019050890274491665, + "loss": 0.9108, + "step": 1655 + }, + { + "epoch": 0.14, + "grad_norm": 0.8321012854576111, + "learning_rate": 0.00019045239992424717, + "loss": 1.0043, + "step": 1660 + }, + { + "epoch": 0.14, + "grad_norm": 0.5227043032646179, + "learning_rate": 0.00019039573784489716, + "loss": 0.9348, + "step": 1665 + }, + { + "epoch": 0.14, + "grad_norm": 0.5955290794372559, + "learning_rate": 0.00019033891660663098, + "loss": 1.0376, + "step": 1670 + }, + { + "epoch": 0.14, + "grad_norm": 0.532315731048584, + "learning_rate": 0.00019028193630949323, + "loss": 0.9035, + "step": 1675 + }, + { + "epoch": 0.14, + "grad_norm": 0.6716378927230835, + "learning_rate": 0.00019022479705380857, + "loss": 0.9297, + "step": 1680 + }, + { + "epoch": 0.14, + "grad_norm": 0.7406010031700134, + "learning_rate": 0.0001901674989401816, + "loss": 0.8579, + "step": 1685 + }, + { + "epoch": 0.14, + "grad_norm": 0.7114114761352539, + "learning_rate": 0.00019011004206949652, + "loss": 0.9276, + "step": 1690 + }, + { + "epoch": 0.14, + "grad_norm": 0.6022518873214722, + "learning_rate": 0.00019005242654291708, + "loss": 1.0488, + "step": 1695 + }, + { + "epoch": 0.14, + "grad_norm": 0.6278284788131714, + "learning_rate": 0.00018999465246188644, + "loss": 1.0768, + "step": 1700 + }, + { + "epoch": 0.14, + "grad_norm": 0.579444408416748, + "learning_rate": 0.00018993671992812683, + "loss": 0.9125, + "step": 1705 + }, + { + "epoch": 0.14, + "grad_norm": 0.6062266826629639, + "learning_rate": 0.00018987862904363954, + "loss": 1.0356, + "step": 1710 + }, + { + "epoch": 0.14, + "grad_norm": 0.7265365123748779, + "learning_rate": 0.00018982037991070462, + "loss": 0.9464, + "step": 1715 + }, + { + "epoch": 0.15, + "grad_norm": 0.5000131130218506, + "learning_rate": 0.00018976197263188079, + "loss": 0.9396, + "step": 1720 + }, + { + "epoch": 0.15, + "grad_norm": 0.555634617805481, + "learning_rate": 0.00018970340731000516, + "loss": 0.9729, + "step": 1725 + }, + { + "epoch": 0.15, + "grad_norm": 0.6329852938652039, + "learning_rate": 0.00018964468404819313, + "loss": 0.8783, + "step": 1730 + }, + { + "epoch": 0.15, + "grad_norm": 0.6367356777191162, + "learning_rate": 0.00018958580294983822, + "loss": 1.0654, + "step": 1735 + }, + { + "epoch": 0.15, + "grad_norm": 0.5371712446212769, + "learning_rate": 0.00018952676411861184, + "loss": 0.9471, + "step": 1740 + }, + { + "epoch": 0.15, + "grad_norm": 0.5458500385284424, + "learning_rate": 0.00018946756765846304, + "loss": 0.9233, + "step": 1745 + }, + { + "epoch": 0.15, + "grad_norm": 0.6498488187789917, + "learning_rate": 0.00018940821367361847, + "loss": 0.9091, + "step": 1750 + }, + { + "epoch": 0.15, + "grad_norm": 0.6993901133537292, + "learning_rate": 0.00018934870226858217, + "loss": 0.9276, + "step": 1755 + }, + { + "epoch": 0.15, + "grad_norm": 0.6266675591468811, + "learning_rate": 0.0001892890335481353, + "loss": 0.8598, + "step": 1760 + }, + { + "epoch": 0.15, + "grad_norm": 0.5299180150032043, + "learning_rate": 0.00018922920761733596, + "loss": 0.925, + "step": 1765 + }, + { + "epoch": 0.15, + "grad_norm": 0.5607171654701233, + "learning_rate": 0.00018916922458151914, + "loss": 0.9816, + "step": 1770 + }, + { + "epoch": 0.15, + "grad_norm": 0.5217439532279968, + "learning_rate": 0.0001891090845462964, + "loss": 0.8764, + "step": 1775 + }, + { + "epoch": 0.15, + "grad_norm": 0.5624527931213379, + "learning_rate": 0.00018904878761755569, + "loss": 0.8804, + "step": 1780 + }, + { + "epoch": 0.15, + "grad_norm": 0.6318154335021973, + "learning_rate": 0.0001889883339014613, + "loss": 0.9651, + "step": 1785 + }, + { + "epoch": 0.15, + "grad_norm": 0.6358602046966553, + "learning_rate": 0.00018892772350445345, + "loss": 0.9051, + "step": 1790 + }, + { + "epoch": 0.15, + "grad_norm": 0.6364490389823914, + "learning_rate": 0.00018886695653324832, + "loss": 0.993, + "step": 1795 + }, + { + "epoch": 0.15, + "grad_norm": 0.64936363697052, + "learning_rate": 0.00018880603309483776, + "loss": 0.866, + "step": 1800 + }, + { + "epoch": 0.15, + "grad_norm": 0.6414142847061157, + "learning_rate": 0.00018874495329648908, + "loss": 1.0359, + "step": 1805 + }, + { + "epoch": 0.15, + "grad_norm": 0.6995022892951965, + "learning_rate": 0.00018868371724574488, + "loss": 0.8347, + "step": 1810 + }, + { + "epoch": 0.15, + "grad_norm": 0.5469136238098145, + "learning_rate": 0.00018862232505042288, + "loss": 0.9802, + "step": 1815 + }, + { + "epoch": 0.15, + "grad_norm": 0.7556516528129578, + "learning_rate": 0.00018856077681861578, + "loss": 0.9124, + "step": 1820 + }, + { + "epoch": 0.15, + "grad_norm": 0.6139054298400879, + "learning_rate": 0.0001884990726586909, + "loss": 0.9399, + "step": 1825 + }, + { + "epoch": 0.15, + "grad_norm": 0.5691984295845032, + "learning_rate": 0.00018843721267929023, + "loss": 0.8927, + "step": 1830 + }, + { + "epoch": 0.16, + "grad_norm": 0.7042884826660156, + "learning_rate": 0.00018837519698933002, + "loss": 1.0038, + "step": 1835 + }, + { + "epoch": 0.16, + "grad_norm": 0.6047410368919373, + "learning_rate": 0.00018831302569800073, + "loss": 0.879, + "step": 1840 + }, + { + "epoch": 0.16, + "grad_norm": 0.6905311346054077, + "learning_rate": 0.00018825069891476671, + "loss": 0.9217, + "step": 1845 + }, + { + "epoch": 0.16, + "grad_norm": 0.5696797966957092, + "learning_rate": 0.00018818821674936623, + "loss": 0.9863, + "step": 1850 + }, + { + "epoch": 0.16, + "grad_norm": 0.5839678645133972, + "learning_rate": 0.00018812557931181093, + "loss": 0.931, + "step": 1855 + }, + { + "epoch": 0.16, + "grad_norm": 0.6562358736991882, + "learning_rate": 0.000188062786712386, + "loss": 0.9697, + "step": 1860 + }, + { + "epoch": 0.16, + "grad_norm": 0.49899277091026306, + "learning_rate": 0.00018799983906164983, + "loss": 0.9136, + "step": 1865 + }, + { + "epoch": 0.16, + "grad_norm": 0.6238011121749878, + "learning_rate": 0.00018793673647043364, + "loss": 0.9229, + "step": 1870 + }, + { + "epoch": 0.16, + "grad_norm": 0.6438485383987427, + "learning_rate": 0.00018787347904984165, + "loss": 1.0229, + "step": 1875 + }, + { + "epoch": 0.16, + "grad_norm": 0.546998918056488, + "learning_rate": 0.00018781006691125053, + "loss": 0.8177, + "step": 1880 + }, + { + "epoch": 0.16, + "grad_norm": 0.6244615316390991, + "learning_rate": 0.0001877465001663095, + "loss": 1.0081, + "step": 1885 + }, + { + "epoch": 0.16, + "grad_norm": 0.6127185821533203, + "learning_rate": 0.0001876827789269399, + "loss": 0.9392, + "step": 1890 + }, + { + "epoch": 0.16, + "grad_norm": 0.7033048272132874, + "learning_rate": 0.0001876189033053351, + "loss": 1.0329, + "step": 1895 + }, + { + "epoch": 0.16, + "grad_norm": 0.7424870133399963, + "learning_rate": 0.00018755487341396028, + "loss": 1.0154, + "step": 1900 + }, + { + "epoch": 0.16, + "grad_norm": 0.5857945680618286, + "learning_rate": 0.00018749068936555228, + "loss": 1.0817, + "step": 1905 + }, + { + "epoch": 0.16, + "grad_norm": 0.6423769593238831, + "learning_rate": 0.00018742635127311935, + "loss": 0.9001, + "step": 1910 + }, + { + "epoch": 0.16, + "grad_norm": 0.5362260341644287, + "learning_rate": 0.00018736185924994096, + "loss": 0.8897, + "step": 1915 + }, + { + "epoch": 0.16, + "grad_norm": 0.5578159093856812, + "learning_rate": 0.00018729721340956758, + "loss": 0.9347, + "step": 1920 + }, + { + "epoch": 0.16, + "grad_norm": 0.5923319458961487, + "learning_rate": 0.0001872324138658206, + "loss": 0.9671, + "step": 1925 + }, + { + "epoch": 0.16, + "grad_norm": 0.5793907046318054, + "learning_rate": 0.00018716746073279184, + "loss": 0.8519, + "step": 1930 + }, + { + "epoch": 0.16, + "grad_norm": 0.5267802476882935, + "learning_rate": 0.00018710235412484373, + "loss": 0.9927, + "step": 1935 + }, + { + "epoch": 0.16, + "grad_norm": 0.5435567498207092, + "learning_rate": 0.00018703709415660887, + "loss": 0.8592, + "step": 1940 + }, + { + "epoch": 0.16, + "grad_norm": 0.7846719622612, + "learning_rate": 0.00018697168094298984, + "loss": 0.9461, + "step": 1945 + }, + { + "epoch": 0.16, + "grad_norm": 0.7979434728622437, + "learning_rate": 0.00018690611459915908, + "loss": 0.9974, + "step": 1950 + }, + { + "epoch": 0.17, + "grad_norm": 0.6998960375785828, + "learning_rate": 0.00018684039524055862, + "loss": 1.0231, + "step": 1955 + }, + { + "epoch": 0.17, + "grad_norm": 0.6466115117073059, + "learning_rate": 0.0001867745229828999, + "loss": 0.9781, + "step": 1960 + }, + { + "epoch": 0.17, + "grad_norm": 0.7325738072395325, + "learning_rate": 0.00018670849794216355, + "loss": 0.8436, + "step": 1965 + }, + { + "epoch": 0.17, + "grad_norm": 0.5265980362892151, + "learning_rate": 0.00018664232023459933, + "loss": 1.0024, + "step": 1970 + }, + { + "epoch": 0.17, + "grad_norm": 0.6271637082099915, + "learning_rate": 0.00018657598997672562, + "loss": 1.1172, + "step": 1975 + }, + { + "epoch": 0.17, + "grad_norm": 0.664203405380249, + "learning_rate": 0.00018650950728532948, + "loss": 1.0014, + "step": 1980 + }, + { + "epoch": 0.17, + "grad_norm": 0.6504744291305542, + "learning_rate": 0.00018644287227746636, + "loss": 0.9328, + "step": 1985 + }, + { + "epoch": 0.17, + "grad_norm": 0.6602086424827576, + "learning_rate": 0.0001863760850704599, + "loss": 1.1187, + "step": 1990 + }, + { + "epoch": 0.17, + "grad_norm": 0.5959579348564148, + "learning_rate": 0.0001863091457819017, + "loss": 0.8691, + "step": 1995 + }, + { + "epoch": 0.17, + "grad_norm": 0.5921610593795776, + "learning_rate": 0.00018624205452965112, + "loss": 1.0163, + "step": 2000 + }, + { + "epoch": 0.17, + "grad_norm": 0.5723516345024109, + "learning_rate": 0.00018617481143183508, + "loss": 1.004, + "step": 2005 + }, + { + "epoch": 0.17, + "grad_norm": 0.5717443823814392, + "learning_rate": 0.00018610741660684784, + "loss": 0.9705, + "step": 2010 + }, + { + "epoch": 0.17, + "grad_norm": 0.6303788423538208, + "learning_rate": 0.00018603987017335092, + "loss": 0.9824, + "step": 2015 + }, + { + "epoch": 0.17, + "grad_norm": 0.7067974805831909, + "learning_rate": 0.0001859721722502726, + "loss": 0.9011, + "step": 2020 + }, + { + "epoch": 0.17, + "grad_norm": 0.5747745633125305, + "learning_rate": 0.000185904322956808, + "loss": 0.9616, + "step": 2025 + }, + { + "epoch": 0.17, + "grad_norm": 0.5667222142219543, + "learning_rate": 0.0001858363224124187, + "loss": 0.8805, + "step": 2030 + }, + { + "epoch": 0.17, + "grad_norm": 0.7209914326667786, + "learning_rate": 0.0001857681707368326, + "loss": 1.0061, + "step": 2035 + }, + { + "epoch": 0.17, + "grad_norm": 0.5965637564659119, + "learning_rate": 0.0001856998680500438, + "loss": 0.9908, + "step": 2040 + }, + { + "epoch": 0.17, + "grad_norm": 0.5698437094688416, + "learning_rate": 0.00018563141447231211, + "loss": 1.0224, + "step": 2045 + }, + { + "epoch": 0.17, + "grad_norm": 0.5468728542327881, + "learning_rate": 0.0001855628101241631, + "loss": 0.9173, + "step": 2050 + }, + { + "epoch": 0.17, + "grad_norm": 0.7616835832595825, + "learning_rate": 0.00018549405512638783, + "loss": 1.0199, + "step": 2055 + }, + { + "epoch": 0.17, + "grad_norm": 0.6229009032249451, + "learning_rate": 0.00018542514960004253, + "loss": 0.9385, + "step": 2060 + }, + { + "epoch": 0.17, + "grad_norm": 0.6399763226509094, + "learning_rate": 0.0001853560936664485, + "loss": 0.9203, + "step": 2065 + }, + { + "epoch": 0.17, + "grad_norm": 0.6714853048324585, + "learning_rate": 0.00018528688744719193, + "loss": 0.8532, + "step": 2070 + }, + { + "epoch": 0.18, + "grad_norm": 0.6095345616340637, + "learning_rate": 0.0001852175310641235, + "loss": 0.9489, + "step": 2075 + }, + { + "epoch": 0.18, + "grad_norm": 0.6012184619903564, + "learning_rate": 0.00018514802463935834, + "loss": 1.0208, + "step": 2080 + }, + { + "epoch": 0.18, + "grad_norm": 0.4830770492553711, + "learning_rate": 0.00018507836829527574, + "loss": 0.939, + "step": 2085 + }, + { + "epoch": 0.18, + "grad_norm": 0.6476888656616211, + "learning_rate": 0.000185008562154519, + "loss": 0.9238, + "step": 2090 + }, + { + "epoch": 0.18, + "grad_norm": 0.6635558009147644, + "learning_rate": 0.00018493860633999508, + "loss": 0.8019, + "step": 2095 + }, + { + "epoch": 0.18, + "grad_norm": 0.5985074639320374, + "learning_rate": 0.00018486850097487457, + "loss": 0.9802, + "step": 2100 + }, + { + "epoch": 0.18, + "grad_norm": 0.6918281316757202, + "learning_rate": 0.00018479824618259128, + "loss": 1.0577, + "step": 2105 + }, + { + "epoch": 0.18, + "grad_norm": 0.617668867111206, + "learning_rate": 0.0001847278420868422, + "loss": 0.9448, + "step": 2110 + }, + { + "epoch": 0.18, + "grad_norm": 0.5378293991088867, + "learning_rate": 0.00018465728881158708, + "loss": 0.8489, + "step": 2115 + }, + { + "epoch": 0.18, + "grad_norm": 0.5690131187438965, + "learning_rate": 0.00018458658648104844, + "loss": 1.0106, + "step": 2120 + }, + { + "epoch": 0.18, + "grad_norm": 0.5910131335258484, + "learning_rate": 0.00018451573521971123, + "loss": 0.9338, + "step": 2125 + }, + { + "epoch": 0.18, + "grad_norm": 0.6318804025650024, + "learning_rate": 0.00018444473515232256, + "loss": 0.9011, + "step": 2130 + }, + { + "epoch": 0.18, + "grad_norm": 0.527715265750885, + "learning_rate": 0.00018437358640389158, + "loss": 0.8679, + "step": 2135 + }, + { + "epoch": 0.18, + "grad_norm": 0.5651088953018188, + "learning_rate": 0.00018430228909968921, + "loss": 0.9532, + "step": 2140 + }, + { + "epoch": 0.18, + "grad_norm": 0.5673348903656006, + "learning_rate": 0.00018423084336524793, + "loss": 0.9157, + "step": 2145 + }, + { + "epoch": 0.18, + "grad_norm": 0.6541638374328613, + "learning_rate": 0.00018415924932636157, + "loss": 0.7572, + "step": 2150 + }, + { + "epoch": 0.18, + "grad_norm": 0.6314670443534851, + "learning_rate": 0.0001840875071090851, + "loss": 0.917, + "step": 2155 + }, + { + "epoch": 0.18, + "grad_norm": 0.5921071171760559, + "learning_rate": 0.00018401561683973434, + "loss": 0.939, + "step": 2160 + }, + { + "epoch": 0.18, + "grad_norm": 0.6067410707473755, + "learning_rate": 0.0001839435786448858, + "loss": 0.9877, + "step": 2165 + }, + { + "epoch": 0.18, + "grad_norm": 0.6221805214881897, + "learning_rate": 0.00018387139265137642, + "loss": 0.9306, + "step": 2170 + }, + { + "epoch": 0.18, + "grad_norm": 0.5830126404762268, + "learning_rate": 0.00018379905898630345, + "loss": 0.8507, + "step": 2175 + }, + { + "epoch": 0.18, + "grad_norm": 0.5757225751876831, + "learning_rate": 0.00018372657777702406, + "loss": 0.969, + "step": 2180 + }, + { + "epoch": 0.18, + "grad_norm": 0.7756751775741577, + "learning_rate": 0.00018365394915115517, + "loss": 1.1542, + "step": 2185 + }, + { + "epoch": 0.18, + "grad_norm": 0.6099016666412354, + "learning_rate": 0.0001835811732365734, + "loss": 1.0071, + "step": 2190 + }, + { + "epoch": 0.19, + "grad_norm": 0.7187824249267578, + "learning_rate": 0.00018350825016141457, + "loss": 0.9009, + "step": 2195 + }, + { + "epoch": 0.19, + "grad_norm": 0.7240955233573914, + "learning_rate": 0.00018343518005407367, + "loss": 0.8871, + "step": 2200 + }, + { + "epoch": 0.19, + "grad_norm": 0.7031527757644653, + "learning_rate": 0.0001833619630432045, + "loss": 1.0222, + "step": 2205 + }, + { + "epoch": 0.19, + "grad_norm": 0.7623906135559082, + "learning_rate": 0.00018328859925771958, + "loss": 0.9689, + "step": 2210 + }, + { + "epoch": 0.19, + "grad_norm": 0.6677159667015076, + "learning_rate": 0.0001832150888267898, + "loss": 0.8876, + "step": 2215 + }, + { + "epoch": 0.19, + "grad_norm": 0.579214334487915, + "learning_rate": 0.00018314143187984433, + "loss": 0.9057, + "step": 2220 + }, + { + "epoch": 0.19, + "grad_norm": 0.7031520009040833, + "learning_rate": 0.00018306762854657023, + "loss": 1.0086, + "step": 2225 + }, + { + "epoch": 0.19, + "grad_norm": 0.6051443219184875, + "learning_rate": 0.00018299367895691234, + "loss": 0.8878, + "step": 2230 + }, + { + "epoch": 0.19, + "grad_norm": 0.614453911781311, + "learning_rate": 0.00018291958324107298, + "loss": 0.9012, + "step": 2235 + }, + { + "epoch": 0.19, + "grad_norm": 0.7160709500312805, + "learning_rate": 0.00018284534152951176, + "loss": 0.9625, + "step": 2240 + }, + { + "epoch": 0.19, + "grad_norm": 0.5451361536979675, + "learning_rate": 0.00018277095395294538, + "loss": 0.9954, + "step": 2245 + }, + { + "epoch": 0.19, + "grad_norm": 0.5429083704948425, + "learning_rate": 0.00018269642064234733, + "loss": 0.8744, + "step": 2250 + }, + { + "epoch": 0.19, + "grad_norm": 0.6032977104187012, + "learning_rate": 0.0001826217417289477, + "loss": 0.93, + "step": 2255 + }, + { + "epoch": 0.19, + "grad_norm": 0.5320645570755005, + "learning_rate": 0.00018254691734423295, + "loss": 0.8629, + "step": 2260 + }, + { + "epoch": 0.19, + "grad_norm": 0.5685572028160095, + "learning_rate": 0.00018247194761994567, + "loss": 1.0152, + "step": 2265 + }, + { + "epoch": 0.19, + "grad_norm": 0.5646812319755554, + "learning_rate": 0.00018239683268808432, + "loss": 1.0437, + "step": 2270 + }, + { + "epoch": 0.19, + "grad_norm": 0.5840547680854797, + "learning_rate": 0.00018232157268090307, + "loss": 1.0193, + "step": 2275 + }, + { + "epoch": 0.19, + "grad_norm": 0.5709146857261658, + "learning_rate": 0.00018224616773091147, + "loss": 0.9139, + "step": 2280 + }, + { + "epoch": 0.19, + "grad_norm": 0.7946630120277405, + "learning_rate": 0.00018217061797087434, + "loss": 0.8473, + "step": 2285 + }, + { + "epoch": 0.19, + "grad_norm": 0.6987430453300476, + "learning_rate": 0.00018209492353381138, + "loss": 0.9721, + "step": 2290 + }, + { + "epoch": 0.19, + "grad_norm": 0.7423457503318787, + "learning_rate": 0.00018201908455299707, + "loss": 0.9289, + "step": 2295 + }, + { + "epoch": 0.19, + "grad_norm": 0.6933215856552124, + "learning_rate": 0.00018194310116196043, + "loss": 0.9983, + "step": 2300 + }, + { + "epoch": 0.19, + "grad_norm": 0.5887870192527771, + "learning_rate": 0.00018186697349448463, + "loss": 0.9486, + "step": 2305 + }, + { + "epoch": 0.2, + "grad_norm": 0.6001654267311096, + "learning_rate": 0.000181790701684607, + "loss": 0.9821, + "step": 2310 + }, + { + "epoch": 0.2, + "grad_norm": 0.6377297043800354, + "learning_rate": 0.0001817142858666185, + "loss": 1.0207, + "step": 2315 + }, + { + "epoch": 0.2, + "grad_norm": 0.6929865479469299, + "learning_rate": 0.00018163772617506383, + "loss": 0.9276, + "step": 2320 + }, + { + "epoch": 0.2, + "grad_norm": 0.6459087133407593, + "learning_rate": 0.00018156102274474086, + "loss": 0.9748, + "step": 2325 + }, + { + "epoch": 0.2, + "grad_norm": 0.5593891143798828, + "learning_rate": 0.00018148417571070056, + "loss": 0.8451, + "step": 2330 + }, + { + "epoch": 0.2, + "grad_norm": 0.7829484939575195, + "learning_rate": 0.00018140718520824684, + "loss": 0.8557, + "step": 2335 + }, + { + "epoch": 0.2, + "grad_norm": 0.6354033350944519, + "learning_rate": 0.0001813300513729361, + "loss": 1.0291, + "step": 2340 + }, + { + "epoch": 0.2, + "grad_norm": 0.5774961709976196, + "learning_rate": 0.0001812527743405772, + "loss": 0.917, + "step": 2345 + }, + { + "epoch": 0.2, + "grad_norm": 0.632072925567627, + "learning_rate": 0.00018117535424723102, + "loss": 0.9833, + "step": 2350 + }, + { + "epoch": 0.2, + "grad_norm": 0.55246901512146, + "learning_rate": 0.0001810977912292104, + "loss": 1.0615, + "step": 2355 + }, + { + "epoch": 0.2, + "grad_norm": 0.6602877974510193, + "learning_rate": 0.00018102008542307982, + "loss": 1.0171, + "step": 2360 + }, + { + "epoch": 0.2, + "grad_norm": 0.487744003534317, + "learning_rate": 0.00018094223696565512, + "loss": 0.7264, + "step": 2365 + }, + { + "epoch": 0.2, + "grad_norm": 0.6929968595504761, + "learning_rate": 0.0001808642459940034, + "loss": 0.9428, + "step": 2370 + }, + { + "epoch": 0.2, + "grad_norm": 0.7799659371376038, + "learning_rate": 0.0001807861126454426, + "loss": 0.945, + "step": 2375 + }, + { + "epoch": 0.2, + "grad_norm": 0.5674405694007874, + "learning_rate": 0.00018070783705754134, + "loss": 1.0994, + "step": 2380 + }, + { + "epoch": 0.2, + "grad_norm": 0.5485437512397766, + "learning_rate": 0.00018062941936811868, + "loss": 0.9951, + "step": 2385 + }, + { + "epoch": 0.2, + "grad_norm": 0.5618184804916382, + "learning_rate": 0.00018055085971524398, + "loss": 0.9063, + "step": 2390 + }, + { + "epoch": 0.2, + "grad_norm": 0.6102981567382812, + "learning_rate": 0.0001804721582372364, + "loss": 0.9701, + "step": 2395 + }, + { + "epoch": 0.2, + "grad_norm": 0.506664514541626, + "learning_rate": 0.00018039331507266492, + "loss": 0.9649, + "step": 2400 + }, + { + "epoch": 0.2, + "grad_norm": 0.7159459590911865, + "learning_rate": 0.00018031433036034793, + "loss": 0.9289, + "step": 2405 + }, + { + "epoch": 0.2, + "grad_norm": 0.9469946026802063, + "learning_rate": 0.0001802352042393531, + "loss": 0.8951, + "step": 2410 + }, + { + "epoch": 0.2, + "grad_norm": 0.5196167230606079, + "learning_rate": 0.00018015593684899702, + "loss": 0.9079, + "step": 2415 + }, + { + "epoch": 0.2, + "grad_norm": 0.5103448033332825, + "learning_rate": 0.000180076528328845, + "loss": 0.9119, + "step": 2420 + }, + { + "epoch": 0.2, + "grad_norm": 0.5514436364173889, + "learning_rate": 0.0001799969788187109, + "loss": 0.9128, + "step": 2425 + }, + { + "epoch": 0.21, + "grad_norm": 0.609643280506134, + "learning_rate": 0.0001799172884586568, + "loss": 0.8515, + "step": 2430 + }, + { + "epoch": 0.21, + "grad_norm": 0.82509446144104, + "learning_rate": 0.0001798374573889927, + "loss": 0.901, + "step": 2435 + }, + { + "epoch": 0.21, + "grad_norm": 0.563224196434021, + "learning_rate": 0.00017975748575027646, + "loss": 0.9706, + "step": 2440 + }, + { + "epoch": 0.21, + "grad_norm": 0.5628690123558044, + "learning_rate": 0.00017967737368331337, + "loss": 0.9594, + "step": 2445 + }, + { + "epoch": 0.21, + "grad_norm": 0.5404948592185974, + "learning_rate": 0.00017959712132915599, + "loss": 0.903, + "step": 2450 + }, + { + "epoch": 0.21, + "grad_norm": 0.6303789615631104, + "learning_rate": 0.00017951672882910385, + "loss": 0.9149, + "step": 2455 + }, + { + "epoch": 0.21, + "grad_norm": 0.8092197179794312, + "learning_rate": 0.0001794361963247033, + "loss": 0.9326, + "step": 2460 + }, + { + "epoch": 0.21, + "grad_norm": 0.5375030040740967, + "learning_rate": 0.00017935552395774708, + "loss": 0.973, + "step": 2465 + }, + { + "epoch": 0.21, + "grad_norm": 0.6692082285881042, + "learning_rate": 0.00017927471187027436, + "loss": 0.9744, + "step": 2470 + }, + { + "epoch": 0.21, + "grad_norm": 0.6109797358512878, + "learning_rate": 0.00017919376020457018, + "loss": 0.9621, + "step": 2475 + }, + { + "epoch": 0.21, + "grad_norm": 0.5392331480979919, + "learning_rate": 0.0001791126691031653, + "loss": 0.9281, + "step": 2480 + }, + { + "epoch": 0.21, + "grad_norm": 0.7013811469078064, + "learning_rate": 0.00017903143870883615, + "loss": 0.9098, + "step": 2485 + }, + { + "epoch": 0.21, + "grad_norm": 0.700690507888794, + "learning_rate": 0.00017895006916460426, + "loss": 0.9465, + "step": 2490 + }, + { + "epoch": 0.21, + "grad_norm": 0.6423932313919067, + "learning_rate": 0.00017886856061373623, + "loss": 0.891, + "step": 2495 + }, + { + "epoch": 0.21, + "grad_norm": 0.5714556574821472, + "learning_rate": 0.00017878691319974337, + "loss": 0.8927, + "step": 2500 + }, + { + "epoch": 0.21, + "grad_norm": 0.6294008493423462, + "learning_rate": 0.00017870512706638148, + "loss": 0.976, + "step": 2505 + }, + { + "epoch": 0.21, + "grad_norm": 0.6031877398490906, + "learning_rate": 0.0001786232023576507, + "loss": 0.9927, + "step": 2510 + }, + { + "epoch": 0.21, + "grad_norm": 0.5919615626335144, + "learning_rate": 0.00017854113921779509, + "loss": 1.0071, + "step": 2515 + }, + { + "epoch": 0.21, + "grad_norm": 0.5988855957984924, + "learning_rate": 0.00017845893779130237, + "loss": 0.8991, + "step": 2520 + }, + { + "epoch": 0.21, + "grad_norm": 0.7165807485580444, + "learning_rate": 0.00017837659822290386, + "loss": 1.0104, + "step": 2525 + }, + { + "epoch": 0.21, + "grad_norm": 0.5668060183525085, + "learning_rate": 0.00017829412065757398, + "loss": 1.0519, + "step": 2530 + }, + { + "epoch": 0.21, + "grad_norm": 0.5884309411048889, + "learning_rate": 0.0001782115052405303, + "loss": 0.9941, + "step": 2535 + }, + { + "epoch": 0.21, + "grad_norm": 0.6587290167808533, + "learning_rate": 0.00017812875211723291, + "loss": 0.936, + "step": 2540 + }, + { + "epoch": 0.21, + "grad_norm": 0.615807056427002, + "learning_rate": 0.00017804586143338455, + "loss": 0.9558, + "step": 2545 + }, + { + "epoch": 0.22, + "grad_norm": 0.6964926719665527, + "learning_rate": 0.00017796283333492997, + "loss": 1.0988, + "step": 2550 + }, + { + "epoch": 0.22, + "grad_norm": 0.6390440464019775, + "learning_rate": 0.00017787966796805596, + "loss": 0.979, + "step": 2555 + }, + { + "epoch": 0.22, + "grad_norm": 0.6075397729873657, + "learning_rate": 0.00017779636547919102, + "loss": 0.9705, + "step": 2560 + }, + { + "epoch": 0.22, + "grad_norm": 0.65053391456604, + "learning_rate": 0.00017771292601500505, + "loss": 1.0245, + "step": 2565 + }, + { + "epoch": 0.22, + "grad_norm": 0.5904353857040405, + "learning_rate": 0.00017762934972240913, + "loss": 0.8593, + "step": 2570 + }, + { + "epoch": 0.22, + "grad_norm": 0.6183270215988159, + "learning_rate": 0.0001775456367485552, + "loss": 0.8164, + "step": 2575 + }, + { + "epoch": 0.22, + "grad_norm": 0.5901939272880554, + "learning_rate": 0.00017746178724083593, + "loss": 0.8847, + "step": 2580 + }, + { + "epoch": 0.22, + "grad_norm": 0.6754553318023682, + "learning_rate": 0.00017737780134688435, + "loss": 1.0066, + "step": 2585 + }, + { + "epoch": 0.22, + "grad_norm": 0.5526779890060425, + "learning_rate": 0.00017729367921457363, + "loss": 0.9569, + "step": 2590 + }, + { + "epoch": 0.22, + "grad_norm": 0.618124783039093, + "learning_rate": 0.00017720942099201678, + "loss": 0.9416, + "step": 2595 + }, + { + "epoch": 0.22, + "grad_norm": 0.5879096388816833, + "learning_rate": 0.00017712502682756646, + "loss": 0.8886, + "step": 2600 + }, + { + "epoch": 0.22, + "grad_norm": 0.5963704586029053, + "learning_rate": 0.00017704049686981471, + "loss": 1.0272, + "step": 2605 + }, + { + "epoch": 0.22, + "grad_norm": 0.6259241104125977, + "learning_rate": 0.0001769558312675926, + "loss": 0.9514, + "step": 2610 + }, + { + "epoch": 0.22, + "grad_norm": 0.5682948231697083, + "learning_rate": 0.00017687103016997003, + "loss": 0.8188, + "step": 2615 + }, + { + "epoch": 0.22, + "grad_norm": 0.7589099407196045, + "learning_rate": 0.0001767860937262555, + "loss": 0.9704, + "step": 2620 + }, + { + "epoch": 0.22, + "grad_norm": 0.6059669256210327, + "learning_rate": 0.0001767010220859958, + "loss": 0.9102, + "step": 2625 + }, + { + "epoch": 0.22, + "grad_norm": 0.6229948997497559, + "learning_rate": 0.00017661581539897577, + "loss": 0.8566, + "step": 2630 + }, + { + "epoch": 0.22, + "grad_norm": 0.5671799778938293, + "learning_rate": 0.000176530473815218, + "loss": 0.9599, + "step": 2635 + }, + { + "epoch": 0.22, + "grad_norm": 0.593555748462677, + "learning_rate": 0.00017644499748498263, + "loss": 0.8292, + "step": 2640 + }, + { + "epoch": 0.22, + "grad_norm": 0.6068690419197083, + "learning_rate": 0.000176359386558767, + "loss": 0.8347, + "step": 2645 + }, + { + "epoch": 0.22, + "grad_norm": 0.4839940667152405, + "learning_rate": 0.00017627364118730544, + "loss": 0.7297, + "step": 2650 + }, + { + "epoch": 0.22, + "grad_norm": 0.6615285873413086, + "learning_rate": 0.00017618776152156901, + "loss": 0.989, + "step": 2655 + }, + { + "epoch": 0.22, + "grad_norm": 0.5995057821273804, + "learning_rate": 0.00017610174771276525, + "loss": 0.9066, + "step": 2660 + }, + { + "epoch": 0.23, + "grad_norm": 0.6151632070541382, + "learning_rate": 0.0001760155999123378, + "loss": 0.9305, + "step": 2665 + }, + { + "epoch": 0.23, + "grad_norm": 0.6653651595115662, + "learning_rate": 0.0001759293182719664, + "loss": 1.0602, + "step": 2670 + }, + { + "epoch": 0.23, + "grad_norm": 0.6078524589538574, + "learning_rate": 0.00017584290294356616, + "loss": 0.9534, + "step": 2675 + }, + { + "epoch": 0.23, + "grad_norm": 0.7024226188659668, + "learning_rate": 0.00017575635407928784, + "loss": 0.9684, + "step": 2680 + }, + { + "epoch": 0.23, + "grad_norm": 0.5981400012969971, + "learning_rate": 0.00017566967183151714, + "loss": 0.9116, + "step": 2685 + }, + { + "epoch": 0.23, + "grad_norm": 0.5973444581031799, + "learning_rate": 0.00017558285635287465, + "loss": 0.9494, + "step": 2690 + }, + { + "epoch": 0.23, + "grad_norm": 0.7531886696815491, + "learning_rate": 0.00017549590779621563, + "loss": 0.9766, + "step": 2695 + }, + { + "epoch": 0.23, + "grad_norm": 0.5628442168235779, + "learning_rate": 0.00017540882631462954, + "loss": 0.9382, + "step": 2700 + }, + { + "epoch": 0.23, + "grad_norm": 0.6290925741195679, + "learning_rate": 0.00017532161206143993, + "loss": 0.9072, + "step": 2705 + }, + { + "epoch": 0.23, + "grad_norm": 0.6227972507476807, + "learning_rate": 0.0001752342651902041, + "loss": 0.9144, + "step": 2710 + }, + { + "epoch": 0.23, + "grad_norm": 0.48317110538482666, + "learning_rate": 0.00017514678585471284, + "loss": 0.8375, + "step": 2715 + }, + { + "epoch": 0.23, + "grad_norm": 0.8648179769515991, + "learning_rate": 0.00017505917420899018, + "loss": 0.8783, + "step": 2720 + }, + { + "epoch": 0.23, + "grad_norm": 0.5906129479408264, + "learning_rate": 0.00017497143040729314, + "loss": 0.9017, + "step": 2725 + }, + { + "epoch": 0.23, + "grad_norm": 0.7248625159263611, + "learning_rate": 0.0001748835546041114, + "loss": 0.9519, + "step": 2730 + }, + { + "epoch": 0.23, + "grad_norm": 0.5871652364730835, + "learning_rate": 0.000174795546954167, + "loss": 0.9223, + "step": 2735 + }, + { + "epoch": 0.23, + "grad_norm": 0.6332212686538696, + "learning_rate": 0.00017470740761241422, + "loss": 0.8887, + "step": 2740 + }, + { + "epoch": 0.23, + "grad_norm": 0.6930819749832153, + "learning_rate": 0.00017461913673403915, + "loss": 1.0195, + "step": 2745 + }, + { + "epoch": 0.23, + "grad_norm": 0.4964215159416199, + "learning_rate": 0.00017453073447445952, + "loss": 0.8916, + "step": 2750 + }, + { + "epoch": 0.23, + "grad_norm": 0.6864855885505676, + "learning_rate": 0.0001744422009893243, + "loss": 0.8499, + "step": 2755 + }, + { + "epoch": 0.23, + "grad_norm": 0.6707122921943665, + "learning_rate": 0.00017435353643451357, + "loss": 0.9833, + "step": 2760 + }, + { + "epoch": 0.23, + "grad_norm": 0.6430404782295227, + "learning_rate": 0.00017426474096613812, + "loss": 0.9741, + "step": 2765 + }, + { + "epoch": 0.23, + "grad_norm": 0.5402218103408813, + "learning_rate": 0.00017417581474053938, + "loss": 0.8927, + "step": 2770 + }, + { + "epoch": 0.23, + "grad_norm": 0.6882984638214111, + "learning_rate": 0.00017408675791428886, + "loss": 1.0262, + "step": 2775 + }, + { + "epoch": 0.23, + "grad_norm": 0.5932244062423706, + "learning_rate": 0.00017399757064418805, + "loss": 0.8567, + "step": 2780 + }, + { + "epoch": 0.24, + "grad_norm": 0.6488876938819885, + "learning_rate": 0.00017390825308726817, + "loss": 1.122, + "step": 2785 + }, + { + "epoch": 0.24, + "grad_norm": 0.6389744281768799, + "learning_rate": 0.00017381880540078974, + "loss": 1.0426, + "step": 2790 + }, + { + "epoch": 0.24, + "grad_norm": 0.615460991859436, + "learning_rate": 0.0001737292277422425, + "loss": 0.9403, + "step": 2795 + }, + { + "epoch": 0.24, + "grad_norm": 0.5837079882621765, + "learning_rate": 0.000173639520269345, + "loss": 0.8836, + "step": 2800 + }, + { + "epoch": 0.24, + "grad_norm": 0.7517130374908447, + "learning_rate": 0.0001735496831400443, + "loss": 0.873, + "step": 2805 + }, + { + "epoch": 0.24, + "grad_norm": 0.7326067686080933, + "learning_rate": 0.00017345971651251576, + "loss": 1.0026, + "step": 2810 + }, + { + "epoch": 0.24, + "grad_norm": 0.6494262218475342, + "learning_rate": 0.00017336962054516277, + "loss": 0.9344, + "step": 2815 + }, + { + "epoch": 0.24, + "grad_norm": 0.7557472586631775, + "learning_rate": 0.0001732793953966165, + "loss": 0.8161, + "step": 2820 + }, + { + "epoch": 0.24, + "grad_norm": 0.6807621717453003, + "learning_rate": 0.00017318904122573542, + "loss": 0.952, + "step": 2825 + }, + { + "epoch": 0.24, + "grad_norm": 0.6098347902297974, + "learning_rate": 0.00017309855819160535, + "loss": 0.8668, + "step": 2830 + }, + { + "epoch": 0.24, + "grad_norm": 0.4934307038784027, + "learning_rate": 0.00017300794645353884, + "loss": 0.914, + "step": 2835 + }, + { + "epoch": 0.24, + "grad_norm": 0.6054341197013855, + "learning_rate": 0.00017291720617107516, + "loss": 0.9653, + "step": 2840 + }, + { + "epoch": 0.24, + "grad_norm": 0.5854093432426453, + "learning_rate": 0.00017282633750397984, + "loss": 0.9033, + "step": 2845 + }, + { + "epoch": 0.24, + "grad_norm": 0.6354621648788452, + "learning_rate": 0.0001727353406122445, + "loss": 1.0004, + "step": 2850 + }, + { + "epoch": 0.24, + "grad_norm": 0.7072866559028625, + "learning_rate": 0.00017264421565608648, + "loss": 1.042, + "step": 2855 + }, + { + "epoch": 0.24, + "grad_norm": 0.6686625480651855, + "learning_rate": 0.00017255296279594862, + "loss": 0.984, + "step": 2860 + }, + { + "epoch": 0.24, + "grad_norm": 0.6095872521400452, + "learning_rate": 0.000172461582192499, + "loss": 0.9247, + "step": 2865 + }, + { + "epoch": 0.24, + "grad_norm": 0.5870845317840576, + "learning_rate": 0.00017237007400663053, + "loss": 0.8918, + "step": 2870 + }, + { + "epoch": 0.24, + "grad_norm": 0.6446424722671509, + "learning_rate": 0.0001722784383994608, + "loss": 0.9891, + "step": 2875 + }, + { + "epoch": 0.24, + "grad_norm": 0.598726212978363, + "learning_rate": 0.00017218667553233182, + "loss": 0.9481, + "step": 2880 + }, + { + "epoch": 0.24, + "grad_norm": 0.5482841730117798, + "learning_rate": 0.00017209478556680957, + "loss": 0.9311, + "step": 2885 + }, + { + "epoch": 0.24, + "grad_norm": 0.6305568814277649, + "learning_rate": 0.00017200276866468375, + "loss": 0.8996, + "step": 2890 + }, + { + "epoch": 0.24, + "grad_norm": 0.4958514869213104, + "learning_rate": 0.0001719106249879678, + "loss": 0.8383, + "step": 2895 + }, + { + "epoch": 0.24, + "grad_norm": 0.7220568060874939, + "learning_rate": 0.00017181835469889812, + "loss": 0.935, + "step": 2900 + }, + { + "epoch": 0.25, + "grad_norm": 0.6705941557884216, + "learning_rate": 0.00017172595795993413, + "loss": 0.9062, + "step": 2905 + }, + { + "epoch": 0.25, + "grad_norm": 0.5938014984130859, + "learning_rate": 0.0001716334349337579, + "loss": 0.8727, + "step": 2910 + }, + { + "epoch": 0.25, + "grad_norm": 0.7060703039169312, + "learning_rate": 0.00017154078578327387, + "loss": 0.9586, + "step": 2915 + }, + { + "epoch": 0.25, + "grad_norm": 0.6116676926612854, + "learning_rate": 0.00017144801067160844, + "loss": 0.9642, + "step": 2920 + }, + { + "epoch": 0.25, + "grad_norm": 0.5843990445137024, + "learning_rate": 0.00017135510976211, + "loss": 0.8538, + "step": 2925 + }, + { + "epoch": 0.25, + "grad_norm": 0.6299535632133484, + "learning_rate": 0.0001712620832183482, + "loss": 0.9251, + "step": 2930 + }, + { + "epoch": 0.25, + "grad_norm": 0.4945293664932251, + "learning_rate": 0.00017116893120411398, + "loss": 0.952, + "step": 2935 + }, + { + "epoch": 0.25, + "grad_norm": 0.6320123672485352, + "learning_rate": 0.00017107565388341925, + "loss": 0.8861, + "step": 2940 + }, + { + "epoch": 0.25, + "grad_norm": 0.7203616499900818, + "learning_rate": 0.0001709822514204965, + "loss": 0.9179, + "step": 2945 + }, + { + "epoch": 0.25, + "grad_norm": 0.6014286279678345, + "learning_rate": 0.00017088872397979854, + "loss": 0.9257, + "step": 2950 + }, + { + "epoch": 0.25, + "grad_norm": 0.6653793454170227, + "learning_rate": 0.00017079507172599828, + "loss": 0.971, + "step": 2955 + }, + { + "epoch": 0.25, + "grad_norm": 0.5894360542297363, + "learning_rate": 0.00017070129482398832, + "loss": 0.9119, + "step": 2960 + }, + { + "epoch": 0.25, + "grad_norm": 0.662100076675415, + "learning_rate": 0.00017060739343888076, + "loss": 0.935, + "step": 2965 + }, + { + "epoch": 0.25, + "grad_norm": 0.8383188247680664, + "learning_rate": 0.00017051336773600686, + "loss": 0.9977, + "step": 2970 + }, + { + "epoch": 0.25, + "grad_norm": 0.6464200019836426, + "learning_rate": 0.00017041921788091684, + "loss": 0.9875, + "step": 2975 + }, + { + "epoch": 0.25, + "grad_norm": 0.670562744140625, + "learning_rate": 0.0001703249440393794, + "loss": 1.0091, + "step": 2980 + }, + { + "epoch": 0.25, + "grad_norm": 0.6573236584663391, + "learning_rate": 0.0001702305463773816, + "loss": 0.8405, + "step": 2985 + }, + { + "epoch": 0.25, + "grad_norm": 0.6789873838424683, + "learning_rate": 0.00017013602506112853, + "loss": 0.8046, + "step": 2990 + }, + { + "epoch": 0.25, + "grad_norm": 0.6313668489456177, + "learning_rate": 0.00017004138025704298, + "loss": 0.9618, + "step": 2995 + }, + { + "epoch": 0.25, + "grad_norm": 0.6663994193077087, + "learning_rate": 0.00016994661213176512, + "loss": 0.9682, + "step": 3000 + }, + { + "epoch": 0.25, + "grad_norm": 0.6564815044403076, + "learning_rate": 0.00016985172085215235, + "loss": 0.9292, + "step": 3005 + }, + { + "epoch": 0.25, + "grad_norm": 0.5784175992012024, + "learning_rate": 0.00016975670658527875, + "loss": 0.9443, + "step": 3010 + }, + { + "epoch": 0.25, + "grad_norm": 0.6862292289733887, + "learning_rate": 0.00016966156949843513, + "loss": 0.8598, + "step": 3015 + }, + { + "epoch": 0.26, + "grad_norm": 0.5390055179595947, + "learning_rate": 0.0001695663097591284, + "loss": 0.8773, + "step": 3020 + }, + { + "epoch": 0.26, + "grad_norm": 0.6282461881637573, + "learning_rate": 0.00016947092753508147, + "loss": 0.9229, + "step": 3025 + }, + { + "epoch": 0.26, + "grad_norm": 0.6688674688339233, + "learning_rate": 0.00016937542299423294, + "loss": 0.9077, + "step": 3030 + }, + { + "epoch": 0.26, + "grad_norm": 0.5949118733406067, + "learning_rate": 0.00016927979630473677, + "loss": 0.9493, + "step": 3035 + }, + { + "epoch": 0.26, + "grad_norm": 0.5177481174468994, + "learning_rate": 0.0001691840476349619, + "loss": 0.9389, + "step": 3040 + }, + { + "epoch": 0.26, + "grad_norm": 0.7047688364982605, + "learning_rate": 0.00016908817715349217, + "loss": 0.9878, + "step": 3045 + }, + { + "epoch": 0.26, + "grad_norm": 0.5326054096221924, + "learning_rate": 0.00016899218502912578, + "loss": 0.8119, + "step": 3050 + }, + { + "epoch": 0.26, + "grad_norm": 0.6114538908004761, + "learning_rate": 0.00016889607143087516, + "loss": 1.0068, + "step": 3055 + }, + { + "epoch": 0.26, + "grad_norm": 0.6147065758705139, + "learning_rate": 0.0001687998365279666, + "loss": 1.0051, + "step": 3060 + }, + { + "epoch": 0.26, + "grad_norm": 0.6280825138092041, + "learning_rate": 0.00016870348048984, + "loss": 0.9755, + "step": 3065 + }, + { + "epoch": 0.26, + "grad_norm": 0.7574710845947266, + "learning_rate": 0.0001686070034861485, + "loss": 0.7589, + "step": 3070 + }, + { + "epoch": 0.26, + "grad_norm": 0.6277915835380554, + "learning_rate": 0.0001685104056867583, + "loss": 0.9468, + "step": 3075 + }, + { + "epoch": 0.26, + "grad_norm": 0.6549091935157776, + "learning_rate": 0.00016841368726174812, + "loss": 0.9123, + "step": 3080 + }, + { + "epoch": 0.26, + "grad_norm": 0.6778223514556885, + "learning_rate": 0.00016831684838140927, + "loss": 0.9532, + "step": 3085 + }, + { + "epoch": 0.26, + "grad_norm": 0.5268427133560181, + "learning_rate": 0.00016821988921624499, + "loss": 0.8781, + "step": 3090 + }, + { + "epoch": 0.26, + "grad_norm": 0.6663500070571899, + "learning_rate": 0.00016812280993697037, + "loss": 0.9117, + "step": 3095 + }, + { + "epoch": 0.26, + "grad_norm": 0.6826052665710449, + "learning_rate": 0.000168025610714512, + "loss": 0.965, + "step": 3100 + }, + { + "epoch": 0.26, + "grad_norm": 0.6532934904098511, + "learning_rate": 0.0001679282917200076, + "loss": 0.9292, + "step": 3105 + }, + { + "epoch": 0.26, + "grad_norm": 0.6209408640861511, + "learning_rate": 0.00016783085312480585, + "loss": 1.0588, + "step": 3110 + }, + { + "epoch": 0.26, + "grad_norm": 0.6071512699127197, + "learning_rate": 0.00016773329510046586, + "loss": 1.0328, + "step": 3115 + }, + { + "epoch": 0.26, + "grad_norm": 0.6279392242431641, + "learning_rate": 0.0001676356178187572, + "loss": 0.9559, + "step": 3120 + }, + { + "epoch": 0.26, + "grad_norm": 0.5424745678901672, + "learning_rate": 0.0001675378214516593, + "loss": 0.9844, + "step": 3125 + }, + { + "epoch": 0.26, + "grad_norm": 0.74262535572052, + "learning_rate": 0.00016743990617136128, + "loss": 1.0874, + "step": 3130 + }, + { + "epoch": 0.26, + "grad_norm": 0.5871965885162354, + "learning_rate": 0.00016734187215026167, + "loss": 1.0176, + "step": 3135 + }, + { + "epoch": 0.27, + "grad_norm": 0.5088030099868774, + "learning_rate": 0.000167243719560968, + "loss": 0.8139, + "step": 3140 + }, + { + "epoch": 0.27, + "grad_norm": 0.6284034252166748, + "learning_rate": 0.00016714544857629666, + "loss": 0.9315, + "step": 3145 + }, + { + "epoch": 0.27, + "grad_norm": 0.7349436283111572, + "learning_rate": 0.00016704705936927244, + "loss": 0.8626, + "step": 3150 + }, + { + "epoch": 0.27, + "grad_norm": 1.0078330039978027, + "learning_rate": 0.00016694855211312818, + "loss": 1.0201, + "step": 3155 + }, + { + "epoch": 0.27, + "grad_norm": 0.7132492065429688, + "learning_rate": 0.00016684992698130476, + "loss": 0.938, + "step": 3160 + }, + { + "epoch": 0.27, + "grad_norm": 0.6642095446586609, + "learning_rate": 0.00016675118414745052, + "loss": 0.897, + "step": 3165 + }, + { + "epoch": 0.27, + "grad_norm": 0.5875717401504517, + "learning_rate": 0.000166652323785421, + "loss": 0.9306, + "step": 3170 + }, + { + "epoch": 0.27, + "grad_norm": 0.5877960324287415, + "learning_rate": 0.00016655334606927865, + "loss": 0.9407, + "step": 3175 + }, + { + "epoch": 0.27, + "grad_norm": 0.7100574374198914, + "learning_rate": 0.00016645425117329268, + "loss": 1.0241, + "step": 3180 + }, + { + "epoch": 0.27, + "grad_norm": 0.564652144908905, + "learning_rate": 0.0001663550392719385, + "loss": 0.9337, + "step": 3185 + }, + { + "epoch": 0.27, + "grad_norm": 0.5287343263626099, + "learning_rate": 0.00016625571053989754, + "loss": 0.8411, + "step": 3190 + }, + { + "epoch": 0.27, + "grad_norm": 0.5721025466918945, + "learning_rate": 0.00016615626515205695, + "loss": 0.8796, + "step": 3195 + }, + { + "epoch": 0.27, + "grad_norm": 0.8463550209999084, + "learning_rate": 0.00016605670328350932, + "loss": 0.9973, + "step": 3200 + }, + { + "epoch": 0.27, + "grad_norm": 0.6993775367736816, + "learning_rate": 0.00016595702510955227, + "loss": 1.0342, + "step": 3205 + }, + { + "epoch": 0.27, + "grad_norm": 0.7446513175964355, + "learning_rate": 0.00016585723080568817, + "loss": 0.874, + "step": 3210 + }, + { + "epoch": 0.27, + "grad_norm": 0.6518576145172119, + "learning_rate": 0.00016575732054762397, + "loss": 0.8751, + "step": 3215 + }, + { + "epoch": 0.27, + "grad_norm": 0.6227362155914307, + "learning_rate": 0.00016565729451127067, + "loss": 0.8898, + "step": 3220 + }, + { + "epoch": 0.27, + "grad_norm": 0.6903109550476074, + "learning_rate": 0.00016555715287274318, + "loss": 0.8295, + "step": 3225 + }, + { + "epoch": 0.27, + "grad_norm": 0.5956553816795349, + "learning_rate": 0.00016545689580835994, + "loss": 0.9709, + "step": 3230 + }, + { + "epoch": 0.27, + "grad_norm": 0.633112370967865, + "learning_rate": 0.00016535652349464254, + "loss": 0.8406, + "step": 3235 + }, + { + "epoch": 0.27, + "grad_norm": 0.5930719375610352, + "learning_rate": 0.00016525603610831566, + "loss": 0.857, + "step": 3240 + }, + { + "epoch": 0.27, + "grad_norm": 0.717291533946991, + "learning_rate": 0.0001651554338263064, + "loss": 1.0251, + "step": 3245 + }, + { + "epoch": 0.27, + "grad_norm": 0.7223893404006958, + "learning_rate": 0.0001650547168257443, + "loss": 0.8636, + "step": 3250 + }, + { + "epoch": 0.27, + "grad_norm": 0.6896404027938843, + "learning_rate": 0.0001649538852839608, + "loss": 0.9304, + "step": 3255 + }, + { + "epoch": 0.28, + "grad_norm": 0.7068783640861511, + "learning_rate": 0.00016485293937848903, + "loss": 0.9012, + "step": 3260 + }, + { + "epoch": 0.28, + "grad_norm": 0.8663097620010376, + "learning_rate": 0.0001647518792870635, + "loss": 1.0295, + "step": 3265 + }, + { + "epoch": 0.28, + "grad_norm": 0.7634738683700562, + "learning_rate": 0.00016465070518761977, + "loss": 0.9565, + "step": 3270 + }, + { + "epoch": 0.28, + "grad_norm": 0.6867614388465881, + "learning_rate": 0.00016454941725829405, + "loss": 0.9136, + "step": 3275 + }, + { + "epoch": 0.28, + "grad_norm": 0.6606219410896301, + "learning_rate": 0.0001644480156774231, + "loss": 0.8807, + "step": 3280 + }, + { + "epoch": 0.28, + "grad_norm": 0.6782506108283997, + "learning_rate": 0.0001643465006235437, + "loss": 0.9843, + "step": 3285 + }, + { + "epoch": 0.28, + "grad_norm": 0.6689923405647278, + "learning_rate": 0.00016424487227539243, + "loss": 0.9309, + "step": 3290 + }, + { + "epoch": 0.28, + "grad_norm": 0.5364828705787659, + "learning_rate": 0.00016414313081190537, + "loss": 0.8986, + "step": 3295 + }, + { + "epoch": 0.28, + "grad_norm": 0.6805548071861267, + "learning_rate": 0.00016404127641221774, + "loss": 0.8756, + "step": 3300 + }, + { + "epoch": 0.28, + "grad_norm": 0.6093565821647644, + "learning_rate": 0.00016393930925566358, + "loss": 0.9902, + "step": 3305 + }, + { + "epoch": 0.28, + "grad_norm": 0.6762187480926514, + "learning_rate": 0.00016383722952177557, + "loss": 0.9862, + "step": 3310 + }, + { + "epoch": 0.28, + "grad_norm": 0.6144864559173584, + "learning_rate": 0.00016373503739028448, + "loss": 1.0414, + "step": 3315 + }, + { + "epoch": 0.28, + "grad_norm": 0.8119445443153381, + "learning_rate": 0.00016363273304111902, + "loss": 0.9946, + "step": 3320 + }, + { + "epoch": 0.28, + "grad_norm": 0.6898847818374634, + "learning_rate": 0.00016353031665440547, + "loss": 0.8921, + "step": 3325 + }, + { + "epoch": 0.28, + "grad_norm": 0.6010476350784302, + "learning_rate": 0.00016342778841046745, + "loss": 0.9369, + "step": 3330 + }, + { + "epoch": 0.28, + "grad_norm": 0.4694141149520874, + "learning_rate": 0.00016332514848982542, + "loss": 0.8308, + "step": 3335 + }, + { + "epoch": 0.28, + "grad_norm": 0.6777420043945312, + "learning_rate": 0.00016322239707319648, + "loss": 0.9351, + "step": 3340 + }, + { + "epoch": 0.28, + "grad_norm": 0.770054280757904, + "learning_rate": 0.00016311953434149413, + "loss": 0.8852, + "step": 3345 + }, + { + "epoch": 0.28, + "grad_norm": 0.6395520567893982, + "learning_rate": 0.0001630165604758278, + "loss": 0.8646, + "step": 3350 + }, + { + "epoch": 0.28, + "grad_norm": 0.6607555150985718, + "learning_rate": 0.00016291347565750255, + "loss": 0.8932, + "step": 3355 + }, + { + "epoch": 0.28, + "grad_norm": 0.5776294469833374, + "learning_rate": 0.00016281028006801887, + "loss": 0.9063, + "step": 3360 + }, + { + "epoch": 0.28, + "grad_norm": 0.5314105153083801, + "learning_rate": 0.0001627069738890723, + "loss": 0.8283, + "step": 3365 + }, + { + "epoch": 0.28, + "grad_norm": 0.6023496389389038, + "learning_rate": 0.00016260355730255297, + "loss": 0.8207, + "step": 3370 + }, + { + "epoch": 0.29, + "grad_norm": 0.714431881904602, + "learning_rate": 0.0001625000304905455, + "loss": 0.9015, + "step": 3375 + }, + { + "epoch": 0.29, + "grad_norm": 0.6060982346534729, + "learning_rate": 0.00016239639363532858, + "loss": 0.9208, + "step": 3380 + }, + { + "epoch": 0.29, + "grad_norm": 0.6184629797935486, + "learning_rate": 0.00016229264691937462, + "loss": 0.9031, + "step": 3385 + }, + { + "epoch": 0.29, + "grad_norm": 0.6056995391845703, + "learning_rate": 0.00016218879052534949, + "loss": 1.0191, + "step": 3390 + }, + { + "epoch": 0.29, + "grad_norm": 0.6588426232337952, + "learning_rate": 0.0001620848246361122, + "loss": 0.9528, + "step": 3395 + }, + { + "epoch": 0.29, + "grad_norm": 0.6407586932182312, + "learning_rate": 0.0001619807494347144, + "loss": 0.8039, + "step": 3400 + }, + { + "epoch": 0.29, + "grad_norm": 0.6428894400596619, + "learning_rate": 0.0001618765651044004, + "loss": 0.8567, + "step": 3405 + }, + { + "epoch": 0.29, + "grad_norm": 0.7074903249740601, + "learning_rate": 0.00016177227182860647, + "loss": 0.8954, + "step": 3410 + }, + { + "epoch": 0.29, + "grad_norm": 0.6333289742469788, + "learning_rate": 0.00016166786979096088, + "loss": 0.8844, + "step": 3415 + }, + { + "epoch": 0.29, + "grad_norm": 0.5690221786499023, + "learning_rate": 0.00016156335917528325, + "loss": 0.8021, + "step": 3420 + }, + { + "epoch": 0.29, + "grad_norm": 0.6526438593864441, + "learning_rate": 0.00016145874016558443, + "loss": 0.9289, + "step": 3425 + }, + { + "epoch": 0.29, + "grad_norm": 0.6392170786857605, + "learning_rate": 0.00016135401294606618, + "loss": 0.9413, + "step": 3430 + }, + { + "epoch": 0.29, + "grad_norm": 0.8693280816078186, + "learning_rate": 0.0001612491777011206, + "loss": 0.7817, + "step": 3435 + }, + { + "epoch": 0.29, + "grad_norm": 0.7930571436882019, + "learning_rate": 0.00016114423461533026, + "loss": 0.9878, + "step": 3440 + }, + { + "epoch": 0.29, + "grad_norm": 0.5343663692474365, + "learning_rate": 0.00016103918387346732, + "loss": 0.8839, + "step": 3445 + }, + { + "epoch": 0.29, + "grad_norm": 0.718345046043396, + "learning_rate": 0.00016093402566049367, + "loss": 1.0465, + "step": 3450 + }, + { + "epoch": 0.29, + "grad_norm": 0.5717639327049255, + "learning_rate": 0.0001608287601615604, + "loss": 0.8313, + "step": 3455 + }, + { + "epoch": 0.29, + "grad_norm": 0.6372225880622864, + "learning_rate": 0.00016072338756200746, + "loss": 0.9464, + "step": 3460 + }, + { + "epoch": 0.29, + "grad_norm": 0.5836284756660461, + "learning_rate": 0.00016061790804736332, + "loss": 0.9665, + "step": 3465 + }, + { + "epoch": 0.29, + "grad_norm": 0.7392018437385559, + "learning_rate": 0.00016051232180334485, + "loss": 0.8797, + "step": 3470 + }, + { + "epoch": 0.29, + "grad_norm": 0.6039625406265259, + "learning_rate": 0.00016040662901585674, + "loss": 0.9242, + "step": 3475 + }, + { + "epoch": 0.29, + "grad_norm": 0.5853233337402344, + "learning_rate": 0.00016030082987099123, + "loss": 0.9494, + "step": 3480 + }, + { + "epoch": 0.29, + "grad_norm": 0.6518459320068359, + "learning_rate": 0.00016019492455502787, + "loss": 0.9992, + "step": 3485 + }, + { + "epoch": 0.29, + "grad_norm": 0.6406946182250977, + "learning_rate": 0.00016008891325443317, + "loss": 0.9105, + "step": 3490 + }, + { + "epoch": 0.3, + "grad_norm": 0.7694531083106995, + "learning_rate": 0.0001599827961558602, + "loss": 0.9901, + "step": 3495 + }, + { + "epoch": 0.3, + "grad_norm": 0.8471524715423584, + "learning_rate": 0.00015987657344614835, + "loss": 0.8733, + "step": 3500 + }, + { + "epoch": 0.3, + "grad_norm": 0.6725741028785706, + "learning_rate": 0.0001597702453123229, + "loss": 0.9278, + "step": 3505 + }, + { + "epoch": 0.3, + "grad_norm": 0.6549644470214844, + "learning_rate": 0.00015966381194159482, + "loss": 0.9492, + "step": 3510 + }, + { + "epoch": 0.3, + "grad_norm": 0.6407148838043213, + "learning_rate": 0.0001595572735213603, + "loss": 0.8639, + "step": 3515 + }, + { + "epoch": 0.3, + "grad_norm": 0.677624523639679, + "learning_rate": 0.00015945063023920056, + "loss": 1.0831, + "step": 3520 + }, + { + "epoch": 0.3, + "grad_norm": 0.5648879408836365, + "learning_rate": 0.00015934388228288138, + "loss": 0.8247, + "step": 3525 + }, + { + "epoch": 0.3, + "grad_norm": 0.7800204753875732, + "learning_rate": 0.00015923702984035288, + "loss": 0.9129, + "step": 3530 + }, + { + "epoch": 0.3, + "grad_norm": 0.624596357345581, + "learning_rate": 0.00015913007309974916, + "loss": 0.9534, + "step": 3535 + }, + { + "epoch": 0.3, + "grad_norm": 0.7033581733703613, + "learning_rate": 0.00015902301224938792, + "loss": 0.7902, + "step": 3540 + }, + { + "epoch": 0.3, + "grad_norm": 0.7332955598831177, + "learning_rate": 0.00015891584747777018, + "loss": 0.9685, + "step": 3545 + }, + { + "epoch": 0.3, + "grad_norm": 0.7487533092498779, + "learning_rate": 0.00015880857897357994, + "loss": 0.9143, + "step": 3550 + }, + { + "epoch": 0.3, + "grad_norm": 0.6748859286308289, + "learning_rate": 0.00015870120692568383, + "loss": 0.9375, + "step": 3555 + }, + { + "epoch": 0.3, + "grad_norm": 0.6206909418106079, + "learning_rate": 0.00015859373152313078, + "loss": 0.9535, + "step": 3560 + }, + { + "epoch": 0.3, + "grad_norm": 0.6313974857330322, + "learning_rate": 0.00015848615295515175, + "loss": 0.8325, + "step": 3565 + }, + { + "epoch": 0.3, + "grad_norm": 0.5907221436500549, + "learning_rate": 0.00015837847141115927, + "loss": 0.8503, + "step": 3570 + }, + { + "epoch": 0.3, + "grad_norm": 0.6586530804634094, + "learning_rate": 0.00015827068708074724, + "loss": 0.8813, + "step": 3575 + }, + { + "epoch": 0.3, + "grad_norm": 0.8002377152442932, + "learning_rate": 0.00015816280015369045, + "loss": 0.9446, + "step": 3580 + }, + { + "epoch": 0.3, + "grad_norm": 0.6264663934707642, + "learning_rate": 0.00015805481081994444, + "loss": 1.0207, + "step": 3585 + }, + { + "epoch": 0.3, + "grad_norm": 0.6522766351699829, + "learning_rate": 0.00015794671926964497, + "loss": 0.8568, + "step": 3590 + }, + { + "epoch": 0.3, + "grad_norm": 0.5466406941413879, + "learning_rate": 0.00015783852569310785, + "loss": 0.8827, + "step": 3595 + }, + { + "epoch": 0.3, + "grad_norm": 0.5301129817962646, + "learning_rate": 0.00015773023028082842, + "loss": 0.8395, + "step": 3600 + }, + { + "epoch": 0.3, + "grad_norm": 0.6852278709411621, + "learning_rate": 0.00015762183322348144, + "loss": 0.9331, + "step": 3605 + }, + { + "epoch": 0.3, + "grad_norm": 0.7991815209388733, + "learning_rate": 0.0001575133347119205, + "loss": 1.0256, + "step": 3610 + }, + { + "epoch": 0.31, + "grad_norm": 0.6975600123405457, + "learning_rate": 0.00015740473493717802, + "loss": 0.8432, + "step": 3615 + }, + { + "epoch": 0.31, + "grad_norm": 0.5848995447158813, + "learning_rate": 0.00015729603409046447, + "loss": 0.9775, + "step": 3620 + }, + { + "epoch": 0.31, + "grad_norm": 0.5751985311508179, + "learning_rate": 0.00015718723236316846, + "loss": 0.9224, + "step": 3625 + }, + { + "epoch": 0.31, + "grad_norm": 0.7802300453186035, + "learning_rate": 0.0001570783299468562, + "loss": 0.9733, + "step": 3630 + }, + { + "epoch": 0.31, + "grad_norm": 0.5258293747901917, + "learning_rate": 0.000156969327033271, + "loss": 0.8185, + "step": 3635 + }, + { + "epoch": 0.31, + "grad_norm": 0.6269134283065796, + "learning_rate": 0.00015686022381433337, + "loss": 0.899, + "step": 3640 + }, + { + "epoch": 0.31, + "grad_norm": 0.5452288389205933, + "learning_rate": 0.00015675102048214027, + "loss": 0.8696, + "step": 3645 + }, + { + "epoch": 0.31, + "grad_norm": 0.6281285285949707, + "learning_rate": 0.000156641717228965, + "loss": 0.9534, + "step": 3650 + }, + { + "epoch": 0.31, + "grad_norm": 0.8480727076530457, + "learning_rate": 0.00015653231424725671, + "loss": 0.9262, + "step": 3655 + }, + { + "epoch": 0.31, + "grad_norm": 0.603663444519043, + "learning_rate": 0.00015642281172964024, + "loss": 0.8537, + "step": 3660 + }, + { + "epoch": 0.31, + "grad_norm": 0.5591794848442078, + "learning_rate": 0.0001563132098689156, + "loss": 0.8872, + "step": 3665 + }, + { + "epoch": 0.31, + "grad_norm": 0.6126306056976318, + "learning_rate": 0.00015620350885805774, + "loss": 0.833, + "step": 3670 + }, + { + "epoch": 0.31, + "grad_norm": 0.6492719054222107, + "learning_rate": 0.00015609370889021617, + "loss": 0.923, + "step": 3675 + }, + { + "epoch": 0.31, + "grad_norm": 0.6563048958778381, + "learning_rate": 0.00015598381015871472, + "loss": 0.8586, + "step": 3680 + }, + { + "epoch": 0.31, + "grad_norm": 0.602532684803009, + "learning_rate": 0.000155873812857051, + "loss": 0.81, + "step": 3685 + }, + { + "epoch": 0.31, + "grad_norm": 0.5984077453613281, + "learning_rate": 0.0001557637171788962, + "loss": 0.8112, + "step": 3690 + }, + { + "epoch": 0.31, + "grad_norm": 0.6583290100097656, + "learning_rate": 0.00015565352331809473, + "loss": 0.8635, + "step": 3695 + }, + { + "epoch": 0.31, + "grad_norm": 0.6902117729187012, + "learning_rate": 0.0001555432314686639, + "loss": 0.8655, + "step": 3700 + }, + { + "epoch": 0.31, + "grad_norm": 0.6870086789131165, + "learning_rate": 0.00015543284182479352, + "loss": 0.9489, + "step": 3705 + }, + { + "epoch": 0.31, + "grad_norm": 0.6190910935401917, + "learning_rate": 0.00015532235458084554, + "loss": 0.8847, + "step": 3710 + }, + { + "epoch": 0.31, + "grad_norm": 0.6703643798828125, + "learning_rate": 0.00015521176993135388, + "loss": 0.9021, + "step": 3715 + }, + { + "epoch": 0.31, + "grad_norm": 0.6639297604560852, + "learning_rate": 0.00015510108807102383, + "loss": 0.9294, + "step": 3720 + }, + { + "epoch": 0.31, + "grad_norm": 0.6397198438644409, + "learning_rate": 0.00015499030919473186, + "loss": 0.9062, + "step": 3725 + }, + { + "epoch": 0.32, + "grad_norm": 0.7036738991737366, + "learning_rate": 0.00015487943349752533, + "loss": 0.922, + "step": 3730 + }, + { + "epoch": 0.32, + "grad_norm": 0.5168409943580627, + "learning_rate": 0.00015476846117462204, + "loss": 0.9515, + "step": 3735 + }, + { + "epoch": 0.32, + "grad_norm": 0.6692652106285095, + "learning_rate": 0.00015465739242140987, + "loss": 0.8807, + "step": 3740 + }, + { + "epoch": 0.32, + "grad_norm": 0.6560524702072144, + "learning_rate": 0.0001545462274334465, + "loss": 0.9095, + "step": 3745 + }, + { + "epoch": 0.32, + "grad_norm": 0.5958166122436523, + "learning_rate": 0.00015443496640645915, + "loss": 0.9193, + "step": 3750 + }, + { + "epoch": 0.32, + "grad_norm": 0.6260406970977783, + "learning_rate": 0.00015432360953634397, + "loss": 0.9808, + "step": 3755 + }, + { + "epoch": 0.32, + "grad_norm": 0.8512209057807922, + "learning_rate": 0.00015421215701916596, + "loss": 0.964, + "step": 3760 + }, + { + "epoch": 0.32, + "grad_norm": 0.8193923234939575, + "learning_rate": 0.00015410060905115852, + "loss": 0.8987, + "step": 3765 + }, + { + "epoch": 0.32, + "grad_norm": 0.6861628890037537, + "learning_rate": 0.0001539889658287231, + "loss": 0.9304, + "step": 3770 + }, + { + "epoch": 0.32, + "grad_norm": 0.6048279404640198, + "learning_rate": 0.00015387722754842885, + "loss": 0.9073, + "step": 3775 + }, + { + "epoch": 0.32, + "grad_norm": 0.7512372732162476, + "learning_rate": 0.0001537653944070123, + "loss": 0.979, + "step": 3780 + }, + { + "epoch": 0.32, + "grad_norm": 0.6055505871772766, + "learning_rate": 0.00015365346660137702, + "loss": 0.9774, + "step": 3785 + }, + { + "epoch": 0.32, + "grad_norm": 0.6355717778205872, + "learning_rate": 0.0001535414443285932, + "loss": 0.8838, + "step": 3790 + }, + { + "epoch": 0.32, + "grad_norm": 0.7207841873168945, + "learning_rate": 0.0001534293277858974, + "loss": 0.8993, + "step": 3795 + }, + { + "epoch": 0.32, + "grad_norm": 0.6495758891105652, + "learning_rate": 0.00015331711717069216, + "loss": 0.9091, + "step": 3800 + }, + { + "epoch": 0.32, + "grad_norm": 0.6666922569274902, + "learning_rate": 0.0001532048126805456, + "loss": 1.0268, + "step": 3805 + }, + { + "epoch": 0.32, + "grad_norm": 0.6077538132667542, + "learning_rate": 0.00015309241451319126, + "loss": 0.8616, + "step": 3810 + }, + { + "epoch": 0.32, + "grad_norm": 0.6129001975059509, + "learning_rate": 0.00015297992286652745, + "loss": 0.93, + "step": 3815 + }, + { + "epoch": 0.32, + "grad_norm": 0.7638548612594604, + "learning_rate": 0.0001528673379386172, + "loss": 1.01, + "step": 3820 + }, + { + "epoch": 0.32, + "grad_norm": 0.5867222547531128, + "learning_rate": 0.0001527546599276876, + "loss": 0.9305, + "step": 3825 + }, + { + "epoch": 0.32, + "grad_norm": 0.5881127119064331, + "learning_rate": 0.00015264188903212991, + "loss": 0.83, + "step": 3830 + }, + { + "epoch": 0.32, + "grad_norm": 0.8406863808631897, + "learning_rate": 0.00015252902545049866, + "loss": 0.9555, + "step": 3835 + }, + { + "epoch": 0.32, + "grad_norm": 0.6948667764663696, + "learning_rate": 0.00015241606938151177, + "loss": 0.8267, + "step": 3840 + }, + { + "epoch": 0.32, + "grad_norm": 0.7978765368461609, + "learning_rate": 0.00015230302102404986, + "loss": 0.9554, + "step": 3845 + }, + { + "epoch": 0.33, + "grad_norm": 0.6019850969314575, + "learning_rate": 0.0001521898805771561, + "loss": 1.0175, + "step": 3850 + }, + { + "epoch": 0.33, + "grad_norm": 0.6698493957519531, + "learning_rate": 0.0001520766482400358, + "loss": 0.8467, + "step": 3855 + }, + { + "epoch": 0.33, + "grad_norm": 0.5901235342025757, + "learning_rate": 0.0001519633242120561, + "loss": 0.8262, + "step": 3860 + }, + { + "epoch": 0.33, + "grad_norm": 0.6284335851669312, + "learning_rate": 0.0001518499086927455, + "loss": 0.6594, + "step": 3865 + }, + { + "epoch": 0.33, + "grad_norm": 0.6574525833129883, + "learning_rate": 0.00015173640188179363, + "loss": 0.8159, + "step": 3870 + }, + { + "epoch": 0.33, + "grad_norm": 0.6073183417320251, + "learning_rate": 0.00015162280397905086, + "loss": 0.9547, + "step": 3875 + }, + { + "epoch": 0.33, + "grad_norm": 0.5954556465148926, + "learning_rate": 0.00015150911518452793, + "loss": 0.9168, + "step": 3880 + }, + { + "epoch": 0.33, + "grad_norm": 0.734402596950531, + "learning_rate": 0.00015139533569839565, + "loss": 0.8784, + "step": 3885 + }, + { + "epoch": 0.33, + "grad_norm": 0.7888867259025574, + "learning_rate": 0.00015128146572098442, + "loss": 0.9391, + "step": 3890 + }, + { + "epoch": 0.33, + "grad_norm": 0.5263239145278931, + "learning_rate": 0.00015116750545278408, + "loss": 0.8745, + "step": 3895 + }, + { + "epoch": 0.33, + "grad_norm": 0.7067481279373169, + "learning_rate": 0.00015105345509444336, + "loss": 0.8533, + "step": 3900 + }, + { + "epoch": 0.33, + "grad_norm": 0.6724989414215088, + "learning_rate": 0.00015093931484676967, + "loss": 0.889, + "step": 3905 + }, + { + "epoch": 0.33, + "grad_norm": 0.6885275840759277, + "learning_rate": 0.00015082508491072864, + "loss": 0.9317, + "step": 3910 + }, + { + "epoch": 0.33, + "grad_norm": 0.6939470767974854, + "learning_rate": 0.00015071076548744386, + "loss": 0.9193, + "step": 3915 + }, + { + "epoch": 0.33, + "grad_norm": 0.781311571598053, + "learning_rate": 0.00015059635677819636, + "loss": 1.0144, + "step": 3920 + }, + { + "epoch": 0.33, + "grad_norm": 0.6873858571052551, + "learning_rate": 0.00015048185898442463, + "loss": 0.8466, + "step": 3925 + }, + { + "epoch": 0.33, + "grad_norm": 0.7110685110092163, + "learning_rate": 0.00015036727230772367, + "loss": 0.9595, + "step": 3930 + }, + { + "epoch": 0.33, + "grad_norm": 0.7505444288253784, + "learning_rate": 0.00015025259694984524, + "loss": 0.9217, + "step": 3935 + }, + { + "epoch": 0.33, + "grad_norm": 0.5211585164070129, + "learning_rate": 0.0001501378331126972, + "loss": 0.9427, + "step": 3940 + }, + { + "epoch": 0.33, + "grad_norm": 0.6653823256492615, + "learning_rate": 0.00015002298099834303, + "loss": 0.8937, + "step": 3945 + }, + { + "epoch": 0.33, + "grad_norm": 0.7119855880737305, + "learning_rate": 0.00014990804080900185, + "loss": 0.9794, + "step": 3950 + }, + { + "epoch": 0.33, + "grad_norm": 0.6810693144798279, + "learning_rate": 0.0001497930127470477, + "loss": 1.0353, + "step": 3955 + }, + { + "epoch": 0.33, + "grad_norm": 0.5851398706436157, + "learning_rate": 0.00014967789701500944, + "loss": 0.9098, + "step": 3960 + }, + { + "epoch": 0.33, + "grad_norm": 0.7086507678031921, + "learning_rate": 0.00014956269381557024, + "loss": 0.8536, + "step": 3965 + }, + { + "epoch": 0.34, + "grad_norm": 0.595748782157898, + "learning_rate": 0.00014944740335156724, + "loss": 0.9451, + "step": 3970 + }, + { + "epoch": 0.34, + "grad_norm": 0.7851556539535522, + "learning_rate": 0.0001493320258259913, + "loss": 1.0031, + "step": 3975 + }, + { + "epoch": 0.34, + "grad_norm": 0.6311838626861572, + "learning_rate": 0.00014921656144198652, + "loss": 0.8833, + "step": 3980 + }, + { + "epoch": 0.34, + "grad_norm": 0.5700263977050781, + "learning_rate": 0.00014910101040284992, + "loss": 0.9191, + "step": 3985 + }, + { + "epoch": 0.34, + "grad_norm": 0.7289249897003174, + "learning_rate": 0.00014898537291203117, + "loss": 0.849, + "step": 3990 + }, + { + "epoch": 0.34, + "grad_norm": 0.6655135154724121, + "learning_rate": 0.00014886964917313207, + "loss": 1.0639, + "step": 3995 + }, + { + "epoch": 0.34, + "grad_norm": 0.6108108758926392, + "learning_rate": 0.00014875383938990627, + "loss": 0.8746, + "step": 4000 + }, + { + "epoch": 0.34, + "grad_norm": 0.7380669713020325, + "learning_rate": 0.00014863794376625904, + "loss": 0.8703, + "step": 4005 + }, + { + "epoch": 0.34, + "grad_norm": 0.6507090330123901, + "learning_rate": 0.00014852196250624662, + "loss": 0.9557, + "step": 4010 + }, + { + "epoch": 0.34, + "grad_norm": 0.6174106001853943, + "learning_rate": 0.00014840589581407616, + "loss": 0.9274, + "step": 4015 + }, + { + "epoch": 0.34, + "grad_norm": 0.6442661881446838, + "learning_rate": 0.00014828974389410516, + "loss": 0.9409, + "step": 4020 + }, + { + "epoch": 0.34, + "grad_norm": 0.8616316318511963, + "learning_rate": 0.0001481735069508412, + "loss": 0.8272, + "step": 4025 + }, + { + "epoch": 0.34, + "grad_norm": 0.6512661576271057, + "learning_rate": 0.00014805718518894157, + "loss": 0.8498, + "step": 4030 + }, + { + "epoch": 0.34, + "grad_norm": 0.6582963466644287, + "learning_rate": 0.00014794077881321292, + "loss": 0.9465, + "step": 4035 + }, + { + "epoch": 0.34, + "grad_norm": 0.6269610524177551, + "learning_rate": 0.0001478242880286108, + "loss": 0.9631, + "step": 4040 + }, + { + "epoch": 0.34, + "grad_norm": 0.5856170058250427, + "learning_rate": 0.00014770771304023942, + "loss": 0.8131, + "step": 4045 + }, + { + "epoch": 0.34, + "grad_norm": 0.5877073407173157, + "learning_rate": 0.00014759105405335132, + "loss": 0.8252, + "step": 4050 + }, + { + "epoch": 0.34, + "grad_norm": 0.6415336728096008, + "learning_rate": 0.00014747431127334678, + "loss": 0.9773, + "step": 4055 + }, + { + "epoch": 0.34, + "grad_norm": 0.7174964547157288, + "learning_rate": 0.0001473574849057738, + "loss": 1.0136, + "step": 4060 + }, + { + "epoch": 0.34, + "grad_norm": 0.5549941062927246, + "learning_rate": 0.00014724057515632738, + "loss": 0.9079, + "step": 4065 + }, + { + "epoch": 0.34, + "grad_norm": 0.7842785716056824, + "learning_rate": 0.00014712358223084942, + "loss": 0.8031, + "step": 4070 + }, + { + "epoch": 0.34, + "grad_norm": 0.5719451308250427, + "learning_rate": 0.00014700650633532827, + "loss": 0.8293, + "step": 4075 + }, + { + "epoch": 0.34, + "grad_norm": 0.742992639541626, + "learning_rate": 0.00014688934767589833, + "loss": 0.9294, + "step": 4080 + }, + { + "epoch": 0.35, + "grad_norm": 0.608527421951294, + "learning_rate": 0.00014677210645883977, + "loss": 0.9394, + "step": 4085 + }, + { + "epoch": 0.35, + "grad_norm": 0.533574104309082, + "learning_rate": 0.00014665478289057805, + "loss": 0.8738, + "step": 4090 + }, + { + "epoch": 0.35, + "grad_norm": 0.7454668283462524, + "learning_rate": 0.00014653737717768367, + "loss": 0.983, + "step": 4095 + }, + { + "epoch": 0.35, + "grad_norm": 0.6168544888496399, + "learning_rate": 0.00014641988952687177, + "loss": 0.9008, + "step": 4100 + }, + { + "epoch": 0.35, + "grad_norm": 0.6677118539810181, + "learning_rate": 0.0001463023201450017, + "loss": 0.8957, + "step": 4105 + }, + { + "epoch": 0.35, + "grad_norm": 0.6591238379478455, + "learning_rate": 0.00014618466923907678, + "loss": 0.9938, + "step": 4110 + }, + { + "epoch": 0.35, + "grad_norm": 0.5012399554252625, + "learning_rate": 0.00014606693701624385, + "loss": 0.855, + "step": 4115 + }, + { + "epoch": 0.35, + "grad_norm": 0.6333223581314087, + "learning_rate": 0.0001459491236837929, + "loss": 0.8756, + "step": 4120 + }, + { + "epoch": 0.35, + "grad_norm": 0.513618528842926, + "learning_rate": 0.00014583122944915672, + "loss": 0.9169, + "step": 4125 + }, + { + "epoch": 0.35, + "grad_norm": 0.6848164796829224, + "learning_rate": 0.00014571325451991066, + "loss": 0.9924, + "step": 4130 + }, + { + "epoch": 0.35, + "grad_norm": 0.6335467100143433, + "learning_rate": 0.00014559519910377193, + "loss": 0.7958, + "step": 4135 + }, + { + "epoch": 0.35, + "grad_norm": 0.6131249666213989, + "learning_rate": 0.0001454770634085997, + "loss": 0.9621, + "step": 4140 + }, + { + "epoch": 0.35, + "grad_norm": 0.6990880370140076, + "learning_rate": 0.00014535884764239424, + "loss": 1.0121, + "step": 4145 + }, + { + "epoch": 0.35, + "grad_norm": 0.7077513933181763, + "learning_rate": 0.00014524055201329704, + "loss": 0.9293, + "step": 4150 + }, + { + "epoch": 0.35, + "grad_norm": 0.533998966217041, + "learning_rate": 0.00014512217672959003, + "loss": 0.8922, + "step": 4155 + }, + { + "epoch": 0.35, + "grad_norm": 0.6833199858665466, + "learning_rate": 0.00014500372199969546, + "loss": 0.9573, + "step": 4160 + }, + { + "epoch": 0.35, + "grad_norm": 0.6293190717697144, + "learning_rate": 0.00014488518803217542, + "loss": 1.0084, + "step": 4165 + }, + { + "epoch": 0.35, + "grad_norm": 0.6474047303199768, + "learning_rate": 0.0001447665750357316, + "loss": 0.8068, + "step": 4170 + }, + { + "epoch": 0.35, + "grad_norm": 0.7276792526245117, + "learning_rate": 0.00014464788321920472, + "loss": 0.9225, + "step": 4175 + }, + { + "epoch": 0.35, + "grad_norm": 0.63521808385849, + "learning_rate": 0.00014452911279157435, + "loss": 0.8718, + "step": 4180 + }, + { + "epoch": 0.35, + "grad_norm": 0.720392644405365, + "learning_rate": 0.0001444102639619585, + "loss": 0.9408, + "step": 4185 + }, + { + "epoch": 0.35, + "grad_norm": 0.707635223865509, + "learning_rate": 0.00014429133693961304, + "loss": 0.9199, + "step": 4190 + }, + { + "epoch": 0.35, + "grad_norm": 0.5724509954452515, + "learning_rate": 0.0001441723319339318, + "loss": 1.0271, + "step": 4195 + }, + { + "epoch": 0.35, + "grad_norm": 0.6615992784500122, + "learning_rate": 0.00014405324915444572, + "loss": 0.858, + "step": 4200 + }, + { + "epoch": 0.36, + "grad_norm": 0.533299446105957, + "learning_rate": 0.00014393408881082265, + "loss": 0.9178, + "step": 4205 + }, + { + "epoch": 0.36, + "grad_norm": 0.6693369746208191, + "learning_rate": 0.00014381485111286714, + "loss": 0.8711, + "step": 4210 + }, + { + "epoch": 0.36, + "grad_norm": 0.6453130841255188, + "learning_rate": 0.00014369553627051982, + "loss": 0.9083, + "step": 4215 + }, + { + "epoch": 0.36, + "grad_norm": 0.7167688608169556, + "learning_rate": 0.0001435761444938573, + "loss": 0.9752, + "step": 4220 + }, + { + "epoch": 0.36, + "grad_norm": 0.602178156375885, + "learning_rate": 0.00014345667599309142, + "loss": 0.9038, + "step": 4225 + }, + { + "epoch": 0.36, + "grad_norm": 0.6880233883857727, + "learning_rate": 0.0001433371309785693, + "loss": 0.9242, + "step": 4230 + }, + { + "epoch": 0.36, + "grad_norm": 0.8023783564567566, + "learning_rate": 0.0001432175096607727, + "loss": 0.9589, + "step": 4235 + }, + { + "epoch": 0.36, + "grad_norm": 0.6405156254768372, + "learning_rate": 0.00014309781225031778, + "loss": 0.9669, + "step": 4240 + }, + { + "epoch": 0.36, + "grad_norm": 0.47613513469696045, + "learning_rate": 0.00014297803895795455, + "loss": 0.9187, + "step": 4245 + }, + { + "epoch": 0.36, + "grad_norm": 0.7372537851333618, + "learning_rate": 0.00014285818999456676, + "loss": 0.8633, + "step": 4250 + }, + { + "epoch": 0.36, + "grad_norm": 0.7306585311889648, + "learning_rate": 0.0001427382655711713, + "loss": 0.96, + "step": 4255 + }, + { + "epoch": 0.36, + "grad_norm": 0.6008553504943848, + "learning_rate": 0.000142618265898918, + "loss": 0.8579, + "step": 4260 + }, + { + "epoch": 0.36, + "grad_norm": 0.6557427048683167, + "learning_rate": 0.00014249819118908915, + "loss": 0.9197, + "step": 4265 + }, + { + "epoch": 0.36, + "grad_norm": 0.6179531216621399, + "learning_rate": 0.00014237804165309913, + "loss": 0.9992, + "step": 4270 + }, + { + "epoch": 0.36, + "grad_norm": 0.5417391061782837, + "learning_rate": 0.0001422578175024941, + "loss": 0.9273, + "step": 4275 + }, + { + "epoch": 0.36, + "grad_norm": 0.5395805835723877, + "learning_rate": 0.00014213751894895154, + "loss": 0.9608, + "step": 4280 + }, + { + "epoch": 0.36, + "grad_norm": 0.6742745041847229, + "learning_rate": 0.00014201714620428, + "loss": 0.9618, + "step": 4285 + }, + { + "epoch": 0.36, + "grad_norm": 0.7993083596229553, + "learning_rate": 0.00014189669948041863, + "loss": 0.8944, + "step": 4290 + }, + { + "epoch": 0.36, + "grad_norm": 1.0645782947540283, + "learning_rate": 0.00014177617898943683, + "loss": 0.8406, + "step": 4295 + }, + { + "epoch": 0.36, + "grad_norm": 0.6639379262924194, + "learning_rate": 0.00014165558494353385, + "loss": 0.9971, + "step": 4300 + }, + { + "epoch": 0.36, + "grad_norm": 0.6995848417282104, + "learning_rate": 0.00014153491755503853, + "loss": 1.0182, + "step": 4305 + }, + { + "epoch": 0.36, + "grad_norm": 0.762792706489563, + "learning_rate": 0.00014141417703640875, + "loss": 0.98, + "step": 4310 + }, + { + "epoch": 0.36, + "grad_norm": 0.7014017701148987, + "learning_rate": 0.0001412933636002312, + "loss": 0.895, + "step": 4315 + }, + { + "epoch": 0.36, + "grad_norm": 0.6615109443664551, + "learning_rate": 0.00014117247745922101, + "loss": 0.9504, + "step": 4320 + }, + { + "epoch": 0.37, + "grad_norm": 0.7112694978713989, + "learning_rate": 0.00014105151882622122, + "loss": 0.8517, + "step": 4325 + }, + { + "epoch": 0.37, + "grad_norm": 0.6270447373390198, + "learning_rate": 0.00014093048791420252, + "loss": 0.7386, + "step": 4330 + }, + { + "epoch": 0.37, + "grad_norm": 0.7246109247207642, + "learning_rate": 0.00014080938493626286, + "loss": 0.9284, + "step": 4335 + }, + { + "epoch": 0.37, + "grad_norm": 0.7187846899032593, + "learning_rate": 0.00014068821010562718, + "loss": 0.9285, + "step": 4340 + }, + { + "epoch": 0.37, + "grad_norm": 0.8331519365310669, + "learning_rate": 0.00014056696363564682, + "loss": 0.9343, + "step": 4345 + }, + { + "epoch": 0.37, + "grad_norm": 0.6256982684135437, + "learning_rate": 0.00014044564573979925, + "loss": 0.9124, + "step": 4350 + }, + { + "epoch": 0.37, + "grad_norm": 0.7178502678871155, + "learning_rate": 0.0001403242566316878, + "loss": 0.9675, + "step": 4355 + }, + { + "epoch": 0.37, + "grad_norm": 0.5572247505187988, + "learning_rate": 0.0001402027965250411, + "loss": 0.9588, + "step": 4360 + }, + { + "epoch": 0.37, + "grad_norm": 0.7772032022476196, + "learning_rate": 0.00014008126563371274, + "loss": 0.9215, + "step": 4365 + }, + { + "epoch": 0.37, + "grad_norm": 0.7644178867340088, + "learning_rate": 0.0001399596641716811, + "loss": 1.0078, + "step": 4370 + }, + { + "epoch": 0.37, + "grad_norm": 0.5885058641433716, + "learning_rate": 0.0001398379923530487, + "loss": 0.8546, + "step": 4375 + }, + { + "epoch": 0.37, + "grad_norm": 0.6888991594314575, + "learning_rate": 0.0001397162503920419, + "loss": 0.9105, + "step": 4380 + }, + { + "epoch": 0.37, + "grad_norm": 0.7413958311080933, + "learning_rate": 0.00013959443850301061, + "loss": 0.8351, + "step": 4385 + }, + { + "epoch": 0.37, + "grad_norm": 0.6878228783607483, + "learning_rate": 0.00013947255690042795, + "loss": 0.9956, + "step": 4390 + }, + { + "epoch": 0.37, + "grad_norm": 0.7583858370780945, + "learning_rate": 0.00013935060579888962, + "loss": 0.8068, + "step": 4395 + }, + { + "epoch": 0.37, + "grad_norm": 0.5721178650856018, + "learning_rate": 0.00013922858541311382, + "loss": 0.8582, + "step": 4400 + }, + { + "epoch": 0.37, + "grad_norm": 0.6023159623146057, + "learning_rate": 0.00013910649595794058, + "loss": 0.9859, + "step": 4405 + }, + { + "epoch": 0.37, + "grad_norm": 0.5773957967758179, + "learning_rate": 0.00013898433764833178, + "loss": 0.8051, + "step": 4410 + }, + { + "epoch": 0.37, + "grad_norm": 0.5671504735946655, + "learning_rate": 0.00013886211069937034, + "loss": 1.0797, + "step": 4415 + }, + { + "epoch": 0.37, + "grad_norm": 0.6403985619544983, + "learning_rate": 0.00013873981532626007, + "loss": 0.8315, + "step": 4420 + }, + { + "epoch": 0.37, + "grad_norm": 0.9393098950386047, + "learning_rate": 0.00013861745174432525, + "loss": 0.9213, + "step": 4425 + }, + { + "epoch": 0.37, + "grad_norm": 0.6221628785133362, + "learning_rate": 0.00013849502016901035, + "loss": 0.8995, + "step": 4430 + }, + { + "epoch": 0.37, + "grad_norm": 0.6712929606437683, + "learning_rate": 0.00013837252081587938, + "loss": 0.9216, + "step": 4435 + }, + { + "epoch": 0.38, + "grad_norm": 0.7631377577781677, + "learning_rate": 0.0001382499539006159, + "loss": 0.8936, + "step": 4440 + }, + { + "epoch": 0.38, + "grad_norm": 0.6782535314559937, + "learning_rate": 0.00013812731963902224, + "loss": 0.8619, + "step": 4445 + }, + { + "epoch": 0.38, + "grad_norm": 0.8166306614875793, + "learning_rate": 0.0001380046182470194, + "loss": 0.8772, + "step": 4450 + }, + { + "epoch": 0.38, + "grad_norm": 0.6431621313095093, + "learning_rate": 0.0001378818499406465, + "loss": 0.9215, + "step": 4455 + }, + { + "epoch": 0.38, + "grad_norm": 0.6358498930931091, + "learning_rate": 0.00013775901493606063, + "loss": 0.8781, + "step": 4460 + }, + { + "epoch": 0.38, + "grad_norm": 0.5732253193855286, + "learning_rate": 0.0001376361134495361, + "loss": 0.9351, + "step": 4465 + }, + { + "epoch": 0.38, + "grad_norm": 0.7262488007545471, + "learning_rate": 0.0001375131456974645, + "loss": 0.8956, + "step": 4470 + }, + { + "epoch": 0.38, + "grad_norm": 0.7199252843856812, + "learning_rate": 0.0001373901118963539, + "loss": 0.952, + "step": 4475 + }, + { + "epoch": 0.38, + "grad_norm": 0.5929174423217773, + "learning_rate": 0.00013726701226282885, + "loss": 0.9268, + "step": 4480 + }, + { + "epoch": 0.38, + "grad_norm": 0.8617218136787415, + "learning_rate": 0.00013714384701362956, + "loss": 0.9656, + "step": 4485 + }, + { + "epoch": 0.38, + "grad_norm": 0.5977094173431396, + "learning_rate": 0.000137020616365612, + "loss": 0.7784, + "step": 4490 + }, + { + "epoch": 0.38, + "grad_norm": 0.7792064547538757, + "learning_rate": 0.0001368973205357472, + "loss": 0.932, + "step": 4495 + }, + { + "epoch": 0.38, + "grad_norm": 0.7215954065322876, + "learning_rate": 0.00013677395974112094, + "loss": 0.9451, + "step": 4500 + }, + { + "epoch": 0.38, + "grad_norm": 0.8179596066474915, + "learning_rate": 0.00013665053419893337, + "loss": 0.8776, + "step": 4505 + }, + { + "epoch": 0.38, + "grad_norm": 0.6614099144935608, + "learning_rate": 0.0001365270441264987, + "loss": 0.8245, + "step": 4510 + }, + { + "epoch": 0.38, + "grad_norm": 0.6148139834403992, + "learning_rate": 0.00013640348974124474, + "loss": 0.8813, + "step": 4515 + }, + { + "epoch": 0.38, + "grad_norm": 0.7663241624832153, + "learning_rate": 0.0001362798712607125, + "loss": 0.981, + "step": 4520 + }, + { + "epoch": 0.38, + "grad_norm": 0.6683364510536194, + "learning_rate": 0.00013615618890255589, + "loss": 0.9146, + "step": 4525 + }, + { + "epoch": 0.38, + "grad_norm": 0.644741952419281, + "learning_rate": 0.0001360324428845412, + "loss": 0.9912, + "step": 4530 + }, + { + "epoch": 0.38, + "grad_norm": 0.6618791818618774, + "learning_rate": 0.00013590863342454693, + "loss": 0.8832, + "step": 4535 + }, + { + "epoch": 0.38, + "grad_norm": 0.5592582821846008, + "learning_rate": 0.0001357847607405632, + "loss": 0.8504, + "step": 4540 + }, + { + "epoch": 0.38, + "grad_norm": 0.7591463327407837, + "learning_rate": 0.00013566082505069143, + "loss": 1.0021, + "step": 4545 + }, + { + "epoch": 0.38, + "grad_norm": 0.5851222276687622, + "learning_rate": 0.00013553682657314412, + "loss": 0.8535, + "step": 4550 + }, + { + "epoch": 0.38, + "grad_norm": 0.6739622354507446, + "learning_rate": 0.00013541276552624405, + "loss": 0.8448, + "step": 4555 + }, + { + "epoch": 0.39, + "grad_norm": 0.6252307295799255, + "learning_rate": 0.00013528864212842444, + "loss": 0.8135, + "step": 4560 + }, + { + "epoch": 0.39, + "grad_norm": 0.9975850582122803, + "learning_rate": 0.00013516445659822815, + "loss": 0.9283, + "step": 4565 + }, + { + "epoch": 0.39, + "grad_norm": 0.6981391310691833, + "learning_rate": 0.00013504020915430746, + "loss": 0.9259, + "step": 4570 + }, + { + "epoch": 0.39, + "grad_norm": 0.5520308017730713, + "learning_rate": 0.00013491590001542367, + "loss": 0.8799, + "step": 4575 + }, + { + "epoch": 0.39, + "grad_norm": 0.6810752153396606, + "learning_rate": 0.00013479152940044665, + "loss": 0.9626, + "step": 4580 + }, + { + "epoch": 0.39, + "grad_norm": 0.7570480108261108, + "learning_rate": 0.00013466709752835466, + "loss": 0.9614, + "step": 4585 + }, + { + "epoch": 0.39, + "grad_norm": 0.7146151661872864, + "learning_rate": 0.00013454260461823365, + "loss": 0.8655, + "step": 4590 + }, + { + "epoch": 0.39, + "grad_norm": 0.6674631834030151, + "learning_rate": 0.00013441805088927706, + "loss": 0.9221, + "step": 4595 + }, + { + "epoch": 0.39, + "grad_norm": 0.6004106998443604, + "learning_rate": 0.00013429343656078555, + "loss": 0.8091, + "step": 4600 + }, + { + "epoch": 0.39, + "grad_norm": 0.5462713241577148, + "learning_rate": 0.0001341687618521663, + "loss": 0.8159, + "step": 4605 + }, + { + "epoch": 0.39, + "grad_norm": 0.7340354323387146, + "learning_rate": 0.00013404402698293294, + "loss": 0.8762, + "step": 4610 + }, + { + "epoch": 0.39, + "grad_norm": 0.6642255783081055, + "learning_rate": 0.00013391923217270497, + "loss": 0.9129, + "step": 4615 + }, + { + "epoch": 0.39, + "grad_norm": 0.7021158933639526, + "learning_rate": 0.00013379437764120738, + "loss": 0.8938, + "step": 4620 + }, + { + "epoch": 0.39, + "grad_norm": 0.6833433508872986, + "learning_rate": 0.00013366946360827037, + "loss": 0.9181, + "step": 4625 + }, + { + "epoch": 0.39, + "grad_norm": 0.7175443172454834, + "learning_rate": 0.00013354449029382893, + "loss": 0.7676, + "step": 4630 + }, + { + "epoch": 0.39, + "grad_norm": 0.6878382563591003, + "learning_rate": 0.00013341945791792238, + "loss": 0.9573, + "step": 4635 + }, + { + "epoch": 0.39, + "grad_norm": 0.7180898189544678, + "learning_rate": 0.00013329436670069395, + "loss": 0.8508, + "step": 4640 + }, + { + "epoch": 0.39, + "grad_norm": 0.6059539914131165, + "learning_rate": 0.0001331692168623907, + "loss": 0.9017, + "step": 4645 + }, + { + "epoch": 0.39, + "grad_norm": 0.6651197671890259, + "learning_rate": 0.00013304400862336263, + "loss": 0.8017, + "step": 4650 + }, + { + "epoch": 0.39, + "grad_norm": 0.7838413119316101, + "learning_rate": 0.00013291874220406274, + "loss": 0.988, + "step": 4655 + }, + { + "epoch": 0.39, + "grad_norm": 0.6739104986190796, + "learning_rate": 0.00013279341782504645, + "loss": 0.9072, + "step": 4660 + }, + { + "epoch": 0.39, + "grad_norm": 0.6881901025772095, + "learning_rate": 0.00013266803570697116, + "loss": 0.8914, + "step": 4665 + }, + { + "epoch": 0.39, + "grad_norm": 0.6005337834358215, + "learning_rate": 0.00013254259607059605, + "loss": 0.9813, + "step": 4670 + }, + { + "epoch": 0.39, + "grad_norm": 0.6260716319084167, + "learning_rate": 0.0001324170991367814, + "loss": 0.8676, + "step": 4675 + }, + { + "epoch": 0.4, + "grad_norm": 0.644090473651886, + "learning_rate": 0.0001322915451264885, + "loss": 0.8814, + "step": 4680 + }, + { + "epoch": 0.4, + "grad_norm": 0.6466627717018127, + "learning_rate": 0.00013216593426077918, + "loss": 0.9372, + "step": 4685 + }, + { + "epoch": 0.4, + "grad_norm": 0.6953723430633545, + "learning_rate": 0.00013204026676081517, + "loss": 0.9277, + "step": 4690 + }, + { + "epoch": 0.4, + "grad_norm": 0.8439871668815613, + "learning_rate": 0.0001319145428478581, + "loss": 0.8426, + "step": 4695 + }, + { + "epoch": 0.4, + "grad_norm": 0.6791027784347534, + "learning_rate": 0.0001317887627432689, + "loss": 0.7959, + "step": 4700 + }, + { + "epoch": 0.4, + "grad_norm": 0.6005025506019592, + "learning_rate": 0.00013166292666850734, + "loss": 0.8866, + "step": 4705 + }, + { + "epoch": 0.4, + "grad_norm": 0.754767894744873, + "learning_rate": 0.00013153703484513186, + "loss": 0.9494, + "step": 4710 + }, + { + "epoch": 0.4, + "grad_norm": 0.6382832527160645, + "learning_rate": 0.00013141108749479898, + "loss": 0.889, + "step": 4715 + }, + { + "epoch": 0.4, + "grad_norm": 0.6113407611846924, + "learning_rate": 0.00013128508483926298, + "loss": 0.8678, + "step": 4720 + }, + { + "epoch": 0.4, + "grad_norm": 0.6342490315437317, + "learning_rate": 0.00013115902710037554, + "loss": 0.8763, + "step": 4725 + }, + { + "epoch": 0.4, + "grad_norm": 0.6291064620018005, + "learning_rate": 0.00013103291450008533, + "loss": 0.8743, + "step": 4730 + }, + { + "epoch": 0.4, + "grad_norm": 0.796549379825592, + "learning_rate": 0.00013090674726043766, + "loss": 0.8185, + "step": 4735 + }, + { + "epoch": 0.4, + "grad_norm": 0.6640483140945435, + "learning_rate": 0.0001307805256035739, + "loss": 0.8382, + "step": 4740 + }, + { + "epoch": 0.4, + "grad_norm": 0.7306240797042847, + "learning_rate": 0.00013065424975173135, + "loss": 0.8412, + "step": 4745 + }, + { + "epoch": 0.4, + "grad_norm": 0.8053780794143677, + "learning_rate": 0.00013052791992724275, + "loss": 0.9946, + "step": 4750 + }, + { + "epoch": 0.4, + "grad_norm": 0.6016142964363098, + "learning_rate": 0.00013040153635253575, + "loss": 0.9382, + "step": 4755 + }, + { + "epoch": 0.4, + "grad_norm": 0.6706305146217346, + "learning_rate": 0.00013027509925013275, + "loss": 0.9322, + "step": 4760 + }, + { + "epoch": 0.4, + "grad_norm": 0.7635506391525269, + "learning_rate": 0.00013014860884265036, + "loss": 0.9591, + "step": 4765 + }, + { + "epoch": 0.4, + "grad_norm": 0.7298561930656433, + "learning_rate": 0.000130022065352799, + "loss": 0.8639, + "step": 4770 + }, + { + "epoch": 0.4, + "grad_norm": 0.6929640173912048, + "learning_rate": 0.00012989546900338264, + "loss": 0.9269, + "step": 4775 + }, + { + "epoch": 0.4, + "grad_norm": 0.7712711691856384, + "learning_rate": 0.00012976882001729823, + "loss": 0.8759, + "step": 4780 + }, + { + "epoch": 0.4, + "grad_norm": 0.6714412569999695, + "learning_rate": 0.00012964211861753543, + "loss": 0.8264, + "step": 4785 + }, + { + "epoch": 0.4, + "grad_norm": 0.7652319669723511, + "learning_rate": 0.00012951536502717623, + "loss": 0.8953, + "step": 4790 + }, + { + "epoch": 0.41, + "grad_norm": 0.6817525029182434, + "learning_rate": 0.00012938855946939443, + "loss": 0.9328, + "step": 4795 + }, + { + "epoch": 0.41, + "grad_norm": 0.6716194748878479, + "learning_rate": 0.0001292617021674554, + "loss": 0.9282, + "step": 4800 + }, + { + "epoch": 0.41, + "grad_norm": 0.7245244383811951, + "learning_rate": 0.00012913479334471557, + "loss": 0.9207, + "step": 4805 + }, + { + "epoch": 0.41, + "grad_norm": 0.6501569151878357, + "learning_rate": 0.0001290078332246221, + "loss": 0.9289, + "step": 4810 + }, + { + "epoch": 0.41, + "grad_norm": 0.5875827074050903, + "learning_rate": 0.0001288808220307125, + "loss": 0.9092, + "step": 4815 + }, + { + "epoch": 0.41, + "grad_norm": 0.6515260338783264, + "learning_rate": 0.0001287537599866141, + "loss": 0.7633, + "step": 4820 + }, + { + "epoch": 0.41, + "grad_norm": 0.6986538767814636, + "learning_rate": 0.00012862664731604388, + "loss": 0.8686, + "step": 4825 + }, + { + "epoch": 0.41, + "grad_norm": 0.5599557161331177, + "learning_rate": 0.0001284994842428079, + "loss": 0.8821, + "step": 4830 + }, + { + "epoch": 0.41, + "grad_norm": 0.7143694758415222, + "learning_rate": 0.00012837227099080098, + "loss": 1.0317, + "step": 4835 + }, + { + "epoch": 0.41, + "grad_norm": 0.6995121836662292, + "learning_rate": 0.00012824500778400627, + "loss": 0.897, + "step": 4840 + }, + { + "epoch": 0.41, + "grad_norm": 0.6644828915596008, + "learning_rate": 0.00012811769484649492, + "loss": 0.9869, + "step": 4845 + }, + { + "epoch": 0.41, + "grad_norm": 0.7595239877700806, + "learning_rate": 0.0001279903324024256, + "loss": 0.9084, + "step": 4850 + }, + { + "epoch": 0.41, + "grad_norm": 0.6890538334846497, + "learning_rate": 0.0001278629206760441, + "loss": 0.8771, + "step": 4855 + }, + { + "epoch": 0.41, + "grad_norm": 0.7247397303581238, + "learning_rate": 0.0001277354598916831, + "loss": 0.9808, + "step": 4860 + }, + { + "epoch": 0.41, + "grad_norm": 0.6517183780670166, + "learning_rate": 0.00012760795027376158, + "loss": 0.8585, + "step": 4865 + }, + { + "epoch": 0.41, + "grad_norm": 0.7327473163604736, + "learning_rate": 0.00012748039204678446, + "loss": 0.9243, + "step": 4870 + }, + { + "epoch": 0.41, + "grad_norm": 0.5720110535621643, + "learning_rate": 0.00012735278543534243, + "loss": 0.8099, + "step": 4875 + }, + { + "epoch": 0.41, + "grad_norm": 0.7129859924316406, + "learning_rate": 0.00012722513066411103, + "loss": 0.9101, + "step": 4880 + }, + { + "epoch": 0.41, + "grad_norm": 0.6797164678573608, + "learning_rate": 0.00012709742795785097, + "loss": 0.903, + "step": 4885 + }, + { + "epoch": 0.41, + "grad_norm": 0.6795310974121094, + "learning_rate": 0.00012696967754140714, + "loss": 0.9116, + "step": 4890 + }, + { + "epoch": 0.41, + "grad_norm": 0.8723884224891663, + "learning_rate": 0.00012684187963970847, + "loss": 0.9222, + "step": 4895 + }, + { + "epoch": 0.41, + "grad_norm": 0.6401852965354919, + "learning_rate": 0.00012671403447776753, + "loss": 0.8375, + "step": 4900 + }, + { + "epoch": 0.41, + "grad_norm": 0.6854673027992249, + "learning_rate": 0.00012658614228068003, + "loss": 0.8835, + "step": 4905 + }, + { + "epoch": 0.41, + "grad_norm": 0.6760403513908386, + "learning_rate": 0.00012645820327362466, + "loss": 1.0362, + "step": 4910 + }, + { + "epoch": 0.42, + "grad_norm": 0.8172838687896729, + "learning_rate": 0.0001263302176818623, + "loss": 1.0059, + "step": 4915 + }, + { + "epoch": 0.42, + "grad_norm": 0.6979231834411621, + "learning_rate": 0.000126202185730736, + "loss": 1.0049, + "step": 4920 + }, + { + "epoch": 0.42, + "grad_norm": 0.675632894039154, + "learning_rate": 0.00012607410764567045, + "loss": 0.8314, + "step": 4925 + }, + { + "epoch": 0.42, + "grad_norm": 0.7185483574867249, + "learning_rate": 0.00012594598365217144, + "loss": 0.8724, + "step": 4930 + }, + { + "epoch": 0.42, + "grad_norm": 0.7700892686843872, + "learning_rate": 0.00012581781397582567, + "loss": 0.932, + "step": 4935 + }, + { + "epoch": 0.42, + "grad_norm": 0.6535851359367371, + "learning_rate": 0.00012568959884230036, + "loss": 0.8531, + "step": 4940 + }, + { + "epoch": 0.42, + "grad_norm": 0.6171019673347473, + "learning_rate": 0.0001255613384773426, + "loss": 0.8942, + "step": 4945 + }, + { + "epoch": 0.42, + "grad_norm": 0.5885823369026184, + "learning_rate": 0.0001254330331067792, + "loss": 1.0151, + "step": 4950 + }, + { + "epoch": 0.42, + "grad_norm": 0.7443267703056335, + "learning_rate": 0.00012530468295651617, + "loss": 1.0425, + "step": 4955 + }, + { + "epoch": 0.42, + "grad_norm": 0.6809262633323669, + "learning_rate": 0.00012517628825253852, + "loss": 0.9637, + "step": 4960 + }, + { + "epoch": 0.42, + "grad_norm": 0.5984592437744141, + "learning_rate": 0.00012504784922090945, + "loss": 0.9221, + "step": 4965 + }, + { + "epoch": 0.42, + "grad_norm": 0.6745832562446594, + "learning_rate": 0.00012491936608777045, + "loss": 0.7759, + "step": 4970 + }, + { + "epoch": 0.42, + "grad_norm": 0.6799601912498474, + "learning_rate": 0.00012479083907934052, + "loss": 0.8416, + "step": 4975 + }, + { + "epoch": 0.42, + "grad_norm": 0.658454954624176, + "learning_rate": 0.00012466226842191587, + "loss": 0.8171, + "step": 4980 + }, + { + "epoch": 0.42, + "grad_norm": 0.6778995990753174, + "learning_rate": 0.00012453365434186975, + "loss": 0.9652, + "step": 4985 + }, + { + "epoch": 0.42, + "grad_norm": 0.7929224371910095, + "learning_rate": 0.00012440499706565164, + "loss": 0.8463, + "step": 4990 + }, + { + "epoch": 0.42, + "grad_norm": 0.6529220342636108, + "learning_rate": 0.00012427629681978724, + "loss": 0.7954, + "step": 4995 + }, + { + "epoch": 0.42, + "grad_norm": 0.6256465315818787, + "learning_rate": 0.00012414755383087785, + "loss": 0.887, + "step": 5000 + }, + { + "epoch": 0.42, + "grad_norm": 0.6464981436729431, + "learning_rate": 0.0001240187683256, + "loss": 0.8077, + "step": 5005 + }, + { + "epoch": 0.42, + "grad_norm": 0.6251599788665771, + "learning_rate": 0.00012388994053070512, + "loss": 0.7699, + "step": 5010 + }, + { + "epoch": 0.42, + "grad_norm": 0.6544944643974304, + "learning_rate": 0.00012376107067301912, + "loss": 0.9582, + "step": 5015 + }, + { + "epoch": 0.42, + "grad_norm": 0.6476804614067078, + "learning_rate": 0.00012363215897944187, + "loss": 0.9299, + "step": 5020 + }, + { + "epoch": 0.42, + "grad_norm": 0.679531455039978, + "learning_rate": 0.000123503205676947, + "loss": 0.8737, + "step": 5025 + }, + { + "epoch": 0.42, + "grad_norm": 0.7165660858154297, + "learning_rate": 0.00012337421099258133, + "loss": 0.8456, + "step": 5030 + }, + { + "epoch": 0.43, + "grad_norm": 0.6283479332923889, + "learning_rate": 0.00012324517515346467, + "loss": 0.8646, + "step": 5035 + }, + { + "epoch": 0.43, + "grad_norm": 0.687667191028595, + "learning_rate": 0.00012311609838678905, + "loss": 0.9228, + "step": 5040 + }, + { + "epoch": 0.43, + "grad_norm": 0.7233263254165649, + "learning_rate": 0.0001229869809198188, + "loss": 0.9077, + "step": 5045 + }, + { + "epoch": 0.43, + "grad_norm": 0.8381845951080322, + "learning_rate": 0.00012285782297988984, + "loss": 0.8916, + "step": 5050 + }, + { + "epoch": 0.43, + "grad_norm": 0.5955893397331238, + "learning_rate": 0.00012272862479440922, + "loss": 0.8171, + "step": 5055 + }, + { + "epoch": 0.43, + "grad_norm": 0.7041948437690735, + "learning_rate": 0.00012259938659085504, + "loss": 0.941, + "step": 5060 + }, + { + "epoch": 0.43, + "grad_norm": 0.6682184338569641, + "learning_rate": 0.00012247010859677576, + "loss": 0.8761, + "step": 5065 + }, + { + "epoch": 0.43, + "grad_norm": 0.6222808361053467, + "learning_rate": 0.00012234079103978993, + "loss": 0.9773, + "step": 5070 + }, + { + "epoch": 0.43, + "grad_norm": 0.6586690545082092, + "learning_rate": 0.00012221143414758572, + "loss": 0.9188, + "step": 5075 + }, + { + "epoch": 0.43, + "grad_norm": 0.6669284105300903, + "learning_rate": 0.00012208203814792056, + "loss": 0.8918, + "step": 5080 + }, + { + "epoch": 0.43, + "grad_norm": 0.6407530903816223, + "learning_rate": 0.00012195260326862081, + "loss": 0.91, + "step": 5085 + }, + { + "epoch": 0.43, + "grad_norm": 0.9201768040657043, + "learning_rate": 0.00012182312973758118, + "loss": 0.9377, + "step": 5090 + }, + { + "epoch": 0.43, + "grad_norm": 0.6031652688980103, + "learning_rate": 0.00012169361778276451, + "loss": 0.9322, + "step": 5095 + }, + { + "epoch": 0.43, + "grad_norm": 0.5742199420928955, + "learning_rate": 0.00012156406763220128, + "loss": 1.0405, + "step": 5100 + }, + { + "epoch": 0.43, + "grad_norm": 0.8060709238052368, + "learning_rate": 0.0001214344795139892, + "loss": 0.8857, + "step": 5105 + }, + { + "epoch": 0.43, + "grad_norm": 0.7517329454421997, + "learning_rate": 0.0001213048536562928, + "loss": 0.9292, + "step": 5110 + }, + { + "epoch": 0.43, + "grad_norm": 0.5800279974937439, + "learning_rate": 0.00012117519028734317, + "loss": 0.9284, + "step": 5115 + }, + { + "epoch": 0.43, + "grad_norm": 0.5874794721603394, + "learning_rate": 0.00012104548963543729, + "loss": 0.9274, + "step": 5120 + }, + { + "epoch": 0.43, + "grad_norm": 0.9434084296226501, + "learning_rate": 0.00012091575192893789, + "loss": 0.8461, + "step": 5125 + }, + { + "epoch": 0.43, + "grad_norm": 0.6450409889221191, + "learning_rate": 0.00012078597739627297, + "loss": 0.9097, + "step": 5130 + }, + { + "epoch": 0.43, + "grad_norm": 0.7026113867759705, + "learning_rate": 0.00012065616626593528, + "loss": 0.9492, + "step": 5135 + }, + { + "epoch": 0.43, + "grad_norm": 0.742713451385498, + "learning_rate": 0.00012052631876648199, + "loss": 1.0525, + "step": 5140 + }, + { + "epoch": 0.43, + "grad_norm": 0.6497244238853455, + "learning_rate": 0.00012039643512653444, + "loss": 0.8872, + "step": 5145 + }, + { + "epoch": 0.44, + "grad_norm": 0.7265699505805969, + "learning_rate": 0.00012026651557477745, + "loss": 1.0148, + "step": 5150 + }, + { + "epoch": 0.44, + "grad_norm": 0.6718622446060181, + "learning_rate": 0.00012013656033995921, + "loss": 1.0182, + "step": 5155 + }, + { + "epoch": 0.44, + "grad_norm": 0.7481784820556641, + "learning_rate": 0.00012000656965089063, + "loss": 0.8686, + "step": 5160 + }, + { + "epoch": 0.44, + "grad_norm": 0.6820610165596008, + "learning_rate": 0.00011987654373644506, + "loss": 0.826, + "step": 5165 + }, + { + "epoch": 0.44, + "grad_norm": 0.5447041392326355, + "learning_rate": 0.00011974648282555794, + "loss": 0.8705, + "step": 5170 + }, + { + "epoch": 0.44, + "grad_norm": 0.7161911129951477, + "learning_rate": 0.00011961638714722623, + "loss": 0.9256, + "step": 5175 + }, + { + "epoch": 0.44, + "grad_norm": 0.9855062365531921, + "learning_rate": 0.00011948625693050816, + "loss": 0.8967, + "step": 5180 + }, + { + "epoch": 0.44, + "grad_norm": 0.6545083522796631, + "learning_rate": 0.00011935609240452281, + "loss": 0.7265, + "step": 5185 + }, + { + "epoch": 0.44, + "grad_norm": 0.7910086512565613, + "learning_rate": 0.00011922589379844961, + "loss": 0.914, + "step": 5190 + }, + { + "epoch": 0.44, + "grad_norm": 0.7386326193809509, + "learning_rate": 0.00011909566134152794, + "loss": 0.9195, + "step": 5195 + }, + { + "epoch": 0.44, + "grad_norm": 0.6439299583435059, + "learning_rate": 0.00011896539526305694, + "loss": 0.8454, + "step": 5200 + }, + { + "epoch": 0.44, + "grad_norm": 0.7418578267097473, + "learning_rate": 0.00011883509579239482, + "loss": 0.9424, + "step": 5205 + }, + { + "epoch": 0.44, + "grad_norm": 0.6000198125839233, + "learning_rate": 0.0001187047631589586, + "loss": 0.898, + "step": 5210 + }, + { + "epoch": 0.44, + "grad_norm": 0.7546732425689697, + "learning_rate": 0.00011857439759222373, + "loss": 0.9509, + "step": 5215 + }, + { + "epoch": 0.44, + "grad_norm": 0.6965680122375488, + "learning_rate": 0.00011844399932172362, + "loss": 0.9157, + "step": 5220 + }, + { + "epoch": 0.44, + "grad_norm": 0.7454926371574402, + "learning_rate": 0.00011831356857704927, + "loss": 0.9386, + "step": 5225 + }, + { + "epoch": 0.44, + "grad_norm": 0.6428052186965942, + "learning_rate": 0.00011818310558784882, + "loss": 0.8267, + "step": 5230 + }, + { + "epoch": 0.44, + "grad_norm": 0.7606145143508911, + "learning_rate": 0.00011805261058382723, + "loss": 0.9663, + "step": 5235 + }, + { + "epoch": 0.44, + "grad_norm": 0.7600546479225159, + "learning_rate": 0.0001179220837947459, + "loss": 1.0242, + "step": 5240 + }, + { + "epoch": 0.44, + "grad_norm": 0.6770562529563904, + "learning_rate": 0.00011779152545042195, + "loss": 0.9582, + "step": 5245 + }, + { + "epoch": 0.44, + "grad_norm": 0.7594524025917053, + "learning_rate": 0.00011766093578072832, + "loss": 0.9125, + "step": 5250 + }, + { + "epoch": 0.44, + "grad_norm": 0.6377832293510437, + "learning_rate": 0.00011753031501559298, + "loss": 0.9992, + "step": 5255 + }, + { + "epoch": 0.44, + "grad_norm": 0.8343694806098938, + "learning_rate": 0.00011739966338499866, + "loss": 0.9534, + "step": 5260 + }, + { + "epoch": 0.44, + "grad_norm": 0.645725667476654, + "learning_rate": 0.00011726898111898246, + "loss": 0.9256, + "step": 5265 + }, + { + "epoch": 0.45, + "grad_norm": 0.670018196105957, + "learning_rate": 0.00011713826844763538, + "loss": 0.8773, + "step": 5270 + }, + { + "epoch": 0.45, + "grad_norm": 0.7871105074882507, + "learning_rate": 0.000117007525601102, + "loss": 0.982, + "step": 5275 + }, + { + "epoch": 0.45, + "grad_norm": 0.6718564629554749, + "learning_rate": 0.00011687675280958, + "loss": 0.8211, + "step": 5280 + }, + { + "epoch": 0.45, + "grad_norm": 0.6720396280288696, + "learning_rate": 0.00011674595030331974, + "loss": 0.9162, + "step": 5285 + }, + { + "epoch": 0.45, + "grad_norm": 0.7001306414604187, + "learning_rate": 0.00011661511831262401, + "loss": 0.9542, + "step": 5290 + }, + { + "epoch": 0.45, + "grad_norm": 0.6401751637458801, + "learning_rate": 0.0001164842570678475, + "loss": 0.8765, + "step": 5295 + }, + { + "epoch": 0.45, + "grad_norm": 0.6717837452888489, + "learning_rate": 0.00011635336679939624, + "loss": 0.8623, + "step": 5300 + }, + { + "epoch": 0.45, + "grad_norm": 0.7279519438743591, + "learning_rate": 0.00011622244773772755, + "loss": 1.1157, + "step": 5305 + }, + { + "epoch": 0.45, + "grad_norm": 0.6708661913871765, + "learning_rate": 0.00011609150011334937, + "loss": 0.9421, + "step": 5310 + }, + { + "epoch": 0.45, + "grad_norm": 0.7468719482421875, + "learning_rate": 0.00011596052415681992, + "loss": 0.9219, + "step": 5315 + }, + { + "epoch": 0.45, + "grad_norm": 0.7473458647727966, + "learning_rate": 0.00011582952009874737, + "loss": 1.0115, + "step": 5320 + }, + { + "epoch": 0.45, + "grad_norm": 0.666793167591095, + "learning_rate": 0.00011569848816978924, + "loss": 0.8903, + "step": 5325 + }, + { + "epoch": 0.45, + "grad_norm": 0.7159398198127747, + "learning_rate": 0.00011556742860065226, + "loss": 0.8547, + "step": 5330 + }, + { + "epoch": 0.45, + "grad_norm": 0.6559115648269653, + "learning_rate": 0.00011543634162209178, + "loss": 0.9685, + "step": 5335 + }, + { + "epoch": 0.45, + "grad_norm": 0.6398196816444397, + "learning_rate": 0.00011530522746491132, + "loss": 0.8261, + "step": 5340 + }, + { + "epoch": 0.45, + "grad_norm": 0.6815221309661865, + "learning_rate": 0.00011517408635996241, + "loss": 0.8332, + "step": 5345 + }, + { + "epoch": 0.45, + "grad_norm": 0.6399449706077576, + "learning_rate": 0.00011504291853814393, + "loss": 0.9185, + "step": 5350 + }, + { + "epoch": 0.45, + "grad_norm": 0.6674028635025024, + "learning_rate": 0.00011491172423040178, + "loss": 0.8802, + "step": 5355 + }, + { + "epoch": 0.45, + "grad_norm": 0.6427600979804993, + "learning_rate": 0.00011478050366772855, + "loss": 0.9533, + "step": 5360 + }, + { + "epoch": 0.45, + "grad_norm": 0.8196895718574524, + "learning_rate": 0.00011464925708116306, + "loss": 0.9565, + "step": 5365 + }, + { + "epoch": 0.45, + "grad_norm": 0.675538957118988, + "learning_rate": 0.00011451798470178988, + "loss": 0.92, + "step": 5370 + }, + { + "epoch": 0.45, + "grad_norm": 0.6670119762420654, + "learning_rate": 0.0001143866867607391, + "loss": 0.8504, + "step": 5375 + }, + { + "epoch": 0.45, + "grad_norm": 0.6218076944351196, + "learning_rate": 0.0001142553634891857, + "loss": 0.8803, + "step": 5380 + }, + { + "epoch": 0.45, + "grad_norm": 0.7193536162376404, + "learning_rate": 0.00011412401511834934, + "loss": 0.9395, + "step": 5385 + }, + { + "epoch": 0.46, + "grad_norm": 0.6328010559082031, + "learning_rate": 0.00011399264187949385, + "loss": 0.9048, + "step": 5390 + }, + { + "epoch": 0.46, + "grad_norm": 0.6608994007110596, + "learning_rate": 0.00011386124400392686, + "loss": 0.8393, + "step": 5395 + }, + { + "epoch": 0.46, + "grad_norm": 0.6986910700798035, + "learning_rate": 0.0001137298217229993, + "loss": 0.9332, + "step": 5400 + }, + { + "epoch": 0.46, + "grad_norm": 0.9013976454734802, + "learning_rate": 0.00011359837526810521, + "loss": 0.9066, + "step": 5405 + }, + { + "epoch": 0.46, + "grad_norm": 0.7434113025665283, + "learning_rate": 0.00011346690487068103, + "loss": 0.9425, + "step": 5410 + }, + { + "epoch": 0.46, + "grad_norm": 0.6475106477737427, + "learning_rate": 0.00011333541076220555, + "loss": 0.8998, + "step": 5415 + }, + { + "epoch": 0.46, + "grad_norm": 0.5834239721298218, + "learning_rate": 0.00011320389317419908, + "loss": 0.8483, + "step": 5420 + }, + { + "epoch": 0.46, + "grad_norm": 0.786338746547699, + "learning_rate": 0.00011307235233822345, + "loss": 0.9344, + "step": 5425 + }, + { + "epoch": 0.46, + "grad_norm": 0.6710865497589111, + "learning_rate": 0.00011294078848588136, + "loss": 0.8631, + "step": 5430 + }, + { + "epoch": 0.46, + "grad_norm": 0.5884045958518982, + "learning_rate": 0.00011280920184881598, + "loss": 0.8094, + "step": 5435 + }, + { + "epoch": 0.46, + "grad_norm": 0.6520205140113831, + "learning_rate": 0.0001126775926587107, + "loss": 0.873, + "step": 5440 + }, + { + "epoch": 0.46, + "grad_norm": 0.6478862166404724, + "learning_rate": 0.00011254596114728859, + "loss": 0.8687, + "step": 5445 + }, + { + "epoch": 0.46, + "grad_norm": 0.7314843535423279, + "learning_rate": 0.00011241430754631194, + "loss": 0.9825, + "step": 5450 + }, + { + "epoch": 0.46, + "grad_norm": 0.6119475364685059, + "learning_rate": 0.00011228263208758206, + "loss": 0.8239, + "step": 5455 + }, + { + "epoch": 0.46, + "grad_norm": 0.6662307977676392, + "learning_rate": 0.0001121509350029386, + "loss": 0.775, + "step": 5460 + }, + { + "epoch": 0.46, + "grad_norm": 0.7896751165390015, + "learning_rate": 0.00011201921652425945, + "loss": 0.8355, + "step": 5465 + }, + { + "epoch": 0.46, + "grad_norm": 0.597769558429718, + "learning_rate": 0.00011188747688346002, + "loss": 0.9079, + "step": 5470 + }, + { + "epoch": 0.46, + "grad_norm": 0.7042331099510193, + "learning_rate": 0.00011175571631249305, + "loss": 0.9043, + "step": 5475 + }, + { + "epoch": 0.46, + "grad_norm": 0.7097181081771851, + "learning_rate": 0.00011162393504334814, + "loss": 0.8301, + "step": 5480 + }, + { + "epoch": 0.46, + "grad_norm": 0.773230791091919, + "learning_rate": 0.00011149213330805135, + "loss": 0.9735, + "step": 5485 + }, + { + "epoch": 0.46, + "grad_norm": 0.6911418437957764, + "learning_rate": 0.00011136031133866467, + "loss": 0.9265, + "step": 5490 + }, + { + "epoch": 0.46, + "grad_norm": 0.7387533783912659, + "learning_rate": 0.00011122846936728584, + "loss": 0.9248, + "step": 5495 + }, + { + "epoch": 0.46, + "grad_norm": 0.6310597658157349, + "learning_rate": 0.00011109660762604774, + "loss": 0.8217, + "step": 5500 + }, + { + "epoch": 0.47, + "grad_norm": 0.7292531728744507, + "learning_rate": 0.0001109647263471181, + "loss": 0.7904, + "step": 5505 + }, + { + "epoch": 0.47, + "grad_norm": 0.6988025903701782, + "learning_rate": 0.00011083282576269905, + "loss": 0.945, + "step": 5510 + }, + { + "epoch": 0.47, + "grad_norm": 0.7716386914253235, + "learning_rate": 0.00011070090610502663, + "loss": 1.0456, + "step": 5515 + }, + { + "epoch": 0.47, + "grad_norm": 0.6976988315582275, + "learning_rate": 0.00011056896760637063, + "loss": 0.9231, + "step": 5520 + }, + { + "epoch": 0.47, + "grad_norm": 0.5705205202102661, + "learning_rate": 0.00011043701049903381, + "loss": 0.9198, + "step": 5525 + }, + { + "epoch": 0.47, + "grad_norm": 0.6474895477294922, + "learning_rate": 0.00011030503501535186, + "loss": 0.9076, + "step": 5530 + }, + { + "epoch": 0.47, + "grad_norm": 0.8015474081039429, + "learning_rate": 0.00011017304138769272, + "loss": 1.0548, + "step": 5535 + }, + { + "epoch": 0.47, + "grad_norm": 0.7305599451065063, + "learning_rate": 0.00011004102984845635, + "loss": 0.8504, + "step": 5540 + }, + { + "epoch": 0.47, + "grad_norm": 0.735675573348999, + "learning_rate": 0.00010990900063007414, + "loss": 0.8588, + "step": 5545 + }, + { + "epoch": 0.47, + "grad_norm": 0.7288222312927246, + "learning_rate": 0.00010977695396500878, + "loss": 0.9225, + "step": 5550 + }, + { + "epoch": 0.47, + "grad_norm": 0.6950204372406006, + "learning_rate": 0.00010964489008575354, + "loss": 0.9049, + "step": 5555 + }, + { + "epoch": 0.47, + "grad_norm": 0.7171755433082581, + "learning_rate": 0.00010951280922483198, + "loss": 0.8445, + "step": 5560 + }, + { + "epoch": 0.47, + "grad_norm": 0.6393475532531738, + "learning_rate": 0.0001093807116147977, + "loss": 0.8743, + "step": 5565 + }, + { + "epoch": 0.47, + "grad_norm": 0.6673814058303833, + "learning_rate": 0.00010924859748823366, + "loss": 1.0077, + "step": 5570 + }, + { + "epoch": 0.47, + "grad_norm": 0.7003934979438782, + "learning_rate": 0.00010911646707775194, + "loss": 0.9263, + "step": 5575 + }, + { + "epoch": 0.47, + "grad_norm": 0.7368084788322449, + "learning_rate": 0.00010898432061599333, + "loss": 0.9174, + "step": 5580 + }, + { + "epoch": 0.47, + "grad_norm": 0.7135486602783203, + "learning_rate": 0.00010885215833562683, + "loss": 0.9149, + "step": 5585 + }, + { + "epoch": 0.47, + "grad_norm": 0.7837123274803162, + "learning_rate": 0.00010871998046934928, + "loss": 0.972, + "step": 5590 + }, + { + "epoch": 0.47, + "grad_norm": 0.7325451374053955, + "learning_rate": 0.00010858778724988506, + "loss": 0.9518, + "step": 5595 + }, + { + "epoch": 0.47, + "grad_norm": 0.725795328617096, + "learning_rate": 0.00010845557890998545, + "loss": 0.7896, + "step": 5600 + }, + { + "epoch": 0.47, + "grad_norm": 0.6287091374397278, + "learning_rate": 0.00010832335568242851, + "loss": 1.0537, + "step": 5605 + }, + { + "epoch": 0.47, + "grad_norm": 0.7267513871192932, + "learning_rate": 0.0001081911178000183, + "loss": 0.8986, + "step": 5610 + }, + { + "epoch": 0.47, + "grad_norm": 0.7996358871459961, + "learning_rate": 0.00010805886549558484, + "loss": 0.7822, + "step": 5615 + }, + { + "epoch": 0.47, + "grad_norm": 0.6031549572944641, + "learning_rate": 0.00010792659900198359, + "loss": 0.9068, + "step": 5620 + }, + { + "epoch": 0.48, + "grad_norm": 0.6607316136360168, + "learning_rate": 0.00010779431855209478, + "loss": 0.8688, + "step": 5625 + }, + { + "epoch": 0.48, + "grad_norm": 0.7166876196861267, + "learning_rate": 0.0001076620243788234, + "loss": 0.9289, + "step": 5630 + }, + { + "epoch": 0.48, + "grad_norm": 0.6892978549003601, + "learning_rate": 0.00010752971671509857, + "loss": 0.9349, + "step": 5635 + }, + { + "epoch": 0.48, + "grad_norm": 0.6517727375030518, + "learning_rate": 0.00010739739579387311, + "loss": 0.995, + "step": 5640 + }, + { + "epoch": 0.48, + "grad_norm": 0.7273731231689453, + "learning_rate": 0.00010726506184812322, + "loss": 0.9097, + "step": 5645 + }, + { + "epoch": 0.48, + "grad_norm": 0.5607343912124634, + "learning_rate": 0.00010713271511084797, + "loss": 0.8307, + "step": 5650 + }, + { + "epoch": 0.48, + "grad_norm": 0.6832279562950134, + "learning_rate": 0.00010700035581506908, + "loss": 0.9201, + "step": 5655 + }, + { + "epoch": 0.48, + "grad_norm": 0.7223308086395264, + "learning_rate": 0.00010686798419383027, + "loss": 0.8605, + "step": 5660 + }, + { + "epoch": 0.48, + "grad_norm": 0.6218956708908081, + "learning_rate": 0.00010673560048019693, + "loss": 0.8124, + "step": 5665 + }, + { + "epoch": 0.48, + "grad_norm": 0.5892270803451538, + "learning_rate": 0.0001066032049072559, + "loss": 0.882, + "step": 5670 + }, + { + "epoch": 0.48, + "grad_norm": 0.7258059978485107, + "learning_rate": 0.00010647079770811479, + "loss": 0.868, + "step": 5675 + }, + { + "epoch": 0.48, + "grad_norm": 0.8001238703727722, + "learning_rate": 0.00010633837911590163, + "loss": 0.8023, + "step": 5680 + }, + { + "epoch": 0.48, + "grad_norm": 0.7664066553115845, + "learning_rate": 0.00010620594936376466, + "loss": 0.9067, + "step": 5685 + }, + { + "epoch": 0.48, + "grad_norm": 0.5850561857223511, + "learning_rate": 0.00010607350868487165, + "loss": 0.824, + "step": 5690 + }, + { + "epoch": 0.48, + "grad_norm": 0.8648871183395386, + "learning_rate": 0.00010594105731240961, + "loss": 0.871, + "step": 5695 + }, + { + "epoch": 0.48, + "grad_norm": 0.6848756074905396, + "learning_rate": 0.00010580859547958448, + "loss": 0.7997, + "step": 5700 + }, + { + "epoch": 0.48, + "grad_norm": 0.9481160640716553, + "learning_rate": 0.00010567612341962048, + "loss": 0.9955, + "step": 5705 + }, + { + "epoch": 0.48, + "grad_norm": 0.7015474438667297, + "learning_rate": 0.00010554364136575998, + "loss": 0.9107, + "step": 5710 + }, + { + "epoch": 0.48, + "grad_norm": 0.7029824256896973, + "learning_rate": 0.00010541114955126284, + "loss": 0.9192, + "step": 5715 + }, + { + "epoch": 0.48, + "grad_norm": 0.6626318097114563, + "learning_rate": 0.00010527864820940608, + "loss": 0.8277, + "step": 5720 + }, + { + "epoch": 0.48, + "grad_norm": 0.7007662653923035, + "learning_rate": 0.00010514613757348364, + "loss": 0.9554, + "step": 5725 + }, + { + "epoch": 0.48, + "grad_norm": 0.5854273438453674, + "learning_rate": 0.0001050136178768057, + "loss": 0.8745, + "step": 5730 + }, + { + "epoch": 0.48, + "grad_norm": 0.8346757292747498, + "learning_rate": 0.00010488108935269843, + "loss": 1.0342, + "step": 5735 + }, + { + "epoch": 0.48, + "grad_norm": 0.6574100255966187, + "learning_rate": 0.00010474855223450355, + "loss": 0.9006, + "step": 5740 + }, + { + "epoch": 0.49, + "grad_norm": 0.5388402938842773, + "learning_rate": 0.0001046160067555779, + "loss": 0.9565, + "step": 5745 + }, + { + "epoch": 0.49, + "grad_norm": 0.6622834205627441, + "learning_rate": 0.00010448345314929301, + "loss": 0.9538, + "step": 5750 + }, + { + "epoch": 0.49, + "grad_norm": 0.6827812194824219, + "learning_rate": 0.00010435089164903484, + "loss": 0.9606, + "step": 5755 + }, + { + "epoch": 0.49, + "grad_norm": 0.7946076989173889, + "learning_rate": 0.00010421832248820309, + "loss": 0.9556, + "step": 5760 + }, + { + "epoch": 0.49, + "grad_norm": 0.6617308855056763, + "learning_rate": 0.00010408574590021101, + "loss": 0.9374, + "step": 5765 + }, + { + "epoch": 0.49, + "grad_norm": 0.8348833918571472, + "learning_rate": 0.000103953162118485, + "loss": 0.9182, + "step": 5770 + }, + { + "epoch": 0.49, + "grad_norm": 0.7646685838699341, + "learning_rate": 0.00010382057137646401, + "loss": 0.9838, + "step": 5775 + }, + { + "epoch": 0.49, + "grad_norm": 0.605670154094696, + "learning_rate": 0.00010368797390759937, + "loss": 0.7536, + "step": 5780 + }, + { + "epoch": 0.49, + "grad_norm": 0.7083485126495361, + "learning_rate": 0.0001035553699453541, + "loss": 0.9704, + "step": 5785 + }, + { + "epoch": 0.49, + "grad_norm": 0.6487541794776917, + "learning_rate": 0.00010342275972320276, + "loss": 0.9805, + "step": 5790 + }, + { + "epoch": 0.49, + "grad_norm": 0.6678948402404785, + "learning_rate": 0.00010329014347463097, + "loss": 0.9252, + "step": 5795 + }, + { + "epoch": 0.49, + "grad_norm": 0.7373467087745667, + "learning_rate": 0.00010315752143313479, + "loss": 0.8942, + "step": 5800 + }, + { + "epoch": 0.49, + "grad_norm": 0.6852657198905945, + "learning_rate": 0.00010302489383222065, + "loss": 0.8262, + "step": 5805 + }, + { + "epoch": 0.49, + "grad_norm": 0.7030515670776367, + "learning_rate": 0.00010289226090540473, + "loss": 0.7714, + "step": 5810 + }, + { + "epoch": 0.49, + "grad_norm": 0.7813947796821594, + "learning_rate": 0.00010275962288621251, + "loss": 0.9961, + "step": 5815 + }, + { + "epoch": 0.49, + "grad_norm": 0.7376470565795898, + "learning_rate": 0.00010262698000817852, + "loss": 0.8246, + "step": 5820 + }, + { + "epoch": 0.49, + "grad_norm": 0.6423072814941406, + "learning_rate": 0.00010249433250484579, + "loss": 0.8495, + "step": 5825 + }, + { + "epoch": 0.49, + "grad_norm": 0.703114926815033, + "learning_rate": 0.00010236168060976555, + "loss": 0.8891, + "step": 5830 + }, + { + "epoch": 0.49, + "grad_norm": 0.8897769451141357, + "learning_rate": 0.00010222902455649673, + "loss": 0.8899, + "step": 5835 + }, + { + "epoch": 0.49, + "grad_norm": 0.7237744927406311, + "learning_rate": 0.00010209636457860552, + "loss": 0.9736, + "step": 5840 + }, + { + "epoch": 0.49, + "grad_norm": 0.6191496253013611, + "learning_rate": 0.00010196370090966516, + "loss": 0.8626, + "step": 5845 + }, + { + "epoch": 0.49, + "grad_norm": 0.6924605965614319, + "learning_rate": 0.0001018310337832553, + "loss": 0.8597, + "step": 5850 + }, + { + "epoch": 0.49, + "grad_norm": 0.9236262440681458, + "learning_rate": 0.00010169836343296162, + "loss": 0.887, + "step": 5855 + }, + { + "epoch": 0.5, + "grad_norm": 0.792788565158844, + "learning_rate": 0.0001015656900923756, + "loss": 0.8942, + "step": 5860 + }, + { + "epoch": 0.5, + "grad_norm": 0.84331214427948, + "learning_rate": 0.00010143301399509395, + "loss": 0.9444, + "step": 5865 + }, + { + "epoch": 0.5, + "grad_norm": 0.5621626973152161, + "learning_rate": 0.00010130033537471815, + "loss": 0.9003, + "step": 5870 + }, + { + "epoch": 0.5, + "grad_norm": 0.6908104419708252, + "learning_rate": 0.00010116765446485423, + "loss": 0.8987, + "step": 5875 + }, + { + "epoch": 0.5, + "grad_norm": 0.6341535449028015, + "learning_rate": 0.0001010349714991122, + "loss": 0.7817, + "step": 5880 + }, + { + "epoch": 0.5, + "grad_norm": 0.7346782088279724, + "learning_rate": 0.00010090228671110568, + "loss": 1.0713, + "step": 5885 + }, + { + "epoch": 0.5, + "grad_norm": 0.6268035173416138, + "learning_rate": 0.00010076960033445155, + "loss": 0.8851, + "step": 5890 + }, + { + "epoch": 0.5, + "grad_norm": 0.7278366684913635, + "learning_rate": 0.0001006369126027694, + "loss": 0.8986, + "step": 5895 + }, + { + "epoch": 0.5, + "grad_norm": 0.7147312164306641, + "learning_rate": 0.00010050422374968131, + "loss": 0.7817, + "step": 5900 + }, + { + "epoch": 0.5, + "grad_norm": 0.6873770356178284, + "learning_rate": 0.00010037153400881126, + "loss": 0.9511, + "step": 5905 + }, + { + "epoch": 0.5, + "grad_norm": 0.6804821491241455, + "learning_rate": 0.00010023884361378477, + "loss": 0.9003, + "step": 5910 + }, + { + "epoch": 0.5, + "grad_norm": 0.7105894684791565, + "learning_rate": 0.00010010615279822865, + "loss": 0.9026, + "step": 5915 + }, + { + "epoch": 0.5, + "grad_norm": 0.7469001412391663, + "learning_rate": 9.99734617957703e-05, + "loss": 0.8842, + "step": 5920 + }, + { + "epoch": 0.5, + "grad_norm": 0.6915943026542664, + "learning_rate": 9.984077084003752e-05, + "loss": 0.9516, + "step": 5925 + }, + { + "epoch": 0.5, + "grad_norm": 0.7061002254486084, + "learning_rate": 9.970808016465797e-05, + "loss": 1.0175, + "step": 5930 + }, + { + "epoch": 0.5, + "grad_norm": 0.6731154322624207, + "learning_rate": 9.957539000325893e-05, + "loss": 1.0213, + "step": 5935 + }, + { + "epoch": 0.5, + "grad_norm": 0.8961304426193237, + "learning_rate": 9.944270058946666e-05, + "loss": 1.0686, + "step": 5940 + }, + { + "epoch": 0.5, + "grad_norm": 0.7615440487861633, + "learning_rate": 9.931001215690616e-05, + "loss": 0.9474, + "step": 5945 + }, + { + "epoch": 0.5, + "grad_norm": 0.6351935863494873, + "learning_rate": 9.917732493920071e-05, + "loss": 0.7973, + "step": 5950 + }, + { + "epoch": 0.5, + "grad_norm": 0.6528865694999695, + "learning_rate": 9.90446391699714e-05, + "loss": 1.0574, + "step": 5955 + }, + { + "epoch": 0.5, + "grad_norm": 0.8191664814949036, + "learning_rate": 9.891195508283684e-05, + "loss": 0.8424, + "step": 5960 + }, + { + "epoch": 0.5, + "grad_norm": 0.6079810261726379, + "learning_rate": 9.877927291141261e-05, + "loss": 0.8154, + "step": 5965 + }, + { + "epoch": 0.5, + "grad_norm": 0.7904455065727234, + "learning_rate": 9.864659288931095e-05, + "loss": 0.8835, + "step": 5970 + }, + { + "epoch": 0.5, + "grad_norm": 0.704140305519104, + "learning_rate": 9.851391525014035e-05, + "loss": 0.96, + "step": 5975 + }, + { + "epoch": 0.51, + "grad_norm": 0.714992880821228, + "learning_rate": 9.838124022750502e-05, + "loss": 0.9106, + "step": 5980 + }, + { + "epoch": 0.51, + "grad_norm": 0.5958659052848816, + "learning_rate": 9.824856805500462e-05, + "loss": 0.9694, + "step": 5985 + }, + { + "epoch": 0.51, + "grad_norm": 0.7285751104354858, + "learning_rate": 9.811589896623382e-05, + "loss": 0.9694, + "step": 5990 + }, + { + "epoch": 0.51, + "grad_norm": 0.8631846904754639, + "learning_rate": 9.798323319478178e-05, + "loss": 0.7295, + "step": 5995 + }, + { + "epoch": 0.51, + "grad_norm": 0.6519853472709656, + "learning_rate": 9.785057097423186e-05, + "loss": 0.8604, + "step": 6000 + }, + { + "epoch": 0.51, + "grad_norm": 0.6751505732536316, + "learning_rate": 9.771791253816123e-05, + "loss": 0.8958, + "step": 6005 + }, + { + "epoch": 0.51, + "grad_norm": 0.6848597526550293, + "learning_rate": 9.758525812014029e-05, + "loss": 0.8678, + "step": 6010 + }, + { + "epoch": 0.51, + "grad_norm": 0.8685235381126404, + "learning_rate": 9.745260795373239e-05, + "loss": 0.9, + "step": 6015 + }, + { + "epoch": 0.51, + "grad_norm": 0.660624086856842, + "learning_rate": 9.731996227249347e-05, + "loss": 0.9081, + "step": 6020 + }, + { + "epoch": 0.51, + "grad_norm": 0.8399864435195923, + "learning_rate": 9.718732130997148e-05, + "loss": 0.9256, + "step": 6025 + }, + { + "epoch": 0.51, + "grad_norm": 0.6387921571731567, + "learning_rate": 9.705468529970613e-05, + "loss": 0.9444, + "step": 6030 + }, + { + "epoch": 0.51, + "grad_norm": 0.7852475047111511, + "learning_rate": 9.692205447522837e-05, + "loss": 1.0471, + "step": 6035 + }, + { + "epoch": 0.51, + "grad_norm": 0.7889160513877869, + "learning_rate": 9.678942907006002e-05, + "loss": 0.8986, + "step": 6040 + }, + { + "epoch": 0.51, + "grad_norm": 0.7621517181396484, + "learning_rate": 9.665680931771341e-05, + "loss": 0.9484, + "step": 6045 + }, + { + "epoch": 0.51, + "grad_norm": 0.6188884973526001, + "learning_rate": 9.652419545169083e-05, + "loss": 0.8797, + "step": 6050 + }, + { + "epoch": 0.51, + "grad_norm": 0.7267559766769409, + "learning_rate": 9.639158770548426e-05, + "loss": 0.8975, + "step": 6055 + }, + { + "epoch": 0.51, + "grad_norm": 0.8356955051422119, + "learning_rate": 9.625898631257492e-05, + "loss": 0.9743, + "step": 6060 + }, + { + "epoch": 0.51, + "grad_norm": 0.7362446188926697, + "learning_rate": 9.612639150643282e-05, + "loss": 0.8549, + "step": 6065 + }, + { + "epoch": 0.51, + "grad_norm": 0.7240428328514099, + "learning_rate": 9.599380352051633e-05, + "loss": 0.9337, + "step": 6070 + }, + { + "epoch": 0.51, + "grad_norm": 0.6012915968894958, + "learning_rate": 9.586122258827193e-05, + "loss": 0.9229, + "step": 6075 + }, + { + "epoch": 0.51, + "grad_norm": 0.8492560982704163, + "learning_rate": 9.572864894313357e-05, + "loss": 0.8502, + "step": 6080 + }, + { + "epoch": 0.51, + "grad_norm": 0.8213250041007996, + "learning_rate": 9.559608281852238e-05, + "loss": 0.9357, + "step": 6085 + }, + { + "epoch": 0.51, + "grad_norm": 0.7091009616851807, + "learning_rate": 9.546352444784632e-05, + "loss": 0.8587, + "step": 6090 + }, + { + "epoch": 0.51, + "grad_norm": 0.7153491973876953, + "learning_rate": 9.533097406449962e-05, + "loss": 0.9584, + "step": 6095 + }, + { + "epoch": 0.52, + "grad_norm": 0.7168654203414917, + "learning_rate": 9.519843190186249e-05, + "loss": 0.9244, + "step": 6100 + }, + { + "epoch": 0.52, + "grad_norm": 0.692661464214325, + "learning_rate": 9.506589819330069e-05, + "loss": 0.8989, + "step": 6105 + }, + { + "epoch": 0.52, + "grad_norm": 0.6731054186820984, + "learning_rate": 9.493337317216498e-05, + "loss": 0.8934, + "step": 6110 + }, + { + "epoch": 0.52, + "grad_norm": 0.665674090385437, + "learning_rate": 9.4800857071791e-05, + "loss": 0.8809, + "step": 6115 + }, + { + "epoch": 0.52, + "grad_norm": 0.6796616911888123, + "learning_rate": 9.466835012549855e-05, + "loss": 0.777, + "step": 6120 + }, + { + "epoch": 0.52, + "grad_norm": 0.7332762479782104, + "learning_rate": 9.453585256659127e-05, + "loss": 0.8262, + "step": 6125 + }, + { + "epoch": 0.52, + "grad_norm": 0.6969581246376038, + "learning_rate": 9.440336462835648e-05, + "loss": 0.898, + "step": 6130 + }, + { + "epoch": 0.52, + "grad_norm": 0.7141453623771667, + "learning_rate": 9.42708865440644e-05, + "loss": 0.9318, + "step": 6135 + }, + { + "epoch": 0.52, + "grad_norm": 0.6876282691955566, + "learning_rate": 9.413841854696785e-05, + "loss": 0.8526, + "step": 6140 + }, + { + "epoch": 0.52, + "grad_norm": 0.7855693697929382, + "learning_rate": 9.400596087030207e-05, + "loss": 0.8262, + "step": 6145 + }, + { + "epoch": 0.52, + "grad_norm": 0.7895783185958862, + "learning_rate": 9.387351374728403e-05, + "loss": 0.8961, + "step": 6150 + }, + { + "epoch": 0.52, + "grad_norm": 0.6923259496688843, + "learning_rate": 9.3741077411112e-05, + "loss": 0.9048, + "step": 6155 + }, + { + "epoch": 0.52, + "grad_norm": 0.789193868637085, + "learning_rate": 9.360865209496554e-05, + "loss": 0.8957, + "step": 6160 + }, + { + "epoch": 0.52, + "grad_norm": 0.7600136995315552, + "learning_rate": 9.347623803200456e-05, + "loss": 1.0295, + "step": 6165 + }, + { + "epoch": 0.52, + "grad_norm": 0.5362326502799988, + "learning_rate": 9.334383545536918e-05, + "loss": 0.8746, + "step": 6170 + }, + { + "epoch": 0.52, + "grad_norm": 0.7427657842636108, + "learning_rate": 9.321144459817952e-05, + "loss": 0.993, + "step": 6175 + }, + { + "epoch": 0.52, + "grad_norm": 0.7276807427406311, + "learning_rate": 9.307906569353474e-05, + "loss": 0.9435, + "step": 6180 + }, + { + "epoch": 0.52, + "grad_norm": 0.7534928917884827, + "learning_rate": 9.294669897451324e-05, + "loss": 0.9096, + "step": 6185 + }, + { + "epoch": 0.52, + "grad_norm": 0.6962252855300903, + "learning_rate": 9.281434467417181e-05, + "loss": 0.8884, + "step": 6190 + }, + { + "epoch": 0.52, + "grad_norm": 0.6648967862129211, + "learning_rate": 9.268200302554533e-05, + "loss": 0.881, + "step": 6195 + }, + { + "epoch": 0.52, + "grad_norm": 0.7699589729309082, + "learning_rate": 9.254967426164661e-05, + "loss": 0.9009, + "step": 6200 + }, + { + "epoch": 0.52, + "grad_norm": 0.7004358172416687, + "learning_rate": 9.241735861546555e-05, + "loss": 0.9127, + "step": 6205 + }, + { + "epoch": 0.52, + "grad_norm": 0.7334666848182678, + "learning_rate": 9.228505631996905e-05, + "loss": 1.0146, + "step": 6210 + }, + { + "epoch": 0.53, + "grad_norm": 0.7301573753356934, + "learning_rate": 9.215276760810061e-05, + "loss": 0.9073, + "step": 6215 + }, + { + "epoch": 0.53, + "grad_norm": 0.6607963442802429, + "learning_rate": 9.202049271277961e-05, + "loss": 0.8612, + "step": 6220 + }, + { + "epoch": 0.53, + "grad_norm": 0.759436845779419, + "learning_rate": 9.188823186690117e-05, + "loss": 0.8869, + "step": 6225 + }, + { + "epoch": 0.53, + "grad_norm": 0.5646474957466125, + "learning_rate": 9.175598530333582e-05, + "loss": 0.8658, + "step": 6230 + }, + { + "epoch": 0.53, + "grad_norm": 0.654805600643158, + "learning_rate": 9.162375325492875e-05, + "loss": 1.0176, + "step": 6235 + }, + { + "epoch": 0.53, + "grad_norm": 0.6620368361473083, + "learning_rate": 9.149153595449968e-05, + "loss": 0.8632, + "step": 6240 + }, + { + "epoch": 0.53, + "grad_norm": 0.7386505007743835, + "learning_rate": 9.135933363484236e-05, + "loss": 0.9617, + "step": 6245 + }, + { + "epoch": 0.53, + "grad_norm": 0.8237467408180237, + "learning_rate": 9.122714652872412e-05, + "loss": 0.9263, + "step": 6250 + }, + { + "epoch": 0.53, + "grad_norm": 0.701260507106781, + "learning_rate": 9.109497486888564e-05, + "loss": 0.8149, + "step": 6255 + }, + { + "epoch": 0.53, + "grad_norm": 0.6147468090057373, + "learning_rate": 9.096281888804022e-05, + "loss": 0.972, + "step": 6260 + }, + { + "epoch": 0.53, + "grad_norm": 0.7591169476509094, + "learning_rate": 9.083067881887365e-05, + "loss": 0.8234, + "step": 6265 + }, + { + "epoch": 0.53, + "grad_norm": 0.7122942805290222, + "learning_rate": 9.069855489404372e-05, + "loss": 0.8351, + "step": 6270 + }, + { + "epoch": 0.53, + "grad_norm": 0.8314396739006042, + "learning_rate": 9.056644734617975e-05, + "loss": 0.9396, + "step": 6275 + }, + { + "epoch": 0.53, + "grad_norm": 1.0538718700408936, + "learning_rate": 9.043435640788222e-05, + "loss": 0.9465, + "step": 6280 + }, + { + "epoch": 0.53, + "grad_norm": 0.7261043190956116, + "learning_rate": 9.030228231172245e-05, + "loss": 0.935, + "step": 6285 + }, + { + "epoch": 0.53, + "grad_norm": 0.7041414976119995, + "learning_rate": 9.0170225290242e-05, + "loss": 0.895, + "step": 6290 + }, + { + "epoch": 0.53, + "grad_norm": 0.7024123668670654, + "learning_rate": 9.003818557595241e-05, + "loss": 0.8386, + "step": 6295 + }, + { + "epoch": 0.53, + "grad_norm": 0.6621682047843933, + "learning_rate": 8.990616340133478e-05, + "loss": 0.8501, + "step": 6300 + }, + { + "epoch": 0.53, + "grad_norm": 0.7288509011268616, + "learning_rate": 8.977415899883928e-05, + "loss": 0.8765, + "step": 6305 + }, + { + "epoch": 0.53, + "grad_norm": 0.7058336138725281, + "learning_rate": 8.964217260088479e-05, + "loss": 0.8252, + "step": 6310 + }, + { + "epoch": 0.53, + "grad_norm": 0.6799612045288086, + "learning_rate": 8.951020443985854e-05, + "loss": 0.9172, + "step": 6315 + }, + { + "epoch": 0.53, + "grad_norm": 0.6987103223800659, + "learning_rate": 8.937825474811558e-05, + "loss": 0.8024, + "step": 6320 + }, + { + "epoch": 0.53, + "grad_norm": 0.7850658893585205, + "learning_rate": 8.924632375797852e-05, + "loss": 0.8931, + "step": 6325 + }, + { + "epoch": 0.53, + "grad_norm": 0.5284181833267212, + "learning_rate": 8.911441170173698e-05, + "loss": 0.732, + "step": 6330 + }, + { + "epoch": 0.54, + "grad_norm": 0.6844927668571472, + "learning_rate": 8.898251881164723e-05, + "loss": 0.9311, + "step": 6335 + }, + { + "epoch": 0.54, + "grad_norm": 0.692069947719574, + "learning_rate": 8.88506453199319e-05, + "loss": 0.8758, + "step": 6340 + }, + { + "epoch": 0.54, + "grad_norm": 0.6895598769187927, + "learning_rate": 8.871879145877933e-05, + "loss": 0.8757, + "step": 6345 + }, + { + "epoch": 0.54, + "grad_norm": 0.6951631307601929, + "learning_rate": 8.858695746034336e-05, + "loss": 0.8794, + "step": 6350 + }, + { + "epoch": 0.54, + "grad_norm": 0.6704897284507751, + "learning_rate": 8.84551435567429e-05, + "loss": 0.9, + "step": 6355 + }, + { + "epoch": 0.54, + "grad_norm": 0.6468834280967712, + "learning_rate": 8.832334998006143e-05, + "loss": 0.8502, + "step": 6360 + }, + { + "epoch": 0.54, + "grad_norm": 0.6603518128395081, + "learning_rate": 8.819157696234659e-05, + "loss": 0.8134, + "step": 6365 + }, + { + "epoch": 0.54, + "grad_norm": 0.8900942802429199, + "learning_rate": 8.805982473560996e-05, + "loss": 0.8732, + "step": 6370 + }, + { + "epoch": 0.54, + "grad_norm": 0.7345215082168579, + "learning_rate": 8.792809353182638e-05, + "loss": 0.9701, + "step": 6375 + }, + { + "epoch": 0.54, + "grad_norm": 0.6759326457977295, + "learning_rate": 8.779638358293374e-05, + "loss": 0.9155, + "step": 6380 + }, + { + "epoch": 0.54, + "grad_norm": 0.6166871786117554, + "learning_rate": 8.766469512083251e-05, + "loss": 0.8308, + "step": 6385 + }, + { + "epoch": 0.54, + "grad_norm": 0.7100403308868408, + "learning_rate": 8.753302837738527e-05, + "loss": 0.8684, + "step": 6390 + }, + { + "epoch": 0.54, + "grad_norm": 0.7487990260124207, + "learning_rate": 8.740138358441648e-05, + "loss": 0.9515, + "step": 6395 + }, + { + "epoch": 0.54, + "grad_norm": 0.7730888724327087, + "learning_rate": 8.72697609737118e-05, + "loss": 0.8772, + "step": 6400 + }, + { + "epoch": 0.54, + "grad_norm": 0.643085241317749, + "learning_rate": 8.713816077701792e-05, + "loss": 0.8361, + "step": 6405 + }, + { + "epoch": 0.54, + "grad_norm": 0.7037286758422852, + "learning_rate": 8.700658322604211e-05, + "loss": 0.8065, + "step": 6410 + }, + { + "epoch": 0.54, + "grad_norm": 0.7489007711410522, + "learning_rate": 8.687502855245169e-05, + "loss": 0.9188, + "step": 6415 + }, + { + "epoch": 0.54, + "grad_norm": 0.6815351843833923, + "learning_rate": 8.674349698787366e-05, + "loss": 0.9211, + "step": 6420 + }, + { + "epoch": 0.54, + "grad_norm": 0.7705895900726318, + "learning_rate": 8.661198876389448e-05, + "loss": 0.9211, + "step": 6425 + }, + { + "epoch": 0.54, + "grad_norm": 0.6026811003684998, + "learning_rate": 8.64805041120594e-05, + "loss": 0.9545, + "step": 6430 + }, + { + "epoch": 0.54, + "grad_norm": 0.725632905960083, + "learning_rate": 8.634904326387216e-05, + "loss": 0.8984, + "step": 6435 + }, + { + "epoch": 0.54, + "grad_norm": 0.8397646546363831, + "learning_rate": 8.621760645079468e-05, + "loss": 0.9554, + "step": 6440 + }, + { + "epoch": 0.54, + "grad_norm": 0.6730992197990417, + "learning_rate": 8.608619390424648e-05, + "loss": 0.6669, + "step": 6445 + }, + { + "epoch": 0.54, + "grad_norm": 0.7317922711372375, + "learning_rate": 8.595480585560438e-05, + "loss": 0.8063, + "step": 6450 + }, + { + "epoch": 0.55, + "grad_norm": 0.7895035743713379, + "learning_rate": 8.582344253620208e-05, + "loss": 0.9371, + "step": 6455 + }, + { + "epoch": 0.55, + "grad_norm": 0.605285108089447, + "learning_rate": 8.569210417732975e-05, + "loss": 0.6815, + "step": 6460 + }, + { + "epoch": 0.55, + "grad_norm": 0.689128577709198, + "learning_rate": 8.556079101023348e-05, + "loss": 0.8454, + "step": 6465 + }, + { + "epoch": 0.55, + "grad_norm": 0.9090960025787354, + "learning_rate": 8.542950326611525e-05, + "loss": 0.862, + "step": 6470 + }, + { + "epoch": 0.55, + "grad_norm": 0.7995282411575317, + "learning_rate": 8.529824117613208e-05, + "loss": 0.9217, + "step": 6475 + }, + { + "epoch": 0.55, + "grad_norm": 0.821121096611023, + "learning_rate": 8.516700497139589e-05, + "loss": 0.954, + "step": 6480 + }, + { + "epoch": 0.55, + "grad_norm": 0.7355401515960693, + "learning_rate": 8.503579488297304e-05, + "loss": 0.7894, + "step": 6485 + }, + { + "epoch": 0.55, + "grad_norm": 0.7227281928062439, + "learning_rate": 8.490461114188383e-05, + "loss": 0.965, + "step": 6490 + }, + { + "epoch": 0.55, + "grad_norm": 0.5704253315925598, + "learning_rate": 8.477345397910229e-05, + "loss": 0.9453, + "step": 6495 + }, + { + "epoch": 0.55, + "grad_norm": 0.6398398876190186, + "learning_rate": 8.464232362555557e-05, + "loss": 0.8524, + "step": 6500 + }, + { + "epoch": 0.55, + "grad_norm": 0.6368110775947571, + "learning_rate": 8.451122031212357e-05, + "loss": 0.9498, + "step": 6505 + }, + { + "epoch": 0.55, + "grad_norm": 0.6170530915260315, + "learning_rate": 8.438014426963874e-05, + "loss": 0.9573, + "step": 6510 + }, + { + "epoch": 0.55, + "grad_norm": 0.6524907946586609, + "learning_rate": 8.424909572888542e-05, + "loss": 0.8642, + "step": 6515 + }, + { + "epoch": 0.55, + "grad_norm": 0.6852608323097229, + "learning_rate": 8.411807492059944e-05, + "loss": 0.8879, + "step": 6520 + }, + { + "epoch": 0.55, + "grad_norm": 0.6991391777992249, + "learning_rate": 8.398708207546797e-05, + "loss": 0.9535, + "step": 6525 + }, + { + "epoch": 0.55, + "grad_norm": 0.5600320100784302, + "learning_rate": 8.385611742412887e-05, + "loss": 0.8784, + "step": 6530 + }, + { + "epoch": 0.55, + "grad_norm": 0.8736658692359924, + "learning_rate": 8.372518119717027e-05, + "loss": 1.0701, + "step": 6535 + }, + { + "epoch": 0.55, + "grad_norm": 0.6932893991470337, + "learning_rate": 8.359427362513046e-05, + "loss": 0.8919, + "step": 6540 + }, + { + "epoch": 0.55, + "grad_norm": 0.7107034921646118, + "learning_rate": 8.346339493849704e-05, + "loss": 0.8767, + "step": 6545 + }, + { + "epoch": 0.55, + "grad_norm": 0.7890344262123108, + "learning_rate": 8.333254536770696e-05, + "loss": 0.8633, + "step": 6550 + }, + { + "epoch": 0.55, + "grad_norm": 0.6766764521598816, + "learning_rate": 8.320172514314581e-05, + "loss": 0.8874, + "step": 6555 + }, + { + "epoch": 0.55, + "grad_norm": 0.7544351816177368, + "learning_rate": 8.307093449514743e-05, + "loss": 0.8401, + "step": 6560 + }, + { + "epoch": 0.55, + "grad_norm": 0.6718683838844299, + "learning_rate": 8.294017365399377e-05, + "loss": 0.9419, + "step": 6565 + }, + { + "epoch": 0.55, + "grad_norm": 0.667589545249939, + "learning_rate": 8.280944284991418e-05, + "loss": 0.9142, + "step": 6570 + }, + { + "epoch": 0.56, + "grad_norm": 0.7825703620910645, + "learning_rate": 8.267874231308506e-05, + "loss": 1.0079, + "step": 6575 + }, + { + "epoch": 0.56, + "grad_norm": 0.6643834114074707, + "learning_rate": 8.254807227362973e-05, + "loss": 0.7752, + "step": 6580 + }, + { + "epoch": 0.56, + "grad_norm": 0.6645832657814026, + "learning_rate": 8.241743296161759e-05, + "loss": 0.8748, + "step": 6585 + }, + { + "epoch": 0.56, + "grad_norm": 0.6274651885032654, + "learning_rate": 8.228682460706403e-05, + "loss": 1.0167, + "step": 6590 + }, + { + "epoch": 0.56, + "grad_norm": 0.8026042580604553, + "learning_rate": 8.215624743993003e-05, + "loss": 0.8749, + "step": 6595 + }, + { + "epoch": 0.56, + "grad_norm": 0.7438853979110718, + "learning_rate": 8.20257016901215e-05, + "loss": 0.8102, + "step": 6600 + }, + { + "epoch": 0.56, + "grad_norm": 0.8356721997261047, + "learning_rate": 8.189518758748908e-05, + "loss": 0.884, + "step": 6605 + }, + { + "epoch": 0.56, + "grad_norm": 0.7413963675498962, + "learning_rate": 8.176470536182777e-05, + "loss": 0.8673, + "step": 6610 + }, + { + "epoch": 0.56, + "grad_norm": 0.8624594211578369, + "learning_rate": 8.163425524287628e-05, + "loss": 0.905, + "step": 6615 + }, + { + "epoch": 0.56, + "grad_norm": 0.7963995933532715, + "learning_rate": 8.150383746031707e-05, + "loss": 0.9699, + "step": 6620 + }, + { + "epoch": 0.56, + "grad_norm": 0.6565963625907898, + "learning_rate": 8.137345224377536e-05, + "loss": 0.9647, + "step": 6625 + }, + { + "epoch": 0.56, + "grad_norm": 0.827908992767334, + "learning_rate": 8.124309982281914e-05, + "loss": 0.9212, + "step": 6630 + }, + { + "epoch": 0.56, + "grad_norm": 0.6607983708381653, + "learning_rate": 8.111278042695881e-05, + "loss": 0.7614, + "step": 6635 + }, + { + "epoch": 0.56, + "grad_norm": 0.7446150779724121, + "learning_rate": 8.098249428564635e-05, + "loss": 0.8997, + "step": 6640 + }, + { + "epoch": 0.56, + "grad_norm": 0.6742003560066223, + "learning_rate": 8.08522416282754e-05, + "loss": 0.9889, + "step": 6645 + }, + { + "epoch": 0.56, + "grad_norm": 0.6623156666755676, + "learning_rate": 8.072202268418057e-05, + "loss": 0.8356, + "step": 6650 + }, + { + "epoch": 0.56, + "grad_norm": 1.0455914735794067, + "learning_rate": 8.059183768263712e-05, + "loss": 0.7725, + "step": 6655 + }, + { + "epoch": 0.56, + "grad_norm": 0.6387537121772766, + "learning_rate": 8.046168685286052e-05, + "loss": 0.8654, + "step": 6660 + }, + { + "epoch": 0.56, + "grad_norm": 0.7108795642852783, + "learning_rate": 8.033157042400613e-05, + "loss": 0.7349, + "step": 6665 + }, + { + "epoch": 0.56, + "grad_norm": 0.7971534729003906, + "learning_rate": 8.02014886251687e-05, + "loss": 0.8774, + "step": 6670 + }, + { + "epoch": 0.56, + "grad_norm": 0.7332680821418762, + "learning_rate": 8.007144168538198e-05, + "loss": 0.8998, + "step": 6675 + }, + { + "epoch": 0.56, + "grad_norm": 0.7816454768180847, + "learning_rate": 7.994142983361843e-05, + "loss": 0.8344, + "step": 6680 + }, + { + "epoch": 0.56, + "grad_norm": 0.7597826719284058, + "learning_rate": 7.981145329878867e-05, + "loss": 0.9178, + "step": 6685 + }, + { + "epoch": 0.57, + "grad_norm": 0.6635742783546448, + "learning_rate": 7.96815123097411e-05, + "loss": 0.79, + "step": 6690 + }, + { + "epoch": 0.57, + "grad_norm": 0.6543397307395935, + "learning_rate": 7.955160709526167e-05, + "loss": 0.9132, + "step": 6695 + }, + { + "epoch": 0.57, + "grad_norm": 0.7143847346305847, + "learning_rate": 7.942173788407318e-05, + "loss": 0.8708, + "step": 6700 + }, + { + "epoch": 0.57, + "grad_norm": 0.8613307476043701, + "learning_rate": 7.929190490483517e-05, + "loss": 0.9289, + "step": 6705 + }, + { + "epoch": 0.57, + "grad_norm": 0.7165137529373169, + "learning_rate": 7.916210838614331e-05, + "loss": 0.9139, + "step": 6710 + }, + { + "epoch": 0.57, + "grad_norm": 0.7924880385398865, + "learning_rate": 7.903234855652907e-05, + "loss": 0.9247, + "step": 6715 + }, + { + "epoch": 0.57, + "grad_norm": 0.7915977835655212, + "learning_rate": 7.890262564445939e-05, + "loss": 0.8209, + "step": 6720 + }, + { + "epoch": 0.57, + "grad_norm": 0.6632764339447021, + "learning_rate": 7.877293987833617e-05, + "loss": 0.8967, + "step": 6725 + }, + { + "epoch": 0.57, + "grad_norm": 0.759326159954071, + "learning_rate": 7.864329148649584e-05, + "loss": 0.8654, + "step": 6730 + }, + { + "epoch": 0.57, + "grad_norm": 0.6976078748703003, + "learning_rate": 7.851368069720917e-05, + "loss": 0.8969, + "step": 6735 + }, + { + "epoch": 0.57, + "grad_norm": 0.8202966451644897, + "learning_rate": 7.838410773868061e-05, + "loss": 0.7885, + "step": 6740 + }, + { + "epoch": 0.57, + "grad_norm": 0.7911727428436279, + "learning_rate": 7.825457283904802e-05, + "loss": 0.9239, + "step": 6745 + }, + { + "epoch": 0.57, + "grad_norm": 0.7848120331764221, + "learning_rate": 7.81250762263823e-05, + "loss": 0.9427, + "step": 6750 + }, + { + "epoch": 0.57, + "grad_norm": 0.6924152970314026, + "learning_rate": 7.799561812868691e-05, + "loss": 0.8516, + "step": 6755 + }, + { + "epoch": 0.57, + "grad_norm": 0.8438791036605835, + "learning_rate": 7.786619877389742e-05, + "loss": 0.9296, + "step": 6760 + }, + { + "epoch": 0.57, + "grad_norm": 0.7251960635185242, + "learning_rate": 7.773681838988136e-05, + "loss": 0.7171, + "step": 6765 + }, + { + "epoch": 0.57, + "grad_norm": 0.8076329231262207, + "learning_rate": 7.760747720443744e-05, + "loss": 0.8795, + "step": 6770 + }, + { + "epoch": 0.57, + "grad_norm": 0.7533569931983948, + "learning_rate": 7.747817544529555e-05, + "loss": 0.8433, + "step": 6775 + }, + { + "epoch": 0.57, + "grad_norm": 0.6897385716438293, + "learning_rate": 7.7348913340116e-05, + "loss": 0.7796, + "step": 6780 + }, + { + "epoch": 0.57, + "grad_norm": 0.7647858262062073, + "learning_rate": 7.721969111648936e-05, + "loss": 0.9444, + "step": 6785 + }, + { + "epoch": 0.57, + "grad_norm": 0.8716790676116943, + "learning_rate": 7.709050900193601e-05, + "loss": 0.9598, + "step": 6790 + }, + { + "epoch": 0.57, + "grad_norm": 0.6649170517921448, + "learning_rate": 7.696136722390566e-05, + "loss": 0.7195, + "step": 6795 + }, + { + "epoch": 0.57, + "grad_norm": 0.8289276361465454, + "learning_rate": 7.683226600977695e-05, + "loss": 1.0056, + "step": 6800 + }, + { + "epoch": 0.57, + "grad_norm": 0.6001815795898438, + "learning_rate": 7.670320558685724e-05, + "loss": 0.739, + "step": 6805 + }, + { + "epoch": 0.58, + "grad_norm": 0.7629383206367493, + "learning_rate": 7.657418618238196e-05, + "loss": 0.871, + "step": 6810 + }, + { + "epoch": 0.58, + "grad_norm": 0.7336935997009277, + "learning_rate": 7.644520802351431e-05, + "loss": 0.9111, + "step": 6815 + }, + { + "epoch": 0.58, + "grad_norm": 0.6389930248260498, + "learning_rate": 7.631627133734497e-05, + "loss": 0.8693, + "step": 6820 + }, + { + "epoch": 0.58, + "grad_norm": 0.7814512848854065, + "learning_rate": 7.61873763508915e-05, + "loss": 0.9354, + "step": 6825 + }, + { + "epoch": 0.58, + "grad_norm": 0.8012021780014038, + "learning_rate": 7.605852329109808e-05, + "loss": 0.9172, + "step": 6830 + }, + { + "epoch": 0.58, + "grad_norm": 0.7072709798812866, + "learning_rate": 7.592971238483508e-05, + "loss": 0.7958, + "step": 6835 + }, + { + "epoch": 0.58, + "grad_norm": 0.7154051065444946, + "learning_rate": 7.580094385889862e-05, + "loss": 0.9477, + "step": 6840 + }, + { + "epoch": 0.58, + "grad_norm": 0.7569812536239624, + "learning_rate": 7.567221794001025e-05, + "loss": 0.8842, + "step": 6845 + }, + { + "epoch": 0.58, + "grad_norm": 0.7594531178474426, + "learning_rate": 7.554353485481646e-05, + "loss": 0.8449, + "step": 6850 + }, + { + "epoch": 0.58, + "grad_norm": 0.712028443813324, + "learning_rate": 7.54148948298883e-05, + "loss": 0.9318, + "step": 6855 + }, + { + "epoch": 0.58, + "grad_norm": 0.8297650814056396, + "learning_rate": 7.528629809172109e-05, + "loss": 0.8338, + "step": 6860 + }, + { + "epoch": 0.58, + "grad_norm": 0.6936589479446411, + "learning_rate": 7.515774486673386e-05, + "loss": 0.9181, + "step": 6865 + }, + { + "epoch": 0.58, + "grad_norm": 0.7724419832229614, + "learning_rate": 7.502923538126903e-05, + "loss": 1.0482, + "step": 6870 + }, + { + "epoch": 0.58, + "grad_norm": 0.7306141257286072, + "learning_rate": 7.490076986159207e-05, + "loss": 0.9175, + "step": 6875 + }, + { + "epoch": 0.58, + "grad_norm": 0.8186717629432678, + "learning_rate": 7.477234853389099e-05, + "loss": 0.9299, + "step": 6880 + }, + { + "epoch": 0.58, + "grad_norm": 0.7106767892837524, + "learning_rate": 7.464397162427595e-05, + "loss": 0.8991, + "step": 6885 + }, + { + "epoch": 0.58, + "grad_norm": 0.7082087993621826, + "learning_rate": 7.451563935877901e-05, + "loss": 0.9664, + "step": 6890 + }, + { + "epoch": 0.58, + "grad_norm": 0.6625487804412842, + "learning_rate": 7.438735196335361e-05, + "loss": 0.9274, + "step": 6895 + }, + { + "epoch": 0.58, + "grad_norm": 0.683401346206665, + "learning_rate": 7.425910966387399e-05, + "loss": 0.8408, + "step": 6900 + }, + { + "epoch": 0.58, + "grad_norm": 0.7105583548545837, + "learning_rate": 7.413091268613535e-05, + "loss": 0.8892, + "step": 6905 + }, + { + "epoch": 0.58, + "grad_norm": 0.7126787900924683, + "learning_rate": 7.400276125585275e-05, + "loss": 0.8939, + "step": 6910 + }, + { + "epoch": 0.58, + "grad_norm": 0.7547131776809692, + "learning_rate": 7.387465559866118e-05, + "loss": 0.9486, + "step": 6915 + }, + { + "epoch": 0.58, + "grad_norm": 0.9549170732498169, + "learning_rate": 7.374659594011519e-05, + "loss": 0.741, + "step": 6920 + }, + { + "epoch": 0.58, + "grad_norm": 0.7775087356567383, + "learning_rate": 7.361858250568805e-05, + "loss": 0.9265, + "step": 6925 + }, + { + "epoch": 0.59, + "grad_norm": 0.6937190294265747, + "learning_rate": 7.34906155207719e-05, + "loss": 1.0223, + "step": 6930 + }, + { + "epoch": 0.59, + "grad_norm": 0.803637683391571, + "learning_rate": 7.3362695210677e-05, + "loss": 0.8736, + "step": 6935 + }, + { + "epoch": 0.59, + "grad_norm": 0.7149904370307922, + "learning_rate": 7.32348218006313e-05, + "loss": 0.8665, + "step": 6940 + }, + { + "epoch": 0.59, + "grad_norm": 0.6032792925834656, + "learning_rate": 7.310699551578045e-05, + "loss": 0.842, + "step": 6945 + }, + { + "epoch": 0.59, + "grad_norm": 0.9613990783691406, + "learning_rate": 7.29792165811869e-05, + "loss": 0.8327, + "step": 6950 + }, + { + "epoch": 0.59, + "grad_norm": 0.8656706809997559, + "learning_rate": 7.285148522182975e-05, + "loss": 0.9984, + "step": 6955 + }, + { + "epoch": 0.59, + "grad_norm": 0.8221157789230347, + "learning_rate": 7.272380166260453e-05, + "loss": 0.8664, + "step": 6960 + }, + { + "epoch": 0.59, + "grad_norm": 0.6553401350975037, + "learning_rate": 7.259616612832237e-05, + "loss": 0.8861, + "step": 6965 + }, + { + "epoch": 0.59, + "grad_norm": 0.8584771156311035, + "learning_rate": 7.24685788437099e-05, + "loss": 0.9434, + "step": 6970 + }, + { + "epoch": 0.59, + "grad_norm": 0.6871647834777832, + "learning_rate": 7.234104003340898e-05, + "loss": 0.9134, + "step": 6975 + }, + { + "epoch": 0.59, + "grad_norm": 0.6781166195869446, + "learning_rate": 7.221354992197587e-05, + "loss": 0.8003, + "step": 6980 + }, + { + "epoch": 0.59, + "grad_norm": 0.7665917873382568, + "learning_rate": 7.208610873388122e-05, + "loss": 0.9455, + "step": 6985 + }, + { + "epoch": 0.59, + "grad_norm": 0.7046342492103577, + "learning_rate": 7.195871669350953e-05, + "loss": 0.9578, + "step": 6990 + }, + { + "epoch": 0.59, + "grad_norm": 0.709830105304718, + "learning_rate": 7.183137402515872e-05, + "loss": 0.8432, + "step": 6995 + }, + { + "epoch": 0.59, + "grad_norm": 0.8972019553184509, + "learning_rate": 7.170408095303992e-05, + "loss": 0.9268, + "step": 7000 + }, + { + "epoch": 0.59, + "grad_norm": 0.6620240211486816, + "learning_rate": 7.157683770127671e-05, + "loss": 0.8903, + "step": 7005 + }, + { + "epoch": 0.59, + "grad_norm": 0.6928107738494873, + "learning_rate": 7.14496444939051e-05, + "loss": 0.9334, + "step": 7010 + }, + { + "epoch": 0.59, + "grad_norm": 0.7520967125892639, + "learning_rate": 7.132250155487304e-05, + "loss": 0.8403, + "step": 7015 + }, + { + "epoch": 0.59, + "grad_norm": 0.7167927622795105, + "learning_rate": 7.119540910803982e-05, + "loss": 0.7793, + "step": 7020 + }, + { + "epoch": 0.59, + "grad_norm": 0.713470995426178, + "learning_rate": 7.106836737717589e-05, + "loss": 0.8981, + "step": 7025 + }, + { + "epoch": 0.59, + "grad_norm": 0.7497614026069641, + "learning_rate": 7.094137658596247e-05, + "loss": 0.8394, + "step": 7030 + }, + { + "epoch": 0.59, + "grad_norm": 0.6969075798988342, + "learning_rate": 7.081443695799102e-05, + "loss": 0.7917, + "step": 7035 + }, + { + "epoch": 0.59, + "grad_norm": 0.7626899480819702, + "learning_rate": 7.068754871676291e-05, + "loss": 0.9156, + "step": 7040 + }, + { + "epoch": 0.6, + "grad_norm": 0.6913230419158936, + "learning_rate": 7.056071208568911e-05, + "loss": 0.9365, + "step": 7045 + }, + { + "epoch": 0.6, + "grad_norm": 0.719185471534729, + "learning_rate": 7.043392728808962e-05, + "loss": 0.7711, + "step": 7050 + }, + { + "epoch": 0.6, + "grad_norm": 0.6523585319519043, + "learning_rate": 7.030719454719325e-05, + "loss": 0.7625, + "step": 7055 + }, + { + "epoch": 0.6, + "grad_norm": 0.7925283312797546, + "learning_rate": 7.018051408613715e-05, + "loss": 0.9242, + "step": 7060 + }, + { + "epoch": 0.6, + "grad_norm": 0.7551404237747192, + "learning_rate": 7.005388612796635e-05, + "loss": 0.8112, + "step": 7065 + }, + { + "epoch": 0.6, + "grad_norm": 0.7069532871246338, + "learning_rate": 6.992731089563356e-05, + "loss": 0.9597, + "step": 7070 + }, + { + "epoch": 0.6, + "grad_norm": 0.76814204454422, + "learning_rate": 6.980078861199854e-05, + "loss": 0.8544, + "step": 7075 + }, + { + "epoch": 0.6, + "grad_norm": 0.6964544057846069, + "learning_rate": 6.967431949982789e-05, + "loss": 0.9857, + "step": 7080 + }, + { + "epoch": 0.6, + "grad_norm": 0.7238839268684387, + "learning_rate": 6.954790378179459e-05, + "loss": 0.8608, + "step": 7085 + }, + { + "epoch": 0.6, + "grad_norm": 0.7430681586265564, + "learning_rate": 6.942154168047756e-05, + "loss": 0.8633, + "step": 7090 + }, + { + "epoch": 0.6, + "grad_norm": 0.7325025200843811, + "learning_rate": 6.929523341836133e-05, + "loss": 0.9935, + "step": 7095 + }, + { + "epoch": 0.6, + "grad_norm": 0.8399198055267334, + "learning_rate": 6.916897921783574e-05, + "loss": 0.7856, + "step": 7100 + }, + { + "epoch": 0.6, + "grad_norm": 0.7091554999351501, + "learning_rate": 6.904277930119529e-05, + "loss": 0.8994, + "step": 7105 + }, + { + "epoch": 0.6, + "grad_norm": 0.8715316653251648, + "learning_rate": 6.891663389063898e-05, + "loss": 0.8977, + "step": 7110 + }, + { + "epoch": 0.6, + "grad_norm": 0.7023261189460754, + "learning_rate": 6.879054320826988e-05, + "loss": 0.881, + "step": 7115 + }, + { + "epoch": 0.6, + "grad_norm": 0.6556528210639954, + "learning_rate": 6.866450747609461e-05, + "loss": 0.8441, + "step": 7120 + }, + { + "epoch": 0.6, + "grad_norm": 0.7134454250335693, + "learning_rate": 6.853852691602309e-05, + "loss": 0.8711, + "step": 7125 + }, + { + "epoch": 0.6, + "grad_norm": 0.7662861943244934, + "learning_rate": 6.841260174986811e-05, + "loss": 0.9114, + "step": 7130 + }, + { + "epoch": 0.6, + "grad_norm": 0.8190237283706665, + "learning_rate": 6.828673219934491e-05, + "loss": 0.9327, + "step": 7135 + }, + { + "epoch": 0.6, + "grad_norm": 0.7545244693756104, + "learning_rate": 6.816091848607081e-05, + "loss": 0.826, + "step": 7140 + }, + { + "epoch": 0.6, + "grad_norm": 0.8296646475791931, + "learning_rate": 6.80351608315648e-05, + "loss": 0.6987, + "step": 7145 + }, + { + "epoch": 0.6, + "grad_norm": 0.6238453984260559, + "learning_rate": 6.790945945724721e-05, + "loss": 0.9342, + "step": 7150 + }, + { + "epoch": 0.6, + "grad_norm": 0.7644151449203491, + "learning_rate": 6.778381458443925e-05, + "loss": 0.8402, + "step": 7155 + }, + { + "epoch": 0.6, + "grad_norm": 0.7168214321136475, + "learning_rate": 6.765822643436267e-05, + "loss": 0.8579, + "step": 7160 + }, + { + "epoch": 0.61, + "grad_norm": 0.6701621413230896, + "learning_rate": 6.753269522813929e-05, + "loss": 0.7196, + "step": 7165 + }, + { + "epoch": 0.61, + "grad_norm": 0.7104114890098572, + "learning_rate": 6.740722118679075e-05, + "loss": 0.7927, + "step": 7170 + }, + { + "epoch": 0.61, + "grad_norm": 0.7076807022094727, + "learning_rate": 6.728180453123798e-05, + "loss": 0.8925, + "step": 7175 + }, + { + "epoch": 0.61, + "grad_norm": 0.7029675841331482, + "learning_rate": 6.715644548230086e-05, + "loss": 0.8258, + "step": 7180 + }, + { + "epoch": 0.61, + "grad_norm": 0.7940952777862549, + "learning_rate": 6.703114426069797e-05, + "loss": 0.8566, + "step": 7185 + }, + { + "epoch": 0.61, + "grad_norm": 0.8184251189231873, + "learning_rate": 6.69059010870459e-05, + "loss": 0.878, + "step": 7190 + }, + { + "epoch": 0.61, + "grad_norm": 0.7338846921920776, + "learning_rate": 6.678071618185913e-05, + "loss": 0.7983, + "step": 7195 + }, + { + "epoch": 0.61, + "grad_norm": 0.6665210723876953, + "learning_rate": 6.665558976554957e-05, + "loss": 0.8469, + "step": 7200 + }, + { + "epoch": 0.61, + "grad_norm": 0.7321475744247437, + "learning_rate": 6.653052205842609e-05, + "loss": 0.8938, + "step": 7205 + }, + { + "epoch": 0.61, + "grad_norm": 0.6931696534156799, + "learning_rate": 6.640551328069414e-05, + "loss": 0.9283, + "step": 7210 + }, + { + "epoch": 0.61, + "grad_norm": 0.7098008990287781, + "learning_rate": 6.628056365245561e-05, + "loss": 0.8951, + "step": 7215 + }, + { + "epoch": 0.61, + "grad_norm": 0.8372482061386108, + "learning_rate": 6.615567339370803e-05, + "loss": 0.8581, + "step": 7220 + }, + { + "epoch": 0.61, + "grad_norm": 0.763494610786438, + "learning_rate": 6.603084272434455e-05, + "loss": 0.8382, + "step": 7225 + }, + { + "epoch": 0.61, + "grad_norm": 0.8865066766738892, + "learning_rate": 6.59060718641533e-05, + "loss": 0.978, + "step": 7230 + }, + { + "epoch": 0.61, + "grad_norm": 0.811994194984436, + "learning_rate": 6.578136103281717e-05, + "loss": 0.94, + "step": 7235 + }, + { + "epoch": 0.61, + "grad_norm": 0.8629786372184753, + "learning_rate": 6.565671044991335e-05, + "loss": 0.9999, + "step": 7240 + }, + { + "epoch": 0.61, + "grad_norm": 0.7018327713012695, + "learning_rate": 6.553212033491291e-05, + "loss": 0.8501, + "step": 7245 + }, + { + "epoch": 0.61, + "grad_norm": 0.6607480049133301, + "learning_rate": 6.540759090718047e-05, + "loss": 0.7967, + "step": 7250 + }, + { + "epoch": 0.61, + "grad_norm": 0.8655691146850586, + "learning_rate": 6.528312238597382e-05, + "loss": 0.8571, + "step": 7255 + }, + { + "epoch": 0.61, + "grad_norm": 0.705172598361969, + "learning_rate": 6.515871499044358e-05, + "loss": 0.7804, + "step": 7260 + }, + { + "epoch": 0.61, + "grad_norm": 0.7695091962814331, + "learning_rate": 6.50343689396325e-05, + "loss": 0.8561, + "step": 7265 + }, + { + "epoch": 0.61, + "grad_norm": 0.7393696904182434, + "learning_rate": 6.491008445247563e-05, + "loss": 0.8283, + "step": 7270 + }, + { + "epoch": 0.61, + "grad_norm": 0.7259432077407837, + "learning_rate": 6.478586174779947e-05, + "loss": 0.761, + "step": 7275 + }, + { + "epoch": 0.61, + "grad_norm": 0.8482978343963623, + "learning_rate": 6.466170104432166e-05, + "loss": 0.9335, + "step": 7280 + }, + { + "epoch": 0.62, + "grad_norm": 0.7087788581848145, + "learning_rate": 6.453760256065091e-05, + "loss": 0.9627, + "step": 7285 + }, + { + "epoch": 0.62, + "grad_norm": 0.857541024684906, + "learning_rate": 6.441356651528609e-05, + "loss": 0.9501, + "step": 7290 + }, + { + "epoch": 0.62, + "grad_norm": 0.803997278213501, + "learning_rate": 6.428959312661642e-05, + "loss": 0.8953, + "step": 7295 + }, + { + "epoch": 0.62, + "grad_norm": 0.7093678116798401, + "learning_rate": 6.416568261292062e-05, + "loss": 0.8817, + "step": 7300 + }, + { + "epoch": 0.62, + "grad_norm": 0.7989717125892639, + "learning_rate": 6.404183519236669e-05, + "loss": 0.8569, + "step": 7305 + }, + { + "epoch": 0.62, + "grad_norm": 0.6285958290100098, + "learning_rate": 6.391805108301167e-05, + "loss": 0.9482, + "step": 7310 + }, + { + "epoch": 0.62, + "grad_norm": 0.7950562238693237, + "learning_rate": 6.37943305028011e-05, + "loss": 0.933, + "step": 7315 + }, + { + "epoch": 0.62, + "grad_norm": 0.7158616781234741, + "learning_rate": 6.367067366956854e-05, + "loss": 0.96, + "step": 7320 + }, + { + "epoch": 0.62, + "grad_norm": 0.7947881817817688, + "learning_rate": 6.354708080103548e-05, + "loss": 1.0352, + "step": 7325 + }, + { + "epoch": 0.62, + "grad_norm": 0.7674684524536133, + "learning_rate": 6.342355211481065e-05, + "loss": 0.922, + "step": 7330 + }, + { + "epoch": 0.62, + "grad_norm": 0.7878246307373047, + "learning_rate": 6.33000878283898e-05, + "loss": 0.9961, + "step": 7335 + }, + { + "epoch": 0.62, + "grad_norm": 0.6970924735069275, + "learning_rate": 6.317668815915547e-05, + "loss": 0.8092, + "step": 7340 + }, + { + "epoch": 0.62, + "grad_norm": 0.7438830733299255, + "learning_rate": 6.305335332437617e-05, + "loss": 0.7657, + "step": 7345 + }, + { + "epoch": 0.62, + "grad_norm": 0.6238775253295898, + "learning_rate": 6.293008354120635e-05, + "loss": 0.9087, + "step": 7350 + }, + { + "epoch": 0.62, + "grad_norm": 0.7835497260093689, + "learning_rate": 6.280687902668604e-05, + "loss": 0.9097, + "step": 7355 + }, + { + "epoch": 0.62, + "grad_norm": 0.726040244102478, + "learning_rate": 6.26837399977402e-05, + "loss": 0.8469, + "step": 7360 + }, + { + "epoch": 0.62, + "grad_norm": 0.7236318588256836, + "learning_rate": 6.256066667117855e-05, + "loss": 0.8242, + "step": 7365 + }, + { + "epoch": 0.62, + "grad_norm": 0.7460520267486572, + "learning_rate": 6.243765926369513e-05, + "loss": 1.0041, + "step": 7370 + }, + { + "epoch": 0.62, + "grad_norm": 0.6667692065238953, + "learning_rate": 6.231471799186788e-05, + "loss": 0.8797, + "step": 7375 + }, + { + "epoch": 0.62, + "grad_norm": 0.7931992411613464, + "learning_rate": 6.219184307215843e-05, + "loss": 0.8165, + "step": 7380 + }, + { + "epoch": 0.62, + "grad_norm": 0.8295156955718994, + "learning_rate": 6.206903472091139e-05, + "loss": 0.9236, + "step": 7385 + }, + { + "epoch": 0.62, + "grad_norm": 0.6679932475090027, + "learning_rate": 6.194629315435426e-05, + "loss": 0.9222, + "step": 7390 + }, + { + "epoch": 0.62, + "grad_norm": 0.6346510052680969, + "learning_rate": 6.182361858859699e-05, + "loss": 0.9591, + "step": 7395 + }, + { + "epoch": 0.63, + "grad_norm": 0.7934569716453552, + "learning_rate": 6.170101123963152e-05, + "loss": 0.8165, + "step": 7400 + }, + { + "epoch": 0.63, + "grad_norm": 0.8610015511512756, + "learning_rate": 6.157847132333138e-05, + "loss": 0.934, + "step": 7405 + }, + { + "epoch": 0.63, + "grad_norm": 0.6814960837364197, + "learning_rate": 6.145599905545151e-05, + "loss": 0.9619, + "step": 7410 + }, + { + "epoch": 0.63, + "grad_norm": 0.7562384009361267, + "learning_rate": 6.133359465162767e-05, + "loss": 0.8535, + "step": 7415 + }, + { + "epoch": 0.63, + "grad_norm": 0.9794740080833435, + "learning_rate": 6.121125832737605e-05, + "loss": 0.8925, + "step": 7420 + }, + { + "epoch": 0.63, + "grad_norm": 0.6807798147201538, + "learning_rate": 6.108899029809313e-05, + "loss": 0.8478, + "step": 7425 + }, + { + "epoch": 0.63, + "grad_norm": 0.6749770641326904, + "learning_rate": 6.0966790779055036e-05, + "loss": 0.7792, + "step": 7430 + }, + { + "epoch": 0.63, + "grad_norm": 0.7428116202354431, + "learning_rate": 6.0844659985417285e-05, + "loss": 0.8334, + "step": 7435 + }, + { + "epoch": 0.63, + "grad_norm": 0.7202659845352173, + "learning_rate": 6.0722598132214445e-05, + "loss": 0.8558, + "step": 7440 + }, + { + "epoch": 0.63, + "grad_norm": 0.7060097455978394, + "learning_rate": 6.060060543435961e-05, + "loss": 0.9009, + "step": 7445 + }, + { + "epoch": 0.63, + "grad_norm": 0.658271849155426, + "learning_rate": 6.0478682106644225e-05, + "loss": 0.8167, + "step": 7450 + }, + { + "epoch": 0.63, + "grad_norm": 0.7464290857315063, + "learning_rate": 6.0356828363737484e-05, + "loss": 0.7015, + "step": 7455 + }, + { + "epoch": 0.63, + "grad_norm": 0.7930641174316406, + "learning_rate": 6.0235044420186125e-05, + "loss": 0.8402, + "step": 7460 + }, + { + "epoch": 0.63, + "grad_norm": 0.7233032584190369, + "learning_rate": 6.0113330490413985e-05, + "loss": 0.8989, + "step": 7465 + }, + { + "epoch": 0.63, + "grad_norm": 0.6508310437202454, + "learning_rate": 5.9991686788721646e-05, + "loss": 0.7937, + "step": 7470 + }, + { + "epoch": 0.63, + "grad_norm": 0.5942479372024536, + "learning_rate": 5.9870113529285956e-05, + "loss": 0.9154, + "step": 7475 + }, + { + "epoch": 0.63, + "grad_norm": 0.7045297622680664, + "learning_rate": 5.974861092615985e-05, + "loss": 0.8383, + "step": 7480 + }, + { + "epoch": 0.63, + "grad_norm": 0.7324946522712708, + "learning_rate": 5.96271791932718e-05, + "loss": 0.9954, + "step": 7485 + }, + { + "epoch": 0.63, + "grad_norm": 0.7979293465614319, + "learning_rate": 5.950581854442547e-05, + "loss": 0.8141, + "step": 7490 + }, + { + "epoch": 0.63, + "grad_norm": 0.7801807522773743, + "learning_rate": 5.9384529193299444e-05, + "loss": 0.9454, + "step": 7495 + }, + { + "epoch": 0.63, + "grad_norm": 0.7280653119087219, + "learning_rate": 5.926331135344671e-05, + "loss": 0.8436, + "step": 7500 + }, + { + "epoch": 0.63, + "grad_norm": 0.6937660574913025, + "learning_rate": 5.9142165238294344e-05, + "loss": 0.8859, + "step": 7505 + }, + { + "epoch": 0.63, + "grad_norm": 0.8108348846435547, + "learning_rate": 5.9021091061143194e-05, + "loss": 0.9237, + "step": 7510 + }, + { + "epoch": 0.63, + "grad_norm": 0.6278057098388672, + "learning_rate": 5.89000890351674e-05, + "loss": 0.9174, + "step": 7515 + }, + { + "epoch": 0.64, + "grad_norm": 0.8261899352073669, + "learning_rate": 5.877915937341407e-05, + "loss": 0.9808, + "step": 7520 + }, + { + "epoch": 0.64, + "grad_norm": 0.6049155592918396, + "learning_rate": 5.865830228880294e-05, + "loss": 0.859, + "step": 7525 + }, + { + "epoch": 0.64, + "grad_norm": 0.9032023549079895, + "learning_rate": 5.8537517994125876e-05, + "loss": 0.786, + "step": 7530 + }, + { + "epoch": 0.64, + "grad_norm": 0.7123483419418335, + "learning_rate": 5.84168067020467e-05, + "loss": 0.9726, + "step": 7535 + }, + { + "epoch": 0.64, + "grad_norm": 0.8480584621429443, + "learning_rate": 5.829616862510059e-05, + "loss": 0.8336, + "step": 7540 + }, + { + "epoch": 0.64, + "grad_norm": 0.730718731880188, + "learning_rate": 5.817560397569385e-05, + "loss": 0.8618, + "step": 7545 + }, + { + "epoch": 0.64, + "grad_norm": 0.6453015208244324, + "learning_rate": 5.805511296610362e-05, + "loss": 0.8167, + "step": 7550 + }, + { + "epoch": 0.64, + "grad_norm": 0.5834646821022034, + "learning_rate": 5.793469580847714e-05, + "loss": 0.8459, + "step": 7555 + }, + { + "epoch": 0.64, + "grad_norm": 0.7204859256744385, + "learning_rate": 5.7814352714831774e-05, + "loss": 0.9851, + "step": 7560 + }, + { + "epoch": 0.64, + "grad_norm": 0.7948461771011353, + "learning_rate": 5.769408389705453e-05, + "loss": 0.8953, + "step": 7565 + }, + { + "epoch": 0.64, + "grad_norm": 0.742241382598877, + "learning_rate": 5.757388956690155e-05, + "loss": 0.9261, + "step": 7570 + }, + { + "epoch": 0.64, + "grad_norm": 0.6826033592224121, + "learning_rate": 5.7453769935997825e-05, + "loss": 0.93, + "step": 7575 + }, + { + "epoch": 0.64, + "grad_norm": 0.7635707259178162, + "learning_rate": 5.733372521583686e-05, + "loss": 1.0025, + "step": 7580 + }, + { + "epoch": 0.64, + "grad_norm": 0.7816930413246155, + "learning_rate": 5.721375561778026e-05, + "loss": 0.8832, + "step": 7585 + }, + { + "epoch": 0.64, + "grad_norm": 0.6665587425231934, + "learning_rate": 5.70938613530573e-05, + "loss": 0.8808, + "step": 7590 + }, + { + "epoch": 0.64, + "grad_norm": 0.8372485041618347, + "learning_rate": 5.697404263276476e-05, + "loss": 0.8198, + "step": 7595 + }, + { + "epoch": 0.64, + "grad_norm": 0.7937222123146057, + "learning_rate": 5.685429966786628e-05, + "loss": 0.8799, + "step": 7600 + }, + { + "epoch": 0.64, + "grad_norm": 0.8681680560112, + "learning_rate": 5.673463266919216e-05, + "loss": 0.7796, + "step": 7605 + }, + { + "epoch": 0.64, + "grad_norm": 0.8229097723960876, + "learning_rate": 5.661504184743895e-05, + "loss": 0.9202, + "step": 7610 + }, + { + "epoch": 0.64, + "grad_norm": 0.710150957107544, + "learning_rate": 5.6495527413169026e-05, + "loss": 0.9788, + "step": 7615 + }, + { + "epoch": 0.64, + "grad_norm": 0.8372032046318054, + "learning_rate": 5.6376089576810396e-05, + "loss": 0.8809, + "step": 7620 + }, + { + "epoch": 0.64, + "grad_norm": 0.6738128066062927, + "learning_rate": 5.625672854865609e-05, + "loss": 0.8339, + "step": 7625 + }, + { + "epoch": 0.64, + "grad_norm": 0.8087076544761658, + "learning_rate": 5.613744453886394e-05, + "loss": 0.9022, + "step": 7630 + }, + { + "epoch": 0.64, + "grad_norm": 0.7368395328521729, + "learning_rate": 5.6018237757456163e-05, + "loss": 0.8785, + "step": 7635 + }, + { + "epoch": 0.65, + "grad_norm": 0.8094408512115479, + "learning_rate": 5.5899108414318994e-05, + "loss": 0.8625, + "step": 7640 + }, + { + "epoch": 0.65, + "grad_norm": 0.7902326583862305, + "learning_rate": 5.5780056719202304e-05, + "loss": 0.8728, + "step": 7645 + }, + { + "epoch": 0.65, + "grad_norm": 0.8153887987136841, + "learning_rate": 5.566108288171936e-05, + "loss": 0.8901, + "step": 7650 + }, + { + "epoch": 0.65, + "grad_norm": 0.7384223937988281, + "learning_rate": 5.5542187111346224e-05, + "loss": 0.844, + "step": 7655 + }, + { + "epoch": 0.65, + "grad_norm": 0.6871650218963623, + "learning_rate": 5.5423369617421564e-05, + "loss": 0.7374, + "step": 7660 + }, + { + "epoch": 0.65, + "grad_norm": 0.714224100112915, + "learning_rate": 5.530463060914619e-05, + "loss": 0.8958, + "step": 7665 + }, + { + "epoch": 0.65, + "grad_norm": 0.6976300477981567, + "learning_rate": 5.5185970295582726e-05, + "loss": 0.7616, + "step": 7670 + }, + { + "epoch": 0.65, + "grad_norm": 0.7711216807365417, + "learning_rate": 5.50673888856553e-05, + "loss": 0.8756, + "step": 7675 + }, + { + "epoch": 0.65, + "grad_norm": 0.6878015398979187, + "learning_rate": 5.494888658814907e-05, + "loss": 0.8061, + "step": 7680 + }, + { + "epoch": 0.65, + "grad_norm": 0.8739657998085022, + "learning_rate": 5.483046361170992e-05, + "loss": 0.9096, + "step": 7685 + }, + { + "epoch": 0.65, + "grad_norm": 0.7206617593765259, + "learning_rate": 5.471212016484399e-05, + "loss": 0.9847, + "step": 7690 + }, + { + "epoch": 0.65, + "grad_norm": 0.8055949211120605, + "learning_rate": 5.4593856455917536e-05, + "loss": 0.879, + "step": 7695 + }, + { + "epoch": 0.65, + "grad_norm": 0.816685676574707, + "learning_rate": 5.447567269315627e-05, + "loss": 0.9028, + "step": 7700 + }, + { + "epoch": 0.65, + "grad_norm": 0.7042024731636047, + "learning_rate": 5.435756908464529e-05, + "loss": 0.8437, + "step": 7705 + }, + { + "epoch": 0.65, + "grad_norm": 0.9031636118888855, + "learning_rate": 5.4239545838328475e-05, + "loss": 0.8619, + "step": 7710 + }, + { + "epoch": 0.65, + "grad_norm": 0.8082189559936523, + "learning_rate": 5.4121603162008226e-05, + "loss": 0.9048, + "step": 7715 + }, + { + "epoch": 0.65, + "grad_norm": 0.7807760238647461, + "learning_rate": 5.400374126334511e-05, + "loss": 0.8588, + "step": 7720 + }, + { + "epoch": 0.65, + "grad_norm": 0.6958518028259277, + "learning_rate": 5.388596034985742e-05, + "loss": 0.9614, + "step": 7725 + }, + { + "epoch": 0.65, + "grad_norm": 0.7661911845207214, + "learning_rate": 5.376826062892086e-05, + "loss": 0.8568, + "step": 7730 + }, + { + "epoch": 0.65, + "grad_norm": 0.773129403591156, + "learning_rate": 5.365064230776831e-05, + "loss": 0.8852, + "step": 7735 + }, + { + "epoch": 0.65, + "grad_norm": 0.7705147862434387, + "learning_rate": 5.3533105593489163e-05, + "loss": 0.9321, + "step": 7740 + }, + { + "epoch": 0.65, + "grad_norm": 0.6738101840019226, + "learning_rate": 5.3415650693029205e-05, + "loss": 0.7993, + "step": 7745 + }, + { + "epoch": 0.65, + "grad_norm": 0.9420228600502014, + "learning_rate": 5.329827781319018e-05, + "loss": 0.952, + "step": 7750 + }, + { + "epoch": 0.66, + "grad_norm": 0.7898505330085754, + "learning_rate": 5.318098716062934e-05, + "loss": 0.9361, + "step": 7755 + }, + { + "epoch": 0.66, + "grad_norm": 0.7509401440620422, + "learning_rate": 5.30637789418593e-05, + "loss": 0.908, + "step": 7760 + }, + { + "epoch": 0.66, + "grad_norm": 0.7303262948989868, + "learning_rate": 5.294665336324742e-05, + "loss": 0.9535, + "step": 7765 + }, + { + "epoch": 0.66, + "grad_norm": 0.755186140537262, + "learning_rate": 5.2829610631015606e-05, + "loss": 0.8986, + "step": 7770 + }, + { + "epoch": 0.66, + "grad_norm": 0.8424975872039795, + "learning_rate": 5.271265095123987e-05, + "loss": 0.9281, + "step": 7775 + }, + { + "epoch": 0.66, + "grad_norm": 0.7621006965637207, + "learning_rate": 5.2595774529850006e-05, + "loss": 0.8352, + "step": 7780 + }, + { + "epoch": 0.66, + "grad_norm": 0.7491013407707214, + "learning_rate": 5.24789815726292e-05, + "loss": 0.8101, + "step": 7785 + }, + { + "epoch": 0.66, + "grad_norm": 0.8187841773033142, + "learning_rate": 5.2362272285213756e-05, + "loss": 0.8814, + "step": 7790 + }, + { + "epoch": 0.66, + "grad_norm": 0.7539845705032349, + "learning_rate": 5.224564687309261e-05, + "loss": 1.0235, + "step": 7795 + }, + { + "epoch": 0.66, + "grad_norm": 0.7542448043823242, + "learning_rate": 5.2129105541606916e-05, + "loss": 0.9453, + "step": 7800 + }, + { + "epoch": 0.66, + "grad_norm": 0.7108261585235596, + "learning_rate": 5.2012648495949976e-05, + "loss": 0.8537, + "step": 7805 + }, + { + "epoch": 0.66, + "grad_norm": 0.8113353848457336, + "learning_rate": 5.189627594116657e-05, + "loss": 0.8769, + "step": 7810 + }, + { + "epoch": 0.66, + "grad_norm": 0.8466603755950928, + "learning_rate": 5.1779988082152786e-05, + "loss": 0.899, + "step": 7815 + }, + { + "epoch": 0.66, + "grad_norm": 0.7004279494285583, + "learning_rate": 5.166378512365552e-05, + "loss": 0.8884, + "step": 7820 + }, + { + "epoch": 0.66, + "grad_norm": 0.7048774361610413, + "learning_rate": 5.1547667270272226e-05, + "loss": 0.9008, + "step": 7825 + }, + { + "epoch": 0.66, + "grad_norm": 0.7400707602500916, + "learning_rate": 5.143163472645049e-05, + "loss": 0.8443, + "step": 7830 + }, + { + "epoch": 0.66, + "grad_norm": 0.8226672410964966, + "learning_rate": 5.131568769648775e-05, + "loss": 0.7586, + "step": 7835 + }, + { + "epoch": 0.66, + "grad_norm": 0.7571192979812622, + "learning_rate": 5.119982638453075e-05, + "loss": 0.8673, + "step": 7840 + }, + { + "epoch": 0.66, + "grad_norm": 0.58736652135849, + "learning_rate": 5.108405099457549e-05, + "loss": 0.7977, + "step": 7845 + }, + { + "epoch": 0.66, + "grad_norm": 0.7114312052726746, + "learning_rate": 5.096836173046663e-05, + "loss": 0.9531, + "step": 7850 + }, + { + "epoch": 0.66, + "grad_norm": 0.9230597615242004, + "learning_rate": 5.0852758795897006e-05, + "loss": 0.8671, + "step": 7855 + }, + { + "epoch": 0.66, + "grad_norm": 0.6817287802696228, + "learning_rate": 5.073724239440773e-05, + "loss": 0.8198, + "step": 7860 + }, + { + "epoch": 0.66, + "grad_norm": 0.7963387370109558, + "learning_rate": 5.06218127293874e-05, + "loss": 0.9242, + "step": 7865 + }, + { + "epoch": 0.66, + "grad_norm": 0.7177277207374573, + "learning_rate": 5.050647000407189e-05, + "loss": 0.8446, + "step": 7870 + }, + { + "epoch": 0.67, + "grad_norm": 0.7984820008277893, + "learning_rate": 5.039121442154415e-05, + "loss": 0.9517, + "step": 7875 + }, + { + "epoch": 0.67, + "grad_norm": 0.8220101594924927, + "learning_rate": 5.027604618473347e-05, + "loss": 0.8199, + "step": 7880 + }, + { + "epoch": 0.67, + "grad_norm": 0.6571751236915588, + "learning_rate": 5.016096549641549e-05, + "loss": 0.7767, + "step": 7885 + }, + { + "epoch": 0.67, + "grad_norm": 0.7824209332466125, + "learning_rate": 5.004597255921174e-05, + "loss": 0.962, + "step": 7890 + }, + { + "epoch": 0.67, + "grad_norm": 0.723919689655304, + "learning_rate": 4.993106757558912e-05, + "loss": 0.931, + "step": 7895 + }, + { + "epoch": 0.67, + "grad_norm": 0.6097091436386108, + "learning_rate": 4.981625074785986e-05, + "loss": 0.8089, + "step": 7900 + }, + { + "epoch": 0.67, + "grad_norm": 0.8885555863380432, + "learning_rate": 4.9701522278180736e-05, + "loss": 0.941, + "step": 7905 + }, + { + "epoch": 0.67, + "grad_norm": 1.211531162261963, + "learning_rate": 4.958688236855308e-05, + "loss": 0.8999, + "step": 7910 + }, + { + "epoch": 0.67, + "grad_norm": 0.9506832957267761, + "learning_rate": 4.9472331220822366e-05, + "loss": 1.0478, + "step": 7915 + }, + { + "epoch": 0.67, + "grad_norm": 0.6763548254966736, + "learning_rate": 4.935786903667767e-05, + "loss": 0.8329, + "step": 7920 + }, + { + "epoch": 0.67, + "grad_norm": 0.8154675960540771, + "learning_rate": 4.9243496017651434e-05, + "loss": 0.8818, + "step": 7925 + }, + { + "epoch": 0.67, + "grad_norm": 0.8322145938873291, + "learning_rate": 4.912921236511927e-05, + "loss": 0.8129, + "step": 7930 + }, + { + "epoch": 0.67, + "grad_norm": 0.7868553996086121, + "learning_rate": 4.901501828029919e-05, + "loss": 0.9957, + "step": 7935 + }, + { + "epoch": 0.67, + "grad_norm": 0.7912847399711609, + "learning_rate": 4.890091396425163e-05, + "loss": 0.8805, + "step": 7940 + }, + { + "epoch": 0.67, + "grad_norm": 0.7965796589851379, + "learning_rate": 4.878689961787907e-05, + "loss": 0.9009, + "step": 7945 + }, + { + "epoch": 0.67, + "grad_norm": 0.6940076351165771, + "learning_rate": 4.8672975441925425e-05, + "loss": 0.9712, + "step": 7950 + }, + { + "epoch": 0.67, + "grad_norm": 0.8586145043373108, + "learning_rate": 4.8559141636975925e-05, + "loss": 0.7787, + "step": 7955 + }, + { + "epoch": 0.67, + "grad_norm": 0.6977418661117554, + "learning_rate": 4.844539840345666e-05, + "loss": 0.9263, + "step": 7960 + }, + { + "epoch": 0.67, + "grad_norm": 0.7754257321357727, + "learning_rate": 4.8331745941634235e-05, + "loss": 0.8932, + "step": 7965 + }, + { + "epoch": 0.67, + "grad_norm": 0.7636638283729553, + "learning_rate": 4.821818445161551e-05, + "loss": 0.9223, + "step": 7970 + }, + { + "epoch": 0.67, + "grad_norm": 0.7074651718139648, + "learning_rate": 4.810471413334711e-05, + "loss": 0.9669, + "step": 7975 + }, + { + "epoch": 0.67, + "grad_norm": 0.7296220064163208, + "learning_rate": 4.7991335186615126e-05, + "loss": 0.912, + "step": 7980 + }, + { + "epoch": 0.67, + "grad_norm": 0.9196190237998962, + "learning_rate": 4.78780478110448e-05, + "loss": 0.9151, + "step": 7985 + }, + { + "epoch": 0.67, + "grad_norm": 0.638798713684082, + "learning_rate": 4.776485220610014e-05, + "loss": 0.8709, + "step": 7990 + }, + { + "epoch": 0.68, + "grad_norm": 0.8261780142784119, + "learning_rate": 4.765174857108352e-05, + "loss": 0.892, + "step": 7995 + }, + { + "epoch": 0.68, + "grad_norm": 0.6537724733352661, + "learning_rate": 4.7538737105135526e-05, + "loss": 0.8884, + "step": 8000 + }, + { + "epoch": 0.68, + "grad_norm": 0.6304785013198853, + "learning_rate": 4.7425818007234324e-05, + "loss": 0.8929, + "step": 8005 + }, + { + "epoch": 0.68, + "grad_norm": 0.9420642852783203, + "learning_rate": 4.73129914761955e-05, + "loss": 0.7801, + "step": 8010 + }, + { + "epoch": 0.68, + "grad_norm": 0.6773472428321838, + "learning_rate": 4.720025771067166e-05, + "loss": 0.7125, + "step": 8015 + }, + { + "epoch": 0.68, + "grad_norm": 0.6679495573043823, + "learning_rate": 4.708761690915206e-05, + "loss": 0.9065, + "step": 8020 + }, + { + "epoch": 0.68, + "grad_norm": 0.7421685457229614, + "learning_rate": 4.697506926996226e-05, + "loss": 0.9077, + "step": 8025 + }, + { + "epoch": 0.68, + "grad_norm": 0.7675222158432007, + "learning_rate": 4.686261499126389e-05, + "loss": 0.9164, + "step": 8030 + }, + { + "epoch": 0.68, + "grad_norm": 0.6961898803710938, + "learning_rate": 4.6750254271054087e-05, + "loss": 0.7967, + "step": 8035 + }, + { + "epoch": 0.68, + "grad_norm": 0.9274904727935791, + "learning_rate": 4.663798730716532e-05, + "loss": 0.9743, + "step": 8040 + }, + { + "epoch": 0.68, + "grad_norm": 0.760764479637146, + "learning_rate": 4.6525814297264945e-05, + "loss": 0.9513, + "step": 8045 + }, + { + "epoch": 0.68, + "grad_norm": 0.7536008358001709, + "learning_rate": 4.641373543885489e-05, + "loss": 0.8118, + "step": 8050 + }, + { + "epoch": 0.68, + "grad_norm": 0.8215941190719604, + "learning_rate": 4.6301750929271404e-05, + "loss": 0.8853, + "step": 8055 + }, + { + "epoch": 0.68, + "grad_norm": 0.7154656648635864, + "learning_rate": 4.61898609656845e-05, + "loss": 0.845, + "step": 8060 + }, + { + "epoch": 0.68, + "grad_norm": 0.8202133774757385, + "learning_rate": 4.607806574509781e-05, + "loss": 0.8672, + "step": 8065 + }, + { + "epoch": 0.68, + "grad_norm": 0.7498393654823303, + "learning_rate": 4.596636546434807e-05, + "loss": 0.9539, + "step": 8070 + }, + { + "epoch": 0.68, + "grad_norm": 0.775325357913971, + "learning_rate": 4.585476032010494e-05, + "loss": 0.9314, + "step": 8075 + }, + { + "epoch": 0.68, + "grad_norm": 0.9472012519836426, + "learning_rate": 4.5743250508870475e-05, + "loss": 0.9524, + "step": 8080 + }, + { + "epoch": 0.68, + "grad_norm": 0.8268347978591919, + "learning_rate": 4.5631836226979017e-05, + "loss": 0.9405, + "step": 8085 + }, + { + "epoch": 0.68, + "grad_norm": 0.7666311860084534, + "learning_rate": 4.5520517670596607e-05, + "loss": 0.8873, + "step": 8090 + }, + { + "epoch": 0.68, + "grad_norm": 0.783304750919342, + "learning_rate": 4.540929503572077e-05, + "loss": 0.8744, + "step": 8095 + }, + { + "epoch": 0.68, + "grad_norm": 0.6756754517555237, + "learning_rate": 4.5298168518180115e-05, + "loss": 0.8438, + "step": 8100 + }, + { + "epoch": 0.68, + "grad_norm": 0.6557435989379883, + "learning_rate": 4.518713831363408e-05, + "loss": 0.8878, + "step": 8105 + }, + { + "epoch": 0.69, + "grad_norm": 0.9128133654594421, + "learning_rate": 4.5076204617572425e-05, + "loss": 0.911, + "step": 8110 + }, + { + "epoch": 0.69, + "grad_norm": 0.7338535189628601, + "learning_rate": 4.4965367625315146e-05, + "loss": 0.8907, + "step": 8115 + }, + { + "epoch": 0.69, + "grad_norm": 0.901493489742279, + "learning_rate": 4.4854627532011836e-05, + "loss": 0.9086, + "step": 8120 + }, + { + "epoch": 0.69, + "grad_norm": 0.834770679473877, + "learning_rate": 4.474398453264154e-05, + "loss": 0.8569, + "step": 8125 + }, + { + "epoch": 0.69, + "grad_norm": 0.773137092590332, + "learning_rate": 4.463343882201231e-05, + "loss": 0.9045, + "step": 8130 + }, + { + "epoch": 0.69, + "grad_norm": 0.893915593624115, + "learning_rate": 4.452299059476091e-05, + "loss": 0.9175, + "step": 8135 + }, + { + "epoch": 0.69, + "grad_norm": 0.7731142044067383, + "learning_rate": 4.441264004535254e-05, + "loss": 0.9611, + "step": 8140 + }, + { + "epoch": 0.69, + "grad_norm": 0.8787577748298645, + "learning_rate": 4.430238736808033e-05, + "loss": 0.9501, + "step": 8145 + }, + { + "epoch": 0.69, + "grad_norm": 0.7855997681617737, + "learning_rate": 4.419223275706515e-05, + "loss": 0.8213, + "step": 8150 + }, + { + "epoch": 0.69, + "grad_norm": 0.7477709650993347, + "learning_rate": 4.408217640625514e-05, + "loss": 0.808, + "step": 8155 + }, + { + "epoch": 0.69, + "grad_norm": 0.884198784828186, + "learning_rate": 4.397221850942549e-05, + "loss": 0.8188, + "step": 8160 + }, + { + "epoch": 0.69, + "grad_norm": 0.755005955696106, + "learning_rate": 4.386235926017798e-05, + "loss": 0.9438, + "step": 8165 + }, + { + "epoch": 0.69, + "grad_norm": 0.7178514003753662, + "learning_rate": 4.3752598851940805e-05, + "loss": 1.0384, + "step": 8170 + }, + { + "epoch": 0.69, + "grad_norm": 0.6809802651405334, + "learning_rate": 4.3642937477968105e-05, + "loss": 0.8595, + "step": 8175 + }, + { + "epoch": 0.69, + "grad_norm": 0.6554874777793884, + "learning_rate": 4.3533375331339486e-05, + "loss": 0.8091, + "step": 8180 + }, + { + "epoch": 0.69, + "grad_norm": 0.6444360017776489, + "learning_rate": 4.3423912604960095e-05, + "loss": 0.7865, + "step": 8185 + }, + { + "epoch": 0.69, + "grad_norm": 0.7334080934524536, + "learning_rate": 4.331454949155983e-05, + "loss": 0.9078, + "step": 8190 + }, + { + "epoch": 0.69, + "grad_norm": 0.7476295232772827, + "learning_rate": 4.320528618369337e-05, + "loss": 0.8777, + "step": 8195 + }, + { + "epoch": 0.69, + "grad_norm": 0.7522503733634949, + "learning_rate": 4.309612287373957e-05, + "loss": 0.9077, + "step": 8200 + }, + { + "epoch": 0.69, + "grad_norm": 0.7205644845962524, + "learning_rate": 4.298705975390115e-05, + "loss": 0.8997, + "step": 8205 + }, + { + "epoch": 0.69, + "grad_norm": 0.6863880753517151, + "learning_rate": 4.287809701620459e-05, + "loss": 0.8168, + "step": 8210 + }, + { + "epoch": 0.69, + "grad_norm": 0.7311332821846008, + "learning_rate": 4.2769234852499505e-05, + "loss": 0.9752, + "step": 8215 + }, + { + "epoch": 0.69, + "grad_norm": 0.8853205442428589, + "learning_rate": 4.266047345445846e-05, + "loss": 0.8139, + "step": 8220 + }, + { + "epoch": 0.69, + "grad_norm": 0.6810586452484131, + "learning_rate": 4.255181301357668e-05, + "loss": 0.8455, + "step": 8225 + }, + { + "epoch": 0.7, + "grad_norm": 0.8352945446968079, + "learning_rate": 4.244325372117156e-05, + "loss": 0.8777, + "step": 8230 + }, + { + "epoch": 0.7, + "grad_norm": 0.6936649680137634, + "learning_rate": 4.2334795768382306e-05, + "loss": 0.7469, + "step": 8235 + }, + { + "epoch": 0.7, + "grad_norm": 0.8308777213096619, + "learning_rate": 4.2226439346169924e-05, + "loss": 0.8074, + "step": 8240 + }, + { + "epoch": 0.7, + "grad_norm": 0.7798436880111694, + "learning_rate": 4.211818464531649e-05, + "loss": 0.892, + "step": 8245 + }, + { + "epoch": 0.7, + "grad_norm": 0.8142618536949158, + "learning_rate": 4.2010031856425e-05, + "loss": 0.8976, + "step": 8250 + }, + { + "epoch": 0.7, + "grad_norm": 0.8766167759895325, + "learning_rate": 4.190198116991915e-05, + "loss": 0.9761, + "step": 8255 + }, + { + "epoch": 0.7, + "grad_norm": 0.7710538506507874, + "learning_rate": 4.179403277604259e-05, + "loss": 0.9142, + "step": 8260 + }, + { + "epoch": 0.7, + "grad_norm": 0.800112247467041, + "learning_rate": 4.168618686485916e-05, + "loss": 0.8007, + "step": 8265 + }, + { + "epoch": 0.7, + "grad_norm": 0.7750797271728516, + "learning_rate": 4.1578443626252094e-05, + "loss": 0.7865, + "step": 8270 + }, + { + "epoch": 0.7, + "grad_norm": 0.6965195536613464, + "learning_rate": 4.147080324992384e-05, + "loss": 0.8434, + "step": 8275 + }, + { + "epoch": 0.7, + "grad_norm": 0.8158455491065979, + "learning_rate": 4.136326592539591e-05, + "loss": 0.7183, + "step": 8280 + }, + { + "epoch": 0.7, + "grad_norm": 0.7494208812713623, + "learning_rate": 4.125583184200812e-05, + "loss": 0.8158, + "step": 8285 + }, + { + "epoch": 0.7, + "grad_norm": 0.8211252093315125, + "learning_rate": 4.114850118891866e-05, + "loss": 0.9652, + "step": 8290 + }, + { + "epoch": 0.7, + "grad_norm": 0.7634385824203491, + "learning_rate": 4.104127415510365e-05, + "loss": 0.9349, + "step": 8295 + }, + { + "epoch": 0.7, + "grad_norm": 0.7225372195243835, + "learning_rate": 4.093415092935667e-05, + "loss": 0.8758, + "step": 8300 + }, + { + "epoch": 0.7, + "grad_norm": 0.7113853096961975, + "learning_rate": 4.082713170028858e-05, + "loss": 0.8378, + "step": 8305 + }, + { + "epoch": 0.7, + "grad_norm": 0.7211659550666809, + "learning_rate": 4.0720216656327105e-05, + "loss": 0.9371, + "step": 8310 + }, + { + "epoch": 0.7, + "grad_norm": 0.763894259929657, + "learning_rate": 4.0613405985716554e-05, + "loss": 0.8955, + "step": 8315 + }, + { + "epoch": 0.7, + "grad_norm": 0.7050108909606934, + "learning_rate": 4.050669987651742e-05, + "loss": 0.9218, + "step": 8320 + }, + { + "epoch": 0.7, + "grad_norm": 0.7349695563316345, + "learning_rate": 4.04000985166062e-05, + "loss": 0.8825, + "step": 8325 + }, + { + "epoch": 0.7, + "grad_norm": 0.7132920622825623, + "learning_rate": 4.029360209367487e-05, + "loss": 0.8753, + "step": 8330 + }, + { + "epoch": 0.7, + "grad_norm": 0.6986986994743347, + "learning_rate": 4.0187210795230677e-05, + "loss": 0.8414, + "step": 8335 + }, + { + "epoch": 0.7, + "grad_norm": 0.8510329127311707, + "learning_rate": 4.008092480859574e-05, + "loss": 0.8665, + "step": 8340 + }, + { + "epoch": 0.7, + "grad_norm": 0.8111036419868469, + "learning_rate": 3.997474432090679e-05, + "loss": 0.8516, + "step": 8345 + }, + { + "epoch": 0.71, + "grad_norm": 0.7777669429779053, + "learning_rate": 3.986866951911483e-05, + "loss": 0.8469, + "step": 8350 + }, + { + "epoch": 0.71, + "grad_norm": 0.8348636627197266, + "learning_rate": 3.9762700589984744e-05, + "loss": 0.9188, + "step": 8355 + }, + { + "epoch": 0.71, + "grad_norm": 0.8833033442497253, + "learning_rate": 3.965683772009502e-05, + "loss": 0.956, + "step": 8360 + }, + { + "epoch": 0.71, + "grad_norm": 0.7732721567153931, + "learning_rate": 3.95510810958374e-05, + "loss": 0.9024, + "step": 8365 + }, + { + "epoch": 0.71, + "grad_norm": 0.7568506598472595, + "learning_rate": 3.944543090341656e-05, + "loss": 0.9141, + "step": 8370 + }, + { + "epoch": 0.71, + "grad_norm": 0.7183312773704529, + "learning_rate": 3.933988732884976e-05, + "loss": 0.8308, + "step": 8375 + }, + { + "epoch": 0.71, + "grad_norm": 0.8897679448127747, + "learning_rate": 3.923445055796664e-05, + "loss": 0.9499, + "step": 8380 + }, + { + "epoch": 0.71, + "grad_norm": 0.8179068565368652, + "learning_rate": 3.912912077640869e-05, + "loss": 1.0211, + "step": 8385 + }, + { + "epoch": 0.71, + "grad_norm": 0.9936897158622742, + "learning_rate": 3.9023898169629046e-05, + "loss": 0.9061, + "step": 8390 + }, + { + "epoch": 0.71, + "grad_norm": 0.6856409907341003, + "learning_rate": 3.891878292289216e-05, + "loss": 0.9331, + "step": 8395 + }, + { + "epoch": 0.71, + "grad_norm": 0.7015929222106934, + "learning_rate": 3.881377522127343e-05, + "loss": 0.7606, + "step": 8400 + }, + { + "epoch": 0.71, + "grad_norm": 0.8607639074325562, + "learning_rate": 3.8708875249658905e-05, + "loss": 0.918, + "step": 8405 + }, + { + "epoch": 0.71, + "grad_norm": 1.0351639986038208, + "learning_rate": 3.8604083192745036e-05, + "loss": 0.8172, + "step": 8410 + }, + { + "epoch": 0.71, + "grad_norm": 0.8130463361740112, + "learning_rate": 3.849939923503815e-05, + "loss": 0.9112, + "step": 8415 + }, + { + "epoch": 0.71, + "grad_norm": 0.7744996547698975, + "learning_rate": 3.83948235608543e-05, + "loss": 0.9202, + "step": 8420 + }, + { + "epoch": 0.71, + "grad_norm": 0.8210654258728027, + "learning_rate": 3.829035635431889e-05, + "loss": 0.8436, + "step": 8425 + }, + { + "epoch": 0.71, + "grad_norm": 0.7811559438705444, + "learning_rate": 3.818599779936629e-05, + "loss": 0.9175, + "step": 8430 + }, + { + "epoch": 0.71, + "grad_norm": 0.7337599992752075, + "learning_rate": 3.80817480797397e-05, + "loss": 0.8021, + "step": 8435 + }, + { + "epoch": 0.71, + "grad_norm": 0.7291502356529236, + "learning_rate": 3.7977607378990574e-05, + "loss": 0.8785, + "step": 8440 + }, + { + "epoch": 0.71, + "grad_norm": 0.6997067928314209, + "learning_rate": 3.787357588047844e-05, + "loss": 0.7704, + "step": 8445 + }, + { + "epoch": 0.71, + "grad_norm": 0.8448328375816345, + "learning_rate": 3.7769653767370586e-05, + "loss": 0.9154, + "step": 8450 + }, + { + "epoch": 0.71, + "grad_norm": 0.8886889815330505, + "learning_rate": 3.766584122264166e-05, + "loss": 0.9185, + "step": 8455 + }, + { + "epoch": 0.71, + "grad_norm": 0.7532009482383728, + "learning_rate": 3.7562138429073424e-05, + "loss": 0.795, + "step": 8460 + }, + { + "epoch": 0.72, + "grad_norm": 0.7099334001541138, + "learning_rate": 3.7458545569254445e-05, + "loss": 0.84, + "step": 8465 + }, + { + "epoch": 0.72, + "grad_norm": 0.6124157905578613, + "learning_rate": 3.735506282557967e-05, + "loss": 0.7536, + "step": 8470 + }, + { + "epoch": 0.72, + "grad_norm": 0.7885011434555054, + "learning_rate": 3.725169038025016e-05, + "loss": 0.8933, + "step": 8475 + }, + { + "epoch": 0.72, + "grad_norm": 0.7972614765167236, + "learning_rate": 3.714842841527282e-05, + "loss": 0.8322, + "step": 8480 + }, + { + "epoch": 0.72, + "grad_norm": 0.7810702323913574, + "learning_rate": 3.7045277112459954e-05, + "loss": 0.8997, + "step": 8485 + }, + { + "epoch": 0.72, + "grad_norm": 0.7650167942047119, + "learning_rate": 3.694223665342915e-05, + "loss": 0.8807, + "step": 8490 + }, + { + "epoch": 0.72, + "grad_norm": 0.8026538491249084, + "learning_rate": 3.683930721960276e-05, + "loss": 0.8664, + "step": 8495 + }, + { + "epoch": 0.72, + "grad_norm": 0.9043586254119873, + "learning_rate": 3.6736488992207615e-05, + "loss": 0.8305, + "step": 8500 + }, + { + "epoch": 0.72, + "grad_norm": 0.6823323965072632, + "learning_rate": 3.663378215227483e-05, + "loss": 0.7567, + "step": 8505 + }, + { + "epoch": 0.72, + "grad_norm": 1.2210302352905273, + "learning_rate": 3.653118688063935e-05, + "loss": 0.9452, + "step": 8510 + }, + { + "epoch": 0.72, + "grad_norm": 0.7662010192871094, + "learning_rate": 3.6428703357939644e-05, + "loss": 0.9422, + "step": 8515 + }, + { + "epoch": 0.72, + "grad_norm": 0.7560374736785889, + "learning_rate": 3.632633176461755e-05, + "loss": 0.8106, + "step": 8520 + }, + { + "epoch": 0.72, + "grad_norm": 1.0334359407424927, + "learning_rate": 3.622407228091774e-05, + "loss": 0.93, + "step": 8525 + }, + { + "epoch": 0.72, + "grad_norm": 0.7211924195289612, + "learning_rate": 3.612192508688751e-05, + "loss": 0.8359, + "step": 8530 + }, + { + "epoch": 0.72, + "grad_norm": 0.7545579075813293, + "learning_rate": 3.601989036237644e-05, + "loss": 0.9147, + "step": 8535 + }, + { + "epoch": 0.72, + "grad_norm": 0.8421271443367004, + "learning_rate": 3.5917968287036104e-05, + "loss": 0.7863, + "step": 8540 + }, + { + "epoch": 0.72, + "grad_norm": 0.8393056392669678, + "learning_rate": 3.5816159040319716e-05, + "loss": 0.9036, + "step": 8545 + }, + { + "epoch": 0.72, + "grad_norm": 0.9179325699806213, + "learning_rate": 3.5714462801481895e-05, + "loss": 0.8731, + "step": 8550 + }, + { + "epoch": 0.72, + "grad_norm": 0.6770288348197937, + "learning_rate": 3.5612879749578244e-05, + "loss": 0.7885, + "step": 8555 + }, + { + "epoch": 0.72, + "grad_norm": 0.7247023582458496, + "learning_rate": 3.551141006346499e-05, + "loss": 0.9884, + "step": 8560 + }, + { + "epoch": 0.72, + "grad_norm": 0.8267160058021545, + "learning_rate": 3.5410053921798926e-05, + "loss": 0.7846, + "step": 8565 + }, + { + "epoch": 0.72, + "grad_norm": 0.8529921770095825, + "learning_rate": 3.530881150303679e-05, + "loss": 0.844, + "step": 8570 + }, + { + "epoch": 0.72, + "grad_norm": 0.7341428995132446, + "learning_rate": 3.5207682985435206e-05, + "loss": 0.912, + "step": 8575 + }, + { + "epoch": 0.72, + "grad_norm": 0.6659154891967773, + "learning_rate": 3.510666854705021e-05, + "loss": 0.9415, + "step": 8580 + }, + { + "epoch": 0.73, + "grad_norm": 0.7431322932243347, + "learning_rate": 3.5005768365736855e-05, + "loss": 0.8272, + "step": 8585 + }, + { + "epoch": 0.73, + "grad_norm": 0.9032961130142212, + "learning_rate": 3.490498261914923e-05, + "loss": 0.8756, + "step": 8590 + }, + { + "epoch": 0.73, + "grad_norm": 0.8175019025802612, + "learning_rate": 3.48043114847398e-05, + "loss": 0.9391, + "step": 8595 + }, + { + "epoch": 0.73, + "grad_norm": 0.667407751083374, + "learning_rate": 3.470375513975925e-05, + "loss": 0.8181, + "step": 8600 + }, + { + "epoch": 0.73, + "grad_norm": 0.7949620485305786, + "learning_rate": 3.460331376125624e-05, + "loss": 0.8459, + "step": 8605 + }, + { + "epoch": 0.73, + "grad_norm": 0.8054453134536743, + "learning_rate": 3.450298752607696e-05, + "loss": 0.9848, + "step": 8610 + }, + { + "epoch": 0.73, + "grad_norm": 0.7551806569099426, + "learning_rate": 3.440277661086475e-05, + "loss": 0.8592, + "step": 8615 + }, + { + "epoch": 0.73, + "grad_norm": 0.8027676343917847, + "learning_rate": 3.4302681192060114e-05, + "loss": 0.9916, + "step": 8620 + }, + { + "epoch": 0.73, + "grad_norm": 1.0340180397033691, + "learning_rate": 3.4202701445900085e-05, + "loss": 0.8464, + "step": 8625 + }, + { + "epoch": 0.73, + "grad_norm": 0.6809491515159607, + "learning_rate": 3.410283754841801e-05, + "loss": 0.8435, + "step": 8630 + }, + { + "epoch": 0.73, + "grad_norm": 0.7135958075523376, + "learning_rate": 3.40030896754434e-05, + "loss": 0.8993, + "step": 8635 + }, + { + "epoch": 0.73, + "grad_norm": 0.7542569041252136, + "learning_rate": 3.390345800260125e-05, + "loss": 0.8111, + "step": 8640 + }, + { + "epoch": 0.73, + "grad_norm": 0.9424663186073303, + "learning_rate": 3.380394270531221e-05, + "loss": 0.8995, + "step": 8645 + }, + { + "epoch": 0.73, + "grad_norm": 0.8005475401878357, + "learning_rate": 3.370454395879188e-05, + "loss": 0.8029, + "step": 8650 + }, + { + "epoch": 0.73, + "grad_norm": 0.6959512233734131, + "learning_rate": 3.360526193805065e-05, + "loss": 0.9035, + "step": 8655 + }, + { + "epoch": 0.73, + "grad_norm": 0.7621459364891052, + "learning_rate": 3.3506096817893526e-05, + "loss": 0.9833, + "step": 8660 + }, + { + "epoch": 0.73, + "grad_norm": 0.8192619681358337, + "learning_rate": 3.3407048772919514e-05, + "loss": 0.9324, + "step": 8665 + }, + { + "epoch": 0.73, + "grad_norm": 0.8644863963127136, + "learning_rate": 3.3308117977521544e-05, + "loss": 0.8136, + "step": 8670 + }, + { + "epoch": 0.73, + "grad_norm": 0.6707440614700317, + "learning_rate": 3.32093046058862e-05, + "loss": 0.845, + "step": 8675 + }, + { + "epoch": 0.73, + "grad_norm": 0.6646841764450073, + "learning_rate": 3.311060883199323e-05, + "loss": 0.907, + "step": 8680 + }, + { + "epoch": 0.73, + "grad_norm": 0.8453482985496521, + "learning_rate": 3.301203082961532e-05, + "loss": 0.8255, + "step": 8685 + }, + { + "epoch": 0.73, + "grad_norm": 0.9667664766311646, + "learning_rate": 3.291357077231781e-05, + "loss": 0.8206, + "step": 8690 + }, + { + "epoch": 0.73, + "grad_norm": 0.8043253421783447, + "learning_rate": 3.281522883345843e-05, + "loss": 0.7825, + "step": 8695 + }, + { + "epoch": 0.73, + "grad_norm": 0.9271131157875061, + "learning_rate": 3.271700518618683e-05, + "loss": 0.8616, + "step": 8700 + }, + { + "epoch": 0.74, + "grad_norm": 0.8352097272872925, + "learning_rate": 3.261890000344453e-05, + "loss": 0.9039, + "step": 8705 + }, + { + "epoch": 0.74, + "grad_norm": 0.8335424065589905, + "learning_rate": 3.252091345796432e-05, + "loss": 0.8746, + "step": 8710 + }, + { + "epoch": 0.74, + "grad_norm": 0.693943440914154, + "learning_rate": 3.2423045722270294e-05, + "loss": 0.8702, + "step": 8715 + }, + { + "epoch": 0.74, + "grad_norm": 0.8505630493164062, + "learning_rate": 3.232529696867712e-05, + "loss": 0.8461, + "step": 8720 + }, + { + "epoch": 0.74, + "grad_norm": 0.6869897842407227, + "learning_rate": 3.222766736929013e-05, + "loss": 0.7622, + "step": 8725 + }, + { + "epoch": 0.74, + "grad_norm": 0.82327800989151, + "learning_rate": 3.2130157096004864e-05, + "loss": 0.845, + "step": 8730 + }, + { + "epoch": 0.74, + "grad_norm": 0.6616914868354797, + "learning_rate": 3.203276632050671e-05, + "loss": 0.7665, + "step": 8735 + }, + { + "epoch": 0.74, + "grad_norm": 0.8684021830558777, + "learning_rate": 3.1935495214270705e-05, + "loss": 0.9294, + "step": 8740 + }, + { + "epoch": 0.74, + "grad_norm": 0.9236558675765991, + "learning_rate": 3.1838343948561136e-05, + "loss": 0.8502, + "step": 8745 + }, + { + "epoch": 0.74, + "grad_norm": 0.7163612246513367, + "learning_rate": 3.1741312694431315e-05, + "loss": 0.8604, + "step": 8750 + }, + { + "epoch": 0.74, + "grad_norm": 0.6481351256370544, + "learning_rate": 3.164440162272322e-05, + "loss": 0.9637, + "step": 8755 + }, + { + "epoch": 0.74, + "grad_norm": 0.8455091118812561, + "learning_rate": 3.1547610904067325e-05, + "loss": 0.7494, + "step": 8760 + }, + { + "epoch": 0.74, + "grad_norm": 0.9111529588699341, + "learning_rate": 3.145094070888208e-05, + "loss": 0.9955, + "step": 8765 + }, + { + "epoch": 0.74, + "grad_norm": 0.9329970479011536, + "learning_rate": 3.13543912073738e-05, + "loss": 0.9213, + "step": 8770 + }, + { + "epoch": 0.74, + "grad_norm": 0.8602619767189026, + "learning_rate": 3.125796256953625e-05, + "loss": 1.0095, + "step": 8775 + }, + { + "epoch": 0.74, + "grad_norm": 0.703950047492981, + "learning_rate": 3.1161654965150436e-05, + "loss": 0.7685, + "step": 8780 + }, + { + "epoch": 0.74, + "grad_norm": 0.7916018962860107, + "learning_rate": 3.1065468563784196e-05, + "loss": 0.9332, + "step": 8785 + }, + { + "epoch": 0.74, + "grad_norm": 0.8622072339057922, + "learning_rate": 3.096940353479208e-05, + "loss": 0.8634, + "step": 8790 + }, + { + "epoch": 0.74, + "grad_norm": 0.8001255989074707, + "learning_rate": 3.087346004731485e-05, + "loss": 0.9707, + "step": 8795 + }, + { + "epoch": 0.74, + "grad_norm": 0.8370776176452637, + "learning_rate": 3.077763827027929e-05, + "loss": 0.8443, + "step": 8800 + }, + { + "epoch": 0.74, + "grad_norm": 0.5743516683578491, + "learning_rate": 3.0681938372397865e-05, + "loss": 0.9002, + "step": 8805 + }, + { + "epoch": 0.74, + "grad_norm": 0.7769607901573181, + "learning_rate": 3.0586360522168476e-05, + "loss": 1.0137, + "step": 8810 + }, + { + "epoch": 0.74, + "grad_norm": 0.825899600982666, + "learning_rate": 3.0490904887874183e-05, + "loss": 0.9284, + "step": 8815 + }, + { + "epoch": 0.75, + "grad_norm": 0.7428541779518127, + "learning_rate": 3.039557163758279e-05, + "loss": 0.9896, + "step": 8820 + }, + { + "epoch": 0.75, + "grad_norm": 0.94400954246521, + "learning_rate": 3.030036093914663e-05, + "loss": 0.9276, + "step": 8825 + }, + { + "epoch": 0.75, + "grad_norm": 0.7103738188743591, + "learning_rate": 3.0205272960202292e-05, + "loss": 0.8383, + "step": 8830 + }, + { + "epoch": 0.75, + "grad_norm": 0.8435823321342468, + "learning_rate": 3.0110307868170263e-05, + "loss": 1.0178, + "step": 8835 + }, + { + "epoch": 0.75, + "grad_norm": 0.891716718673706, + "learning_rate": 3.0015465830254663e-05, + "loss": 0.9146, + "step": 8840 + }, + { + "epoch": 0.75, + "grad_norm": 0.7224240303039551, + "learning_rate": 2.9920747013443007e-05, + "loss": 0.9252, + "step": 8845 + }, + { + "epoch": 0.75, + "grad_norm": 0.8182008862495422, + "learning_rate": 2.98261515845058e-05, + "loss": 0.8569, + "step": 8850 + }, + { + "epoch": 0.75, + "grad_norm": 0.8286107182502747, + "learning_rate": 2.9731679709996306e-05, + "loss": 0.8813, + "step": 8855 + }, + { + "epoch": 0.75, + "grad_norm": 0.7946951985359192, + "learning_rate": 2.963733155625026e-05, + "loss": 0.7828, + "step": 8860 + }, + { + "epoch": 0.75, + "grad_norm": 0.7622092366218567, + "learning_rate": 2.954310728938553e-05, + "loss": 0.7407, + "step": 8865 + }, + { + "epoch": 0.75, + "grad_norm": 0.790444016456604, + "learning_rate": 2.944900707530195e-05, + "loss": 0.7441, + "step": 8870 + }, + { + "epoch": 0.75, + "grad_norm": 0.7887465357780457, + "learning_rate": 2.9355031079680827e-05, + "loss": 0.9499, + "step": 8875 + }, + { + "epoch": 0.75, + "grad_norm": 0.8154371976852417, + "learning_rate": 2.9261179467984822e-05, + "loss": 0.948, + "step": 8880 + }, + { + "epoch": 0.75, + "grad_norm": 0.8430317640304565, + "learning_rate": 2.9167452405457562e-05, + "loss": 0.8144, + "step": 8885 + }, + { + "epoch": 0.75, + "grad_norm": 0.72621750831604, + "learning_rate": 2.907385005712341e-05, + "loss": 0.8755, + "step": 8890 + }, + { + "epoch": 0.75, + "grad_norm": 0.8740549683570862, + "learning_rate": 2.8980372587787087e-05, + "loss": 0.8555, + "step": 8895 + }, + { + "epoch": 0.75, + "grad_norm": 0.6749491095542908, + "learning_rate": 2.888702016203354e-05, + "loss": 0.9325, + "step": 8900 + }, + { + "epoch": 0.75, + "grad_norm": 0.7589474320411682, + "learning_rate": 2.879379294422748e-05, + "loss": 0.7657, + "step": 8905 + }, + { + "epoch": 0.75, + "grad_norm": 0.8777940273284912, + "learning_rate": 2.8700691098513188e-05, + "loss": 0.855, + "step": 8910 + }, + { + "epoch": 0.75, + "grad_norm": 0.7081353664398193, + "learning_rate": 2.8607714788814176e-05, + "loss": 0.8862, + "step": 8915 + }, + { + "epoch": 0.75, + "grad_norm": 0.7634384036064148, + "learning_rate": 2.8514864178832967e-05, + "loss": 0.8565, + "step": 8920 + }, + { + "epoch": 0.75, + "grad_norm": 0.7505052089691162, + "learning_rate": 2.842213943205072e-05, + "loss": 0.8969, + "step": 8925 + }, + { + "epoch": 0.75, + "grad_norm": 0.8570383787155151, + "learning_rate": 2.8329540711727054e-05, + "loss": 0.8267, + "step": 8930 + }, + { + "epoch": 0.75, + "grad_norm": 0.9586119055747986, + "learning_rate": 2.823706818089965e-05, + "loss": 0.8691, + "step": 8935 + }, + { + "epoch": 0.76, + "grad_norm": 0.7504484057426453, + "learning_rate": 2.8144722002383993e-05, + "loss": 0.8925, + "step": 8940 + }, + { + "epoch": 0.76, + "grad_norm": 0.7530816197395325, + "learning_rate": 2.8052502338773146e-05, + "loss": 0.8073, + "step": 8945 + }, + { + "epoch": 0.76, + "grad_norm": 0.9067265391349792, + "learning_rate": 2.7960409352437333e-05, + "loss": 0.9125, + "step": 8950 + }, + { + "epoch": 0.76, + "grad_norm": 0.7983117699623108, + "learning_rate": 2.7868443205523888e-05, + "loss": 0.8934, + "step": 8955 + }, + { + "epoch": 0.76, + "grad_norm": 0.8042848706245422, + "learning_rate": 2.777660405995671e-05, + "loss": 0.7423, + "step": 8960 + }, + { + "epoch": 0.76, + "grad_norm": 0.6886625289916992, + "learning_rate": 2.768489207743603e-05, + "loss": 0.8258, + "step": 8965 + }, + { + "epoch": 0.76, + "grad_norm": 0.7667304873466492, + "learning_rate": 2.7593307419438354e-05, + "loss": 0.842, + "step": 8970 + }, + { + "epoch": 0.76, + "grad_norm": 0.7944421172142029, + "learning_rate": 2.7501850247215878e-05, + "loss": 0.8514, + "step": 8975 + }, + { + "epoch": 0.76, + "grad_norm": 0.8095190525054932, + "learning_rate": 2.741052072179636e-05, + "loss": 0.8196, + "step": 8980 + }, + { + "epoch": 0.76, + "grad_norm": 0.6305115818977356, + "learning_rate": 2.7319319003982925e-05, + "loss": 0.8992, + "step": 8985 + }, + { + "epoch": 0.76, + "grad_norm": 0.8835801482200623, + "learning_rate": 2.7228245254353444e-05, + "loss": 0.8358, + "step": 8990 + }, + { + "epoch": 0.76, + "grad_norm": 0.7960600256919861, + "learning_rate": 2.7137299633260638e-05, + "loss": 0.8111, + "step": 8995 + }, + { + "epoch": 0.76, + "grad_norm": 0.8967324495315552, + "learning_rate": 2.7046482300831642e-05, + "loss": 1.0048, + "step": 9000 + }, + { + "epoch": 0.76, + "grad_norm": 0.7498697638511658, + "learning_rate": 2.6955793416967646e-05, + "loss": 0.735, + "step": 9005 + }, + { + "epoch": 0.76, + "grad_norm": 0.8737277388572693, + "learning_rate": 2.686523314134367e-05, + "loss": 0.85, + "step": 9010 + }, + { + "epoch": 0.76, + "grad_norm": 0.8953958749771118, + "learning_rate": 2.6774801633408418e-05, + "loss": 0.886, + "step": 9015 + }, + { + "epoch": 0.76, + "grad_norm": 0.5604196190834045, + "learning_rate": 2.668449905238367e-05, + "loss": 0.8173, + "step": 9020 + }, + { + "epoch": 0.76, + "grad_norm": 0.7240638136863708, + "learning_rate": 2.659432555726441e-05, + "loss": 0.8973, + "step": 9025 + }, + { + "epoch": 0.76, + "grad_norm": 0.7774072885513306, + "learning_rate": 2.6504281306818225e-05, + "loss": 0.8, + "step": 9030 + }, + { + "epoch": 0.76, + "grad_norm": 0.9128009676933289, + "learning_rate": 2.641436645958515e-05, + "loss": 0.8173, + "step": 9035 + }, + { + "epoch": 0.76, + "grad_norm": 0.7718796730041504, + "learning_rate": 2.6324581173877473e-05, + "loss": 0.9214, + "step": 9040 + }, + { + "epoch": 0.76, + "grad_norm": 0.7290648818016052, + "learning_rate": 2.6234925607779215e-05, + "loss": 0.8464, + "step": 9045 + }, + { + "epoch": 0.76, + "grad_norm": 0.7356486320495605, + "learning_rate": 2.6145399919146086e-05, + "loss": 0.8361, + "step": 9050 + }, + { + "epoch": 0.76, + "grad_norm": 0.8046448826789856, + "learning_rate": 2.6056004265605148e-05, + "loss": 0.9615, + "step": 9055 + }, + { + "epoch": 0.77, + "grad_norm": 0.667546272277832, + "learning_rate": 2.596673880455448e-05, + "loss": 0.8548, + "step": 9060 + }, + { + "epoch": 0.77, + "grad_norm": 0.8502492904663086, + "learning_rate": 2.587760369316291e-05, + "loss": 0.8098, + "step": 9065 + }, + { + "epoch": 0.77, + "grad_norm": 0.8480196595191956, + "learning_rate": 2.578859908836979e-05, + "loss": 0.8643, + "step": 9070 + }, + { + "epoch": 0.77, + "grad_norm": 0.7749778628349304, + "learning_rate": 2.569972514688468e-05, + "loss": 0.8322, + "step": 9075 + }, + { + "epoch": 0.77, + "grad_norm": 0.7703043222427368, + "learning_rate": 2.5610982025187046e-05, + "loss": 0.8639, + "step": 9080 + }, + { + "epoch": 0.77, + "grad_norm": 0.9232176542282104, + "learning_rate": 2.552236987952612e-05, + "loss": 0.8019, + "step": 9085 + }, + { + "epoch": 0.77, + "grad_norm": 0.7729461193084717, + "learning_rate": 2.543388886592045e-05, + "loss": 1.0313, + "step": 9090 + }, + { + "epoch": 0.77, + "grad_norm": 0.7758802771568298, + "learning_rate": 2.5345539140157705e-05, + "loss": 0.812, + "step": 9095 + }, + { + "epoch": 0.77, + "grad_norm": 0.7040194272994995, + "learning_rate": 2.5257320857794397e-05, + "loss": 0.7539, + "step": 9100 + }, + { + "epoch": 0.77, + "grad_norm": 0.7888084650039673, + "learning_rate": 2.5169234174155608e-05, + "loss": 0.7946, + "step": 9105 + }, + { + "epoch": 0.77, + "grad_norm": 0.9183518886566162, + "learning_rate": 2.5081279244334764e-05, + "loss": 0.865, + "step": 9110 + }, + { + "epoch": 0.77, + "grad_norm": 0.7420482039451599, + "learning_rate": 2.4993456223193266e-05, + "loss": 0.9083, + "step": 9115 + }, + { + "epoch": 0.77, + "grad_norm": 0.9450098872184753, + "learning_rate": 2.490576526536025e-05, + "loss": 0.9594, + "step": 9120 + }, + { + "epoch": 0.77, + "grad_norm": 0.798826277256012, + "learning_rate": 2.4818206525232356e-05, + "loss": 0.8737, + "step": 9125 + }, + { + "epoch": 0.77, + "grad_norm": 0.8134889602661133, + "learning_rate": 2.4730780156973442e-05, + "loss": 0.9898, + "step": 9130 + }, + { + "epoch": 0.77, + "grad_norm": 0.8726826310157776, + "learning_rate": 2.464348631451424e-05, + "loss": 0.8145, + "step": 9135 + }, + { + "epoch": 0.77, + "grad_norm": 1.1030023097991943, + "learning_rate": 2.455632515155224e-05, + "loss": 0.8348, + "step": 9140 + }, + { + "epoch": 0.77, + "grad_norm": 0.6278375387191772, + "learning_rate": 2.4469296821551257e-05, + "loss": 0.7682, + "step": 9145 + }, + { + "epoch": 0.77, + "grad_norm": 0.7514384388923645, + "learning_rate": 2.4382401477741244e-05, + "loss": 0.9087, + "step": 9150 + }, + { + "epoch": 0.77, + "grad_norm": 1.000822901725769, + "learning_rate": 2.429563927311801e-05, + "loss": 1.0172, + "step": 9155 + }, + { + "epoch": 0.77, + "grad_norm": 0.7753348350524902, + "learning_rate": 2.4209010360442896e-05, + "loss": 0.9404, + "step": 9160 + }, + { + "epoch": 0.77, + "grad_norm": 0.7746703028678894, + "learning_rate": 2.4122514892242677e-05, + "loss": 0.818, + "step": 9165 + }, + { + "epoch": 0.77, + "grad_norm": 0.6258317232131958, + "learning_rate": 2.4036153020809072e-05, + "loss": 0.8354, + "step": 9170 + }, + { + "epoch": 0.78, + "grad_norm": 0.9577354192733765, + "learning_rate": 2.3949924898198604e-05, + "loss": 0.8962, + "step": 9175 + }, + { + "epoch": 0.78, + "grad_norm": 0.7608239650726318, + "learning_rate": 2.3863830676232313e-05, + "loss": 0.8366, + "step": 9180 + }, + { + "epoch": 0.78, + "grad_norm": 0.7837699055671692, + "learning_rate": 2.377787050649547e-05, + "loss": 0.9145, + "step": 9185 + }, + { + "epoch": 0.78, + "grad_norm": 0.7647613286972046, + "learning_rate": 2.36920445403373e-05, + "loss": 0.9256, + "step": 9190 + }, + { + "epoch": 0.78, + "grad_norm": 0.7220770716667175, + "learning_rate": 2.3606352928870835e-05, + "loss": 0.8174, + "step": 9195 + }, + { + "epoch": 0.78, + "grad_norm": 0.8374908566474915, + "learning_rate": 2.352079582297244e-05, + "loss": 0.939, + "step": 9200 + }, + { + "epoch": 0.78, + "grad_norm": 0.8317983150482178, + "learning_rate": 2.34353733732817e-05, + "loss": 0.8696, + "step": 9205 + }, + { + "epoch": 0.78, + "grad_norm": 0.8166177272796631, + "learning_rate": 2.335008573020111e-05, + "loss": 0.9865, + "step": 9210 + }, + { + "epoch": 0.78, + "grad_norm": 0.7623334527015686, + "learning_rate": 2.326493304389582e-05, + "loss": 0.9127, + "step": 9215 + }, + { + "epoch": 0.78, + "grad_norm": 0.8136252164840698, + "learning_rate": 2.3179915464293323e-05, + "loss": 0.8295, + "step": 9220 + }, + { + "epoch": 0.78, + "grad_norm": 0.9206348061561584, + "learning_rate": 2.309503314108331e-05, + "loss": 0.8645, + "step": 9225 + }, + { + "epoch": 0.78, + "grad_norm": 0.8517584800720215, + "learning_rate": 2.301028622371726e-05, + "loss": 0.8817, + "step": 9230 + }, + { + "epoch": 0.78, + "grad_norm": 0.7885266542434692, + "learning_rate": 2.2925674861408264e-05, + "loss": 0.8897, + "step": 9235 + }, + { + "epoch": 0.78, + "grad_norm": 0.8661707639694214, + "learning_rate": 2.2841199203130747e-05, + "loss": 0.9675, + "step": 9240 + }, + { + "epoch": 0.78, + "grad_norm": 0.8700675964355469, + "learning_rate": 2.2756859397620156e-05, + "loss": 0.9892, + "step": 9245 + }, + { + "epoch": 0.78, + "grad_norm": 0.7101337909698486, + "learning_rate": 2.267265559337286e-05, + "loss": 0.8835, + "step": 9250 + }, + { + "epoch": 0.78, + "grad_norm": 0.6243454217910767, + "learning_rate": 2.2588587938645656e-05, + "loss": 0.8324, + "step": 9255 + }, + { + "epoch": 0.78, + "grad_norm": 0.7496815323829651, + "learning_rate": 2.2504656581455665e-05, + "loss": 0.9837, + "step": 9260 + }, + { + "epoch": 0.78, + "grad_norm": 0.8095391392707825, + "learning_rate": 2.242086166958004e-05, + "loss": 0.8328, + "step": 9265 + }, + { + "epoch": 0.78, + "grad_norm": 0.9355162978172302, + "learning_rate": 2.233720335055567e-05, + "loss": 0.8582, + "step": 9270 + }, + { + "epoch": 0.78, + "grad_norm": 0.7414233684539795, + "learning_rate": 2.2253681771678946e-05, + "loss": 0.9332, + "step": 9275 + }, + { + "epoch": 0.78, + "grad_norm": 0.6823673844337463, + "learning_rate": 2.2170297080005564e-05, + "loss": 0.8942, + "step": 9280 + }, + { + "epoch": 0.78, + "grad_norm": 0.8288149833679199, + "learning_rate": 2.208704942235017e-05, + "loss": 0.768, + "step": 9285 + }, + { + "epoch": 0.78, + "grad_norm": 0.8060517311096191, + "learning_rate": 2.200393894528603e-05, + "loss": 0.79, + "step": 9290 + }, + { + "epoch": 0.79, + "grad_norm": 0.7582494020462036, + "learning_rate": 2.1920965795145054e-05, + "loss": 0.9492, + "step": 9295 + }, + { + "epoch": 0.79, + "grad_norm": 0.6934027075767517, + "learning_rate": 2.1838130118017252e-05, + "loss": 0.8737, + "step": 9300 + }, + { + "epoch": 0.79, + "grad_norm": 0.7684122323989868, + "learning_rate": 2.175543205975059e-05, + "loss": 0.8799, + "step": 9305 + }, + { + "epoch": 0.79, + "grad_norm": 0.6786194443702698, + "learning_rate": 2.1672871765950808e-05, + "loss": 0.8591, + "step": 9310 + }, + { + "epoch": 0.79, + "grad_norm": 0.9380123615264893, + "learning_rate": 2.1590449381980993e-05, + "loss": 0.9141, + "step": 9315 + }, + { + "epoch": 0.79, + "grad_norm": 0.7513065338134766, + "learning_rate": 2.150816505296147e-05, + "loss": 0.8632, + "step": 9320 + }, + { + "epoch": 0.79, + "grad_norm": 0.7867124676704407, + "learning_rate": 2.1426018923769464e-05, + "loss": 0.8689, + "step": 9325 + }, + { + "epoch": 0.79, + "grad_norm": 0.7759522199630737, + "learning_rate": 2.1344011139038843e-05, + "loss": 0.9019, + "step": 9330 + }, + { + "epoch": 0.79, + "grad_norm": 0.9177059531211853, + "learning_rate": 2.126214184316002e-05, + "loss": 0.8543, + "step": 9335 + }, + { + "epoch": 0.79, + "grad_norm": 0.7387694716453552, + "learning_rate": 2.1180411180279458e-05, + "loss": 0.9069, + "step": 9340 + }, + { + "epoch": 0.79, + "grad_norm": 0.7109723091125488, + "learning_rate": 2.1098819294299498e-05, + "loss": 0.9439, + "step": 9345 + }, + { + "epoch": 0.79, + "grad_norm": 0.6802003979682922, + "learning_rate": 2.101736632887825e-05, + "loss": 0.772, + "step": 9350 + }, + { + "epoch": 0.79, + "grad_norm": 0.7871384024620056, + "learning_rate": 2.0936052427429186e-05, + "loss": 0.8521, + "step": 9355 + }, + { + "epoch": 0.79, + "grad_norm": 0.7216570377349854, + "learning_rate": 2.085487773312086e-05, + "loss": 0.9107, + "step": 9360 + }, + { + "epoch": 0.79, + "grad_norm": 0.7913159728050232, + "learning_rate": 2.0773842388876884e-05, + "loss": 0.8631, + "step": 9365 + }, + { + "epoch": 0.79, + "grad_norm": 0.7265539765357971, + "learning_rate": 2.0692946537375336e-05, + "loss": 0.8232, + "step": 9370 + }, + { + "epoch": 0.79, + "grad_norm": 0.7547101378440857, + "learning_rate": 2.0612190321048762e-05, + "loss": 0.8404, + "step": 9375 + }, + { + "epoch": 0.79, + "grad_norm": 0.9315720796585083, + "learning_rate": 2.053157388208393e-05, + "loss": 0.8242, + "step": 9380 + }, + { + "epoch": 0.79, + "grad_norm": 0.8665992021560669, + "learning_rate": 2.0451097362421366e-05, + "loss": 0.9618, + "step": 9385 + }, + { + "epoch": 0.79, + "grad_norm": 0.8100541830062866, + "learning_rate": 2.037076090375539e-05, + "loss": 0.8903, + "step": 9390 + }, + { + "epoch": 0.79, + "grad_norm": 0.8045421838760376, + "learning_rate": 2.029056464753363e-05, + "loss": 0.9265, + "step": 9395 + }, + { + "epoch": 0.79, + "grad_norm": 0.6738159656524658, + "learning_rate": 2.021050873495679e-05, + "loss": 0.7856, + "step": 9400 + }, + { + "epoch": 0.79, + "grad_norm": 0.8198833465576172, + "learning_rate": 2.013059330697864e-05, + "loss": 0.9533, + "step": 9405 + }, + { + "epoch": 0.79, + "grad_norm": 0.853940486907959, + "learning_rate": 2.005081850430548e-05, + "loss": 0.836, + "step": 9410 + }, + { + "epoch": 0.8, + "grad_norm": 0.7123111486434937, + "learning_rate": 1.9971184467396022e-05, + "loss": 0.9173, + "step": 9415 + }, + { + "epoch": 0.8, + "grad_norm": 0.7735269069671631, + "learning_rate": 1.989169133646124e-05, + "loss": 0.8409, + "step": 9420 + }, + { + "epoch": 0.8, + "grad_norm": 0.7102484107017517, + "learning_rate": 1.981233925146385e-05, + "loss": 0.8676, + "step": 9425 + }, + { + "epoch": 0.8, + "grad_norm": 0.8679525852203369, + "learning_rate": 1.9733128352118324e-05, + "loss": 0.9388, + "step": 9430 + }, + { + "epoch": 0.8, + "grad_norm": 0.7913849949836731, + "learning_rate": 1.9654058777890573e-05, + "loss": 0.9927, + "step": 9435 + }, + { + "epoch": 0.8, + "grad_norm": 0.7318495512008667, + "learning_rate": 1.9575130667997643e-05, + "loss": 0.8995, + "step": 9440 + }, + { + "epoch": 0.8, + "grad_norm": 0.8687650561332703, + "learning_rate": 1.9496344161407487e-05, + "loss": 1.0084, + "step": 9445 + }, + { + "epoch": 0.8, + "grad_norm": 0.7607772350311279, + "learning_rate": 1.9417699396838764e-05, + "loss": 0.8664, + "step": 9450 + }, + { + "epoch": 0.8, + "grad_norm": 0.7554779052734375, + "learning_rate": 1.9339196512760538e-05, + "loss": 0.8402, + "step": 9455 + }, + { + "epoch": 0.8, + "grad_norm": 0.7785469889640808, + "learning_rate": 1.926083564739215e-05, + "loss": 0.8458, + "step": 9460 + }, + { + "epoch": 0.8, + "grad_norm": 0.7814673781394958, + "learning_rate": 1.9182616938702792e-05, + "loss": 0.858, + "step": 9465 + }, + { + "epoch": 0.8, + "grad_norm": 0.7215919494628906, + "learning_rate": 1.910454052441141e-05, + "loss": 0.883, + "step": 9470 + }, + { + "epoch": 0.8, + "grad_norm": 0.6685999035835266, + "learning_rate": 1.9026606541986393e-05, + "loss": 0.8643, + "step": 9475 + }, + { + "epoch": 0.8, + "grad_norm": 0.6879013776779175, + "learning_rate": 1.894881512864537e-05, + "loss": 0.8209, + "step": 9480 + }, + { + "epoch": 0.8, + "grad_norm": 0.6500614285469055, + "learning_rate": 1.8871166421354924e-05, + "loss": 0.8513, + "step": 9485 + }, + { + "epoch": 0.8, + "grad_norm": 0.8063387274742126, + "learning_rate": 1.879366055683044e-05, + "loss": 0.9353, + "step": 9490 + }, + { + "epoch": 0.8, + "grad_norm": 0.7360300421714783, + "learning_rate": 1.871629767153573e-05, + "loss": 0.9331, + "step": 9495 + }, + { + "epoch": 0.8, + "grad_norm": 0.8143420815467834, + "learning_rate": 1.863907790168289e-05, + "loss": 0.7772, + "step": 9500 + }, + { + "epoch": 0.8, + "grad_norm": 0.8035261034965515, + "learning_rate": 1.8562001383232043e-05, + "loss": 0.7737, + "step": 9505 + }, + { + "epoch": 0.8, + "grad_norm": 0.8917401432991028, + "learning_rate": 1.848506825189107e-05, + "loss": 0.8535, + "step": 9510 + }, + { + "epoch": 0.8, + "grad_norm": 0.862570583820343, + "learning_rate": 1.8408278643115384e-05, + "loss": 0.9848, + "step": 9515 + }, + { + "epoch": 0.8, + "grad_norm": 0.8407866358757019, + "learning_rate": 1.833163269210777e-05, + "loss": 0.8791, + "step": 9520 + }, + { + "epoch": 0.8, + "grad_norm": 0.6715019941329956, + "learning_rate": 1.825513053381801e-05, + "loss": 0.7603, + "step": 9525 + }, + { + "epoch": 0.81, + "grad_norm": 0.7662861943244934, + "learning_rate": 1.8178772302942705e-05, + "loss": 0.9243, + "step": 9530 + }, + { + "epoch": 0.81, + "grad_norm": 0.8278026580810547, + "learning_rate": 1.8102558133925084e-05, + "loss": 0.8764, + "step": 9535 + }, + { + "epoch": 0.81, + "grad_norm": 0.8157891035079956, + "learning_rate": 1.802648816095468e-05, + "loss": 0.8869, + "step": 9540 + }, + { + "epoch": 0.81, + "grad_norm": 0.718010663986206, + "learning_rate": 1.7950562517967217e-05, + "loss": 0.9088, + "step": 9545 + }, + { + "epoch": 0.81, + "grad_norm": 0.6439600586891174, + "learning_rate": 1.787478133864423e-05, + "loss": 0.7075, + "step": 9550 + }, + { + "epoch": 0.81, + "grad_norm": 0.823449969291687, + "learning_rate": 1.779914475641292e-05, + "loss": 0.957, + "step": 9555 + }, + { + "epoch": 0.81, + "grad_norm": 0.7597917318344116, + "learning_rate": 1.7723652904445907e-05, + "loss": 0.8781, + "step": 9560 + }, + { + "epoch": 0.81, + "grad_norm": 0.7375972867012024, + "learning_rate": 1.7648305915660968e-05, + "loss": 0.883, + "step": 9565 + }, + { + "epoch": 0.81, + "grad_norm": 0.7715105414390564, + "learning_rate": 1.75731039227208e-05, + "loss": 0.8056, + "step": 9570 + }, + { + "epoch": 0.81, + "grad_norm": 0.7677664756774902, + "learning_rate": 1.7498047058032896e-05, + "loss": 0.937, + "step": 9575 + }, + { + "epoch": 0.81, + "grad_norm": 0.8163573145866394, + "learning_rate": 1.742313545374914e-05, + "loss": 0.9381, + "step": 9580 + }, + { + "epoch": 0.81, + "grad_norm": 0.7601113319396973, + "learning_rate": 1.7348369241765683e-05, + "loss": 0.8025, + "step": 9585 + }, + { + "epoch": 0.81, + "grad_norm": 0.7613367438316345, + "learning_rate": 1.7273748553722668e-05, + "loss": 0.8191, + "step": 9590 + }, + { + "epoch": 0.81, + "grad_norm": 0.689087450504303, + "learning_rate": 1.7199273521004046e-05, + "loss": 0.8272, + "step": 9595 + }, + { + "epoch": 0.81, + "grad_norm": 0.8311107754707336, + "learning_rate": 1.7124944274737274e-05, + "loss": 1.0169, + "step": 9600 + }, + { + "epoch": 0.81, + "grad_norm": 0.8512225151062012, + "learning_rate": 1.7050760945793187e-05, + "loss": 0.9597, + "step": 9605 + }, + { + "epoch": 0.81, + "grad_norm": 0.7289577126502991, + "learning_rate": 1.6976723664785653e-05, + "loss": 0.7989, + "step": 9610 + }, + { + "epoch": 0.81, + "grad_norm": 0.7322782874107361, + "learning_rate": 1.6902832562071404e-05, + "loss": 0.7628, + "step": 9615 + }, + { + "epoch": 0.81, + "grad_norm": 0.895979642868042, + "learning_rate": 1.682908776774981e-05, + "loss": 0.816, + "step": 9620 + }, + { + "epoch": 0.81, + "grad_norm": 0.7974205017089844, + "learning_rate": 1.6755489411662595e-05, + "loss": 0.9064, + "step": 9625 + }, + { + "epoch": 0.81, + "grad_norm": 0.7401018142700195, + "learning_rate": 1.668203762339373e-05, + "loss": 0.7754, + "step": 9630 + }, + { + "epoch": 0.81, + "grad_norm": 0.812610387802124, + "learning_rate": 1.6608732532269077e-05, + "loss": 0.778, + "step": 9635 + }, + { + "epoch": 0.81, + "grad_norm": 0.7918111085891724, + "learning_rate": 1.6535574267356192e-05, + "loss": 0.9172, + "step": 9640 + }, + { + "epoch": 0.81, + "grad_norm": 0.7501040101051331, + "learning_rate": 1.6462562957464132e-05, + "loss": 0.8736, + "step": 9645 + }, + { + "epoch": 0.82, + "grad_norm": 0.7926919460296631, + "learning_rate": 1.6389698731143242e-05, + "loss": 0.8152, + "step": 9650 + }, + { + "epoch": 0.82, + "grad_norm": 0.7895399928092957, + "learning_rate": 1.631698171668483e-05, + "loss": 0.8256, + "step": 9655 + }, + { + "epoch": 0.82, + "grad_norm": 0.9749563336372375, + "learning_rate": 1.6244412042121105e-05, + "loss": 0.9638, + "step": 9660 + }, + { + "epoch": 0.82, + "grad_norm": 0.8345392346382141, + "learning_rate": 1.61719898352248e-05, + "loss": 0.9168, + "step": 9665 + }, + { + "epoch": 0.82, + "grad_norm": 0.9674035906791687, + "learning_rate": 1.6099715223508937e-05, + "loss": 0.9306, + "step": 9670 + }, + { + "epoch": 0.82, + "grad_norm": 0.8595812320709229, + "learning_rate": 1.6027588334226807e-05, + "loss": 0.9261, + "step": 9675 + }, + { + "epoch": 0.82, + "grad_norm": 0.8274486064910889, + "learning_rate": 1.59556092943715e-05, + "loss": 0.7433, + "step": 9680 + }, + { + "epoch": 0.82, + "grad_norm": 0.7736295461654663, + "learning_rate": 1.5883778230675862e-05, + "loss": 0.8638, + "step": 9685 + }, + { + "epoch": 0.82, + "grad_norm": 0.7375960350036621, + "learning_rate": 1.5812095269612136e-05, + "loss": 0.9223, + "step": 9690 + }, + { + "epoch": 0.82, + "grad_norm": 0.6871909499168396, + "learning_rate": 1.5740560537391858e-05, + "loss": 0.8969, + "step": 9695 + }, + { + "epoch": 0.82, + "grad_norm": 0.9287179708480835, + "learning_rate": 1.5669174159965517e-05, + "loss": 1.0913, + "step": 9700 + }, + { + "epoch": 0.82, + "grad_norm": 0.7640308737754822, + "learning_rate": 1.559793626302245e-05, + "loss": 0.7871, + "step": 9705 + }, + { + "epoch": 0.82, + "grad_norm": 0.9685657024383545, + "learning_rate": 1.5526846971990505e-05, + "loss": 0.7518, + "step": 9710 + }, + { + "epoch": 0.82, + "grad_norm": 0.8307138085365295, + "learning_rate": 1.545590641203599e-05, + "loss": 0.8653, + "step": 9715 + }, + { + "epoch": 0.82, + "grad_norm": 0.839181125164032, + "learning_rate": 1.5385114708063265e-05, + "loss": 0.8401, + "step": 9720 + }, + { + "epoch": 0.82, + "grad_norm": 0.729323148727417, + "learning_rate": 1.531447198471453e-05, + "loss": 0.8694, + "step": 9725 + }, + { + "epoch": 0.82, + "grad_norm": 0.8462833166122437, + "learning_rate": 1.5243978366369837e-05, + "loss": 0.8935, + "step": 9730 + }, + { + "epoch": 0.82, + "grad_norm": 0.8905854225158691, + "learning_rate": 1.5173633977146595e-05, + "loss": 0.8879, + "step": 9735 + }, + { + "epoch": 0.82, + "grad_norm": 0.7661150097846985, + "learning_rate": 1.5103438940899494e-05, + "loss": 0.8359, + "step": 9740 + }, + { + "epoch": 0.82, + "grad_norm": 0.8612456917762756, + "learning_rate": 1.5033393381220329e-05, + "loss": 0.8787, + "step": 9745 + }, + { + "epoch": 0.82, + "grad_norm": 0.8852258920669556, + "learning_rate": 1.4963497421437577e-05, + "loss": 1.0159, + "step": 9750 + }, + { + "epoch": 0.82, + "grad_norm": 0.8979580402374268, + "learning_rate": 1.48937511846164e-05, + "loss": 0.7945, + "step": 9755 + }, + { + "epoch": 0.82, + "grad_norm": 0.7696619033813477, + "learning_rate": 1.4824154793558375e-05, + "loss": 0.8359, + "step": 9760 + }, + { + "epoch": 0.82, + "grad_norm": 0.8621139526367188, + "learning_rate": 1.4754708370801151e-05, + "loss": 0.9384, + "step": 9765 + }, + { + "epoch": 0.83, + "grad_norm": 0.7909301519393921, + "learning_rate": 1.4685412038618473e-05, + "loss": 0.7223, + "step": 9770 + }, + { + "epoch": 0.83, + "grad_norm": 0.8770456314086914, + "learning_rate": 1.4616265919019645e-05, + "loss": 0.7893, + "step": 9775 + }, + { + "epoch": 0.83, + "grad_norm": 0.7560000419616699, + "learning_rate": 1.454727013374959e-05, + "loss": 0.8974, + "step": 9780 + }, + { + "epoch": 0.83, + "grad_norm": 0.8642236590385437, + "learning_rate": 1.4478424804288582e-05, + "loss": 0.9128, + "step": 9785 + }, + { + "epoch": 0.83, + "grad_norm": 0.9646701216697693, + "learning_rate": 1.440973005185191e-05, + "loss": 0.8999, + "step": 9790 + }, + { + "epoch": 0.83, + "grad_norm": 0.790399432182312, + "learning_rate": 1.434118599738975e-05, + "loss": 0.8739, + "step": 9795 + }, + { + "epoch": 0.83, + "grad_norm": 0.8744881749153137, + "learning_rate": 1.427279276158704e-05, + "loss": 1.0425, + "step": 9800 + }, + { + "epoch": 0.83, + "grad_norm": 0.7287259697914124, + "learning_rate": 1.4204550464863021e-05, + "loss": 0.9056, + "step": 9805 + }, + { + "epoch": 0.83, + "grad_norm": 0.8026889562606812, + "learning_rate": 1.4136459227371269e-05, + "loss": 0.9259, + "step": 9810 + }, + { + "epoch": 0.83, + "grad_norm": 0.8606771230697632, + "learning_rate": 1.4068519168999405e-05, + "loss": 0.8604, + "step": 9815 + }, + { + "epoch": 0.83, + "grad_norm": 0.8293012976646423, + "learning_rate": 1.4000730409368845e-05, + "loss": 0.8262, + "step": 9820 + }, + { + "epoch": 0.83, + "grad_norm": 1.039216160774231, + "learning_rate": 1.3933093067834601e-05, + "loss": 0.9428, + "step": 9825 + }, + { + "epoch": 0.83, + "grad_norm": 0.8637626767158508, + "learning_rate": 1.3865607263485091e-05, + "loss": 0.9074, + "step": 9830 + }, + { + "epoch": 0.83, + "grad_norm": 1.1418802738189697, + "learning_rate": 1.3798273115141912e-05, + "loss": 0.8704, + "step": 9835 + }, + { + "epoch": 0.83, + "grad_norm": 0.7517408728599548, + "learning_rate": 1.373109074135972e-05, + "loss": 0.8379, + "step": 9840 + }, + { + "epoch": 0.83, + "grad_norm": 0.9394586682319641, + "learning_rate": 1.3664060260425827e-05, + "loss": 0.8223, + "step": 9845 + }, + { + "epoch": 0.83, + "grad_norm": 0.8001432418823242, + "learning_rate": 1.359718179036019e-05, + "loss": 0.8278, + "step": 9850 + }, + { + "epoch": 0.83, + "grad_norm": 0.7592292428016663, + "learning_rate": 1.353045544891508e-05, + "loss": 0.9601, + "step": 9855 + }, + { + "epoch": 0.83, + "grad_norm": 0.752763569355011, + "learning_rate": 1.3463881353574947e-05, + "loss": 0.7738, + "step": 9860 + }, + { + "epoch": 0.83, + "grad_norm": 0.8289079666137695, + "learning_rate": 1.339745962155613e-05, + "loss": 0.7855, + "step": 9865 + }, + { + "epoch": 0.83, + "grad_norm": 0.8047125339508057, + "learning_rate": 1.33311903698068e-05, + "loss": 0.8015, + "step": 9870 + }, + { + "epoch": 0.83, + "grad_norm": 0.701745331287384, + "learning_rate": 1.326507371500656e-05, + "loss": 0.9886, + "step": 9875 + }, + { + "epoch": 0.83, + "grad_norm": 0.7644436955451965, + "learning_rate": 1.3199109773566387e-05, + "loss": 0.9931, + "step": 9880 + }, + { + "epoch": 0.84, + "grad_norm": 0.7744791507720947, + "learning_rate": 1.3133298661628368e-05, + "loss": 0.8211, + "step": 9885 + }, + { + "epoch": 0.84, + "grad_norm": 0.7323466539382935, + "learning_rate": 1.3067640495065492e-05, + "loss": 0.8264, + "step": 9890 + }, + { + "epoch": 0.84, + "grad_norm": 1.008718729019165, + "learning_rate": 1.3002135389481451e-05, + "loss": 0.9723, + "step": 9895 + }, + { + "epoch": 0.84, + "grad_norm": 0.8565289378166199, + "learning_rate": 1.29367834602105e-05, + "loss": 0.8958, + "step": 9900 + }, + { + "epoch": 0.84, + "grad_norm": 0.6596556901931763, + "learning_rate": 1.2871584822317151e-05, + "loss": 0.8743, + "step": 9905 + }, + { + "epoch": 0.84, + "grad_norm": 0.7944501638412476, + "learning_rate": 1.2806539590596023e-05, + "loss": 0.8661, + "step": 9910 + }, + { + "epoch": 0.84, + "grad_norm": 0.8090604543685913, + "learning_rate": 1.2741647879571627e-05, + "loss": 0.9232, + "step": 9915 + }, + { + "epoch": 0.84, + "grad_norm": 0.7964836955070496, + "learning_rate": 1.2676909803498161e-05, + "loss": 0.9486, + "step": 9920 + }, + { + "epoch": 0.84, + "grad_norm": 1.0404587984085083, + "learning_rate": 1.2612325476359388e-05, + "loss": 1.0251, + "step": 9925 + }, + { + "epoch": 0.84, + "grad_norm": 0.8059197068214417, + "learning_rate": 1.2547895011868304e-05, + "loss": 0.7066, + "step": 9930 + }, + { + "epoch": 0.84, + "grad_norm": 0.7185767292976379, + "learning_rate": 1.2483618523467e-05, + "loss": 0.8999, + "step": 9935 + }, + { + "epoch": 0.84, + "grad_norm": 0.7250891327857971, + "learning_rate": 1.241949612432649e-05, + "loss": 0.898, + "step": 9940 + }, + { + "epoch": 0.84, + "grad_norm": 0.7541928887367249, + "learning_rate": 1.2355527927346478e-05, + "loss": 0.8629, + "step": 9945 + }, + { + "epoch": 0.84, + "grad_norm": 1.006060004234314, + "learning_rate": 1.229171404515511e-05, + "loss": 0.9575, + "step": 9950 + }, + { + "epoch": 0.84, + "grad_norm": 0.737918496131897, + "learning_rate": 1.2228054590108962e-05, + "loss": 0.9269, + "step": 9955 + }, + { + "epoch": 0.84, + "grad_norm": 0.8843221068382263, + "learning_rate": 1.2164549674292581e-05, + "loss": 0.9613, + "step": 9960 + }, + { + "epoch": 0.84, + "grad_norm": 0.7975506782531738, + "learning_rate": 1.2101199409518483e-05, + "loss": 0.8365, + "step": 9965 + }, + { + "epoch": 0.84, + "grad_norm": 0.9914053678512573, + "learning_rate": 1.2038003907326867e-05, + "loss": 0.8665, + "step": 9970 + }, + { + "epoch": 0.84, + "grad_norm": 0.678312361240387, + "learning_rate": 1.1974963278985463e-05, + "loss": 0.8762, + "step": 9975 + }, + { + "epoch": 0.84, + "grad_norm": 0.8215805292129517, + "learning_rate": 1.1912077635489282e-05, + "loss": 0.957, + "step": 9980 + }, + { + "epoch": 0.84, + "grad_norm": 0.8267842531204224, + "learning_rate": 1.1849347087560525e-05, + "loss": 0.9234, + "step": 9985 + }, + { + "epoch": 0.84, + "grad_norm": 0.7704806923866272, + "learning_rate": 1.1786771745648229e-05, + "loss": 0.8277, + "step": 9990 + }, + { + "epoch": 0.84, + "grad_norm": 0.9222090244293213, + "learning_rate": 1.1724351719928228e-05, + "loss": 0.8743, + "step": 9995 + }, + { + "epoch": 0.84, + "grad_norm": 0.7938253879547119, + "learning_rate": 1.1662087120302867e-05, + "loss": 0.8359, + "step": 10000 + }, + { + "epoch": 0.85, + "grad_norm": 0.8336005210876465, + "learning_rate": 1.1599978056400796e-05, + "loss": 0.8933, + "step": 10005 + }, + { + "epoch": 0.85, + "grad_norm": 0.847213864326477, + "learning_rate": 1.1538024637576905e-05, + "loss": 0.9159, + "step": 10010 + }, + { + "epoch": 0.85, + "grad_norm": 0.7625183463096619, + "learning_rate": 1.1476226972911974e-05, + "loss": 0.8969, + "step": 10015 + }, + { + "epoch": 0.85, + "grad_norm": 0.8129417300224304, + "learning_rate": 1.1414585171212555e-05, + "loss": 0.8956, + "step": 10020 + }, + { + "epoch": 0.85, + "grad_norm": 0.7780897617340088, + "learning_rate": 1.1353099341010786e-05, + "loss": 0.8503, + "step": 10025 + }, + { + "epoch": 0.85, + "grad_norm": 0.6232269406318665, + "learning_rate": 1.1291769590564182e-05, + "loss": 0.8895, + "step": 10030 + }, + { + "epoch": 0.85, + "grad_norm": 0.7388679385185242, + "learning_rate": 1.1230596027855434e-05, + "loss": 0.862, + "step": 10035 + }, + { + "epoch": 0.85, + "grad_norm": 0.9394962787628174, + "learning_rate": 1.1169578760592292e-05, + "loss": 0.9012, + "step": 10040 + }, + { + "epoch": 0.85, + "grad_norm": 1.0249824523925781, + "learning_rate": 1.1108717896207276e-05, + "loss": 0.9584, + "step": 10045 + }, + { + "epoch": 0.85, + "grad_norm": 0.6979790329933167, + "learning_rate": 1.1048013541857472e-05, + "loss": 0.8473, + "step": 10050 + }, + { + "epoch": 0.85, + "grad_norm": 0.8743270039558411, + "learning_rate": 1.0987465804424512e-05, + "loss": 0.8449, + "step": 10055 + }, + { + "epoch": 0.85, + "grad_norm": 0.7989283800125122, + "learning_rate": 1.0927074790514203e-05, + "loss": 0.8792, + "step": 10060 + }, + { + "epoch": 0.85, + "grad_norm": 0.7869884371757507, + "learning_rate": 1.0866840606456452e-05, + "loss": 1.0367, + "step": 10065 + }, + { + "epoch": 0.85, + "grad_norm": 0.9411396384239197, + "learning_rate": 1.0806763358305005e-05, + "loss": 0.7985, + "step": 10070 + }, + { + "epoch": 0.85, + "grad_norm": 0.7178574204444885, + "learning_rate": 1.074684315183727e-05, + "loss": 0.8868, + "step": 10075 + }, + { + "epoch": 0.85, + "grad_norm": 0.6789056062698364, + "learning_rate": 1.0687080092554225e-05, + "loss": 0.8484, + "step": 10080 + }, + { + "epoch": 0.85, + "grad_norm": 1.0155320167541504, + "learning_rate": 1.0627474285680105e-05, + "loss": 0.8272, + "step": 10085 + }, + { + "epoch": 0.85, + "grad_norm": 0.7251559495925903, + "learning_rate": 1.0568025836162265e-05, + "loss": 0.8303, + "step": 10090 + }, + { + "epoch": 0.85, + "grad_norm": 0.6140643358230591, + "learning_rate": 1.0508734848671064e-05, + "loss": 0.7968, + "step": 10095 + }, + { + "epoch": 0.85, + "grad_norm": 0.6832136511802673, + "learning_rate": 1.0449601427599588e-05, + "loss": 0.8966, + "step": 10100 + }, + { + "epoch": 0.85, + "grad_norm": 0.7631263732910156, + "learning_rate": 1.0390625677063415e-05, + "loss": 0.8881, + "step": 10105 + }, + { + "epoch": 0.85, + "grad_norm": 0.9187348484992981, + "learning_rate": 1.0331807700900664e-05, + "loss": 0.9736, + "step": 10110 + }, + { + "epoch": 0.85, + "grad_norm": 0.8617235422134399, + "learning_rate": 1.0273147602671562e-05, + "loss": 0.9088, + "step": 10115 + }, + { + "epoch": 0.85, + "grad_norm": 0.6641935110092163, + "learning_rate": 1.0214645485658358e-05, + "loss": 0.8002, + "step": 10120 + }, + { + "epoch": 0.86, + "grad_norm": 0.7935595512390137, + "learning_rate": 1.0156301452865246e-05, + "loss": 0.7914, + "step": 10125 + }, + { + "epoch": 0.86, + "grad_norm": 0.9449029564857483, + "learning_rate": 1.0098115607017922e-05, + "loss": 0.7394, + "step": 10130 + }, + { + "epoch": 0.86, + "grad_norm": 0.8909382224082947, + "learning_rate": 1.00400880505637e-05, + "loss": 0.7768, + "step": 10135 + }, + { + "epoch": 0.86, + "grad_norm": 0.7915505766868591, + "learning_rate": 9.982218885671158e-06, + "loss": 0.8899, + "step": 10140 + }, + { + "epoch": 0.86, + "grad_norm": 0.7059157490730286, + "learning_rate": 9.924508214229933e-06, + "loss": 0.8906, + "step": 10145 + }, + { + "epoch": 0.86, + "grad_norm": 0.8084311485290527, + "learning_rate": 9.866956137850736e-06, + "loss": 0.8016, + "step": 10150 + }, + { + "epoch": 0.86, + "grad_norm": 0.8688581585884094, + "learning_rate": 9.809562757864887e-06, + "loss": 0.9325, + "step": 10155 + }, + { + "epoch": 0.86, + "grad_norm": 0.7226961851119995, + "learning_rate": 9.752328175324366e-06, + "loss": 0.7376, + "step": 10160 + }, + { + "epoch": 0.86, + "grad_norm": 0.9018610119819641, + "learning_rate": 9.695252491001617e-06, + "loss": 0.9814, + "step": 10165 + }, + { + "epoch": 0.86, + "grad_norm": 0.6705411672592163, + "learning_rate": 9.638335805389209e-06, + "loss": 0.9008, + "step": 10170 + }, + { + "epoch": 0.86, + "grad_norm": 0.798916220664978, + "learning_rate": 9.581578218699805e-06, + "loss": 0.969, + "step": 10175 + }, + { + "epoch": 0.86, + "grad_norm": 0.748852550983429, + "learning_rate": 9.524979830865999e-06, + "loss": 0.8754, + "step": 10180 + }, + { + "epoch": 0.86, + "grad_norm": 0.7891843914985657, + "learning_rate": 9.468540741539988e-06, + "loss": 0.844, + "step": 10185 + }, + { + "epoch": 0.86, + "grad_norm": 0.7447881102561951, + "learning_rate": 9.41226105009353e-06, + "loss": 0.8591, + "step": 10190 + }, + { + "epoch": 0.86, + "grad_norm": 0.8944937586784363, + "learning_rate": 9.356140855617778e-06, + "loss": 0.8099, + "step": 10195 + }, + { + "epoch": 0.86, + "grad_norm": 0.8635166883468628, + "learning_rate": 9.30018025692302e-06, + "loss": 0.8817, + "step": 10200 + }, + { + "epoch": 0.86, + "grad_norm": 0.7399436831474304, + "learning_rate": 9.244379352538535e-06, + "loss": 0.8345, + "step": 10205 + }, + { + "epoch": 0.86, + "grad_norm": 0.8072450757026672, + "learning_rate": 9.188738240712447e-06, + "loss": 0.8074, + "step": 10210 + }, + { + "epoch": 0.86, + "grad_norm": 0.7312385439872742, + "learning_rate": 9.133257019411524e-06, + "loss": 0.7803, + "step": 10215 + }, + { + "epoch": 0.86, + "grad_norm": 0.90677410364151, + "learning_rate": 9.077935786321045e-06, + "loss": 0.8638, + "step": 10220 + }, + { + "epoch": 0.86, + "grad_norm": 0.8683536648750305, + "learning_rate": 9.022774638844588e-06, + "loss": 0.9492, + "step": 10225 + }, + { + "epoch": 0.86, + "grad_norm": 0.7896186113357544, + "learning_rate": 8.96777367410383e-06, + "loss": 0.9377, + "step": 10230 + }, + { + "epoch": 0.86, + "grad_norm": 0.7445749640464783, + "learning_rate": 8.912932988938472e-06, + "loss": 0.7302, + "step": 10235 + }, + { + "epoch": 0.87, + "grad_norm": 0.7277476191520691, + "learning_rate": 8.858252679905966e-06, + "loss": 0.7507, + "step": 10240 + }, + { + "epoch": 0.87, + "grad_norm": 0.8207641243934631, + "learning_rate": 8.803732843281409e-06, + "loss": 0.8242, + "step": 10245 + }, + { + "epoch": 0.87, + "grad_norm": 0.8514395952224731, + "learning_rate": 8.749373575057384e-06, + "loss": 1.0068, + "step": 10250 + }, + { + "epoch": 0.87, + "grad_norm": 0.8273018598556519, + "learning_rate": 8.695174970943732e-06, + "loss": 0.927, + "step": 10255 + }, + { + "epoch": 0.87, + "grad_norm": 0.7913432121276855, + "learning_rate": 8.641137126367416e-06, + "loss": 0.8793, + "step": 10260 + }, + { + "epoch": 0.87, + "grad_norm": 0.8091998100280762, + "learning_rate": 8.587260136472353e-06, + "loss": 0.8632, + "step": 10265 + }, + { + "epoch": 0.87, + "grad_norm": 0.8871718645095825, + "learning_rate": 8.53354409611924e-06, + "loss": 0.7877, + "step": 10270 + }, + { + "epoch": 0.87, + "grad_norm": 0.8100157976150513, + "learning_rate": 8.479989099885388e-06, + "loss": 0.8755, + "step": 10275 + }, + { + "epoch": 0.87, + "grad_norm": 0.8143556118011475, + "learning_rate": 8.426595242064606e-06, + "loss": 0.9769, + "step": 10280 + }, + { + "epoch": 0.87, + "grad_norm": 0.8220342993736267, + "learning_rate": 8.373362616666936e-06, + "loss": 1.0094, + "step": 10285 + }, + { + "epoch": 0.87, + "grad_norm": 0.774535596370697, + "learning_rate": 8.320291317418549e-06, + "loss": 0.9654, + "step": 10290 + }, + { + "epoch": 0.87, + "grad_norm": 0.7471094727516174, + "learning_rate": 8.26738143776159e-06, + "loss": 0.9107, + "step": 10295 + }, + { + "epoch": 0.87, + "grad_norm": 0.7431105375289917, + "learning_rate": 8.214633070853938e-06, + "loss": 0.8253, + "step": 10300 + }, + { + "epoch": 0.87, + "grad_norm": 0.7525473237037659, + "learning_rate": 8.162046309569205e-06, + "loss": 0.7647, + "step": 10305 + }, + { + "epoch": 0.87, + "grad_norm": 0.8467829823493958, + "learning_rate": 8.109621246496368e-06, + "loss": 0.9366, + "step": 10310 + }, + { + "epoch": 0.87, + "grad_norm": 0.852018415927887, + "learning_rate": 8.057357973939727e-06, + "loss": 0.9855, + "step": 10315 + }, + { + "epoch": 0.87, + "grad_norm": 0.9554034471511841, + "learning_rate": 8.005256583918763e-06, + "loss": 1.0396, + "step": 10320 + }, + { + "epoch": 0.87, + "grad_norm": 0.7318937182426453, + "learning_rate": 7.953317168167862e-06, + "loss": 0.8658, + "step": 10325 + }, + { + "epoch": 0.87, + "grad_norm": 0.7739142179489136, + "learning_rate": 7.901539818136261e-06, + "loss": 0.8594, + "step": 10330 + }, + { + "epoch": 0.87, + "grad_norm": 0.8797909617424011, + "learning_rate": 7.849924624987881e-06, + "loss": 1.0571, + "step": 10335 + }, + { + "epoch": 0.87, + "grad_norm": 0.8140009045600891, + "learning_rate": 7.798471679601082e-06, + "loss": 0.8524, + "step": 10340 + }, + { + "epoch": 0.87, + "grad_norm": 0.8369308114051819, + "learning_rate": 7.747181072568576e-06, + "loss": 0.8995, + "step": 10345 + }, + { + "epoch": 0.87, + "grad_norm": 0.7931288480758667, + "learning_rate": 7.696052894197247e-06, + "loss": 0.7742, + "step": 10350 + }, + { + "epoch": 0.87, + "grad_norm": 0.7234024405479431, + "learning_rate": 7.645087234507975e-06, + "loss": 0.9128, + "step": 10355 + }, + { + "epoch": 0.88, + "grad_norm": 0.7736112475395203, + "learning_rate": 7.594284183235556e-06, + "loss": 0.8915, + "step": 10360 + }, + { + "epoch": 0.88, + "grad_norm": 0.9592683911323547, + "learning_rate": 7.543643829828406e-06, + "loss": 0.9485, + "step": 10365 + }, + { + "epoch": 0.88, + "grad_norm": 0.9321366548538208, + "learning_rate": 7.493166263448515e-06, + "loss": 0.9154, + "step": 10370 + }, + { + "epoch": 0.88, + "grad_norm": 0.7174406051635742, + "learning_rate": 7.442851572971265e-06, + "loss": 0.924, + "step": 10375 + }, + { + "epoch": 0.88, + "grad_norm": 0.8121606707572937, + "learning_rate": 7.392699846985263e-06, + "loss": 0.8669, + "step": 10380 + }, + { + "epoch": 0.88, + "grad_norm": 0.7170925736427307, + "learning_rate": 7.342711173792127e-06, + "loss": 0.8111, + "step": 10385 + }, + { + "epoch": 0.88, + "grad_norm": 0.7287190556526184, + "learning_rate": 7.2928856414064996e-06, + "loss": 0.8309, + "step": 10390 + }, + { + "epoch": 0.88, + "grad_norm": 0.7759504914283752, + "learning_rate": 7.243223337555693e-06, + "loss": 0.9299, + "step": 10395 + }, + { + "epoch": 0.88, + "grad_norm": 0.7308063507080078, + "learning_rate": 7.193724349679654e-06, + "loss": 0.8408, + "step": 10400 + }, + { + "epoch": 0.88, + "grad_norm": 0.8811477422714233, + "learning_rate": 7.144388764930788e-06, + "loss": 0.9797, + "step": 10405 + }, + { + "epoch": 0.88, + "grad_norm": 0.7261849641799927, + "learning_rate": 7.095216670173776e-06, + "loss": 0.9345, + "step": 10410 + }, + { + "epoch": 0.88, + "grad_norm": 0.6926263570785522, + "learning_rate": 7.046208151985456e-06, + "loss": 0.7932, + "step": 10415 + }, + { + "epoch": 0.88, + "grad_norm": 0.7097862958908081, + "learning_rate": 6.997363296654691e-06, + "loss": 0.8278, + "step": 10420 + }, + { + "epoch": 0.88, + "grad_norm": 0.8528007864952087, + "learning_rate": 6.9486821901821435e-06, + "loss": 0.7973, + "step": 10425 + }, + { + "epoch": 0.88, + "grad_norm": 0.8164514899253845, + "learning_rate": 6.900164918280128e-06, + "loss": 1.0044, + "step": 10430 + }, + { + "epoch": 0.88, + "grad_norm": 0.8210064768791199, + "learning_rate": 6.851811566372601e-06, + "loss": 0.8487, + "step": 10435 + }, + { + "epoch": 0.88, + "grad_norm": 0.8617835640907288, + "learning_rate": 6.8036222195948075e-06, + "loss": 0.8885, + "step": 10440 + }, + { + "epoch": 0.88, + "grad_norm": 0.9348113536834717, + "learning_rate": 6.755596962793309e-06, + "loss": 0.8039, + "step": 10445 + }, + { + "epoch": 0.88, + "grad_norm": 0.8518204689025879, + "learning_rate": 6.707735880525723e-06, + "loss": 0.8798, + "step": 10450 + }, + { + "epoch": 0.88, + "grad_norm": 0.6845257878303528, + "learning_rate": 6.660039057060552e-06, + "loss": 0.8902, + "step": 10455 + }, + { + "epoch": 0.88, + "grad_norm": 0.6822859048843384, + "learning_rate": 6.612506576377175e-06, + "loss": 0.9559, + "step": 10460 + }, + { + "epoch": 0.88, + "grad_norm": 0.8563945889472961, + "learning_rate": 6.565138522165581e-06, + "loss": 0.8785, + "step": 10465 + }, + { + "epoch": 0.88, + "grad_norm": 0.7450621724128723, + "learning_rate": 6.517934977826223e-06, + "loss": 0.7979, + "step": 10470 + }, + { + "epoch": 0.88, + "grad_norm": 0.9026947021484375, + "learning_rate": 6.4708960264699745e-06, + "loss": 0.9681, + "step": 10475 + }, + { + "epoch": 0.89, + "grad_norm": 0.8436296582221985, + "learning_rate": 6.424021750917864e-06, + "loss": 0.9227, + "step": 10480 + }, + { + "epoch": 0.89, + "grad_norm": 0.8324162364006042, + "learning_rate": 6.377312233700938e-06, + "loss": 0.8586, + "step": 10485 + }, + { + "epoch": 0.89, + "grad_norm": 0.7005321979522705, + "learning_rate": 6.3307675570602354e-06, + "loss": 0.8165, + "step": 10490 + }, + { + "epoch": 0.89, + "grad_norm": 0.7949735522270203, + "learning_rate": 6.284387802946534e-06, + "loss": 0.8962, + "step": 10495 + }, + { + "epoch": 0.89, + "grad_norm": 0.8014736175537109, + "learning_rate": 6.238173053020191e-06, + "loss": 0.8234, + "step": 10500 + }, + { + "epoch": 0.89, + "grad_norm": 0.8914954662322998, + "learning_rate": 6.192123388651128e-06, + "loss": 0.8268, + "step": 10505 + }, + { + "epoch": 0.89, + "grad_norm": 0.8167476654052734, + "learning_rate": 6.146238890918488e-06, + "loss": 0.8591, + "step": 10510 + }, + { + "epoch": 0.89, + "grad_norm": 0.7341588139533997, + "learning_rate": 6.100519640610725e-06, + "loss": 0.841, + "step": 10515 + }, + { + "epoch": 0.89, + "grad_norm": 0.6818287968635559, + "learning_rate": 6.054965718225258e-06, + "loss": 0.7915, + "step": 10520 + }, + { + "epoch": 0.89, + "grad_norm": 0.7893515825271606, + "learning_rate": 6.009577203968453e-06, + "loss": 0.8643, + "step": 10525 + }, + { + "epoch": 0.89, + "grad_norm": 0.8433229923248291, + "learning_rate": 5.964354177755449e-06, + "loss": 0.8659, + "step": 10530 + }, + { + "epoch": 0.89, + "grad_norm": 0.8310180902481079, + "learning_rate": 5.919296719209988e-06, + "loss": 0.8752, + "step": 10535 + }, + { + "epoch": 0.89, + "grad_norm": 0.6668066382408142, + "learning_rate": 5.874404907664277e-06, + "loss": 0.812, + "step": 10540 + }, + { + "epoch": 0.89, + "grad_norm": 0.7034140229225159, + "learning_rate": 5.8296788221589575e-06, + "loss": 0.8203, + "step": 10545 + }, + { + "epoch": 0.89, + "grad_norm": 0.8013192415237427, + "learning_rate": 5.785118541442791e-06, + "loss": 0.8858, + "step": 10550 + }, + { + "epoch": 0.89, + "grad_norm": 0.8860305547714233, + "learning_rate": 5.740724143972642e-06, + "loss": 0.8153, + "step": 10555 + }, + { + "epoch": 0.89, + "grad_norm": 0.8523777723312378, + "learning_rate": 5.6964957079133186e-06, + "loss": 0.8841, + "step": 10560 + }, + { + "epoch": 0.89, + "grad_norm": 0.8978105187416077, + "learning_rate": 5.652433311137384e-06, + "loss": 0.8173, + "step": 10565 + }, + { + "epoch": 0.89, + "grad_norm": 0.8412472605705261, + "learning_rate": 5.608537031225092e-06, + "loss": 0.8595, + "step": 10570 + }, + { + "epoch": 0.89, + "grad_norm": 0.767458975315094, + "learning_rate": 5.564806945464218e-06, + "loss": 0.7613, + "step": 10575 + }, + { + "epoch": 0.89, + "grad_norm": 0.7909321784973145, + "learning_rate": 5.521243130849873e-06, + "loss": 0.9713, + "step": 10580 + }, + { + "epoch": 0.89, + "grad_norm": 0.975814700126648, + "learning_rate": 5.4778456640845135e-06, + "loss": 0.8448, + "step": 10585 + }, + { + "epoch": 0.89, + "grad_norm": 0.8089463710784912, + "learning_rate": 5.434614621577594e-06, + "loss": 0.8023, + "step": 10590 + }, + { + "epoch": 0.89, + "grad_norm": 0.876457691192627, + "learning_rate": 5.391550079445606e-06, + "loss": 0.961, + "step": 10595 + }, + { + "epoch": 0.9, + "grad_norm": 0.852300763130188, + "learning_rate": 5.348652113511898e-06, + "loss": 0.8673, + "step": 10600 + }, + { + "epoch": 0.9, + "grad_norm": 0.857282280921936, + "learning_rate": 5.305920799306496e-06, + "loss": 0.7921, + "step": 10605 + }, + { + "epoch": 0.9, + "grad_norm": 0.715785026550293, + "learning_rate": 5.263356212066028e-06, + "loss": 0.8206, + "step": 10610 + }, + { + "epoch": 0.9, + "grad_norm": 0.8727372884750366, + "learning_rate": 5.220958426733558e-06, + "loss": 0.9199, + "step": 10615 + }, + { + "epoch": 0.9, + "grad_norm": 0.9024982452392578, + "learning_rate": 5.178727517958459e-06, + "loss": 0.9166, + "step": 10620 + }, + { + "epoch": 0.9, + "grad_norm": 0.7073826789855957, + "learning_rate": 5.136663560096277e-06, + "loss": 0.7656, + "step": 10625 + }, + { + "epoch": 0.9, + "grad_norm": 0.7534424662590027, + "learning_rate": 5.094766627208647e-06, + "loss": 0.8346, + "step": 10630 + }, + { + "epoch": 0.9, + "grad_norm": 0.9219887256622314, + "learning_rate": 5.053036793063093e-06, + "loss": 0.935, + "step": 10635 + }, + { + "epoch": 0.9, + "grad_norm": 0.8320545554161072, + "learning_rate": 5.011474131132931e-06, + "loss": 0.8014, + "step": 10640 + }, + { + "epoch": 0.9, + "grad_norm": 0.6234966516494751, + "learning_rate": 4.970078714597149e-06, + "loss": 0.8109, + "step": 10645 + }, + { + "epoch": 0.9, + "grad_norm": 0.7838702201843262, + "learning_rate": 4.928850616340252e-06, + "loss": 0.9262, + "step": 10650 + }, + { + "epoch": 0.9, + "grad_norm": 0.717758297920227, + "learning_rate": 4.887789908952178e-06, + "loss": 0.9535, + "step": 10655 + }, + { + "epoch": 0.9, + "grad_norm": 0.8258304595947266, + "learning_rate": 4.846896664728118e-06, + "loss": 0.8687, + "step": 10660 + }, + { + "epoch": 0.9, + "grad_norm": 0.7627803087234497, + "learning_rate": 4.806170955668421e-06, + "loss": 0.7835, + "step": 10665 + }, + { + "epoch": 0.9, + "grad_norm": 0.8373433947563171, + "learning_rate": 4.765612853478451e-06, + "loss": 0.8576, + "step": 10670 + }, + { + "epoch": 0.9, + "grad_norm": 1.0411616563796997, + "learning_rate": 4.725222429568477e-06, + "loss": 0.8517, + "step": 10675 + }, + { + "epoch": 0.9, + "grad_norm": 0.8107260465621948, + "learning_rate": 4.68499975505351e-06, + "loss": 0.9144, + "step": 10680 + }, + { + "epoch": 0.9, + "grad_norm": 0.9633600115776062, + "learning_rate": 4.644944900753278e-06, + "loss": 0.9668, + "step": 10685 + }, + { + "epoch": 0.9, + "grad_norm": 0.7689321041107178, + "learning_rate": 4.605057937191947e-06, + "loss": 1.0446, + "step": 10690 + }, + { + "epoch": 0.9, + "grad_norm": 0.7285904884338379, + "learning_rate": 4.565338934598129e-06, + "loss": 0.9144, + "step": 10695 + }, + { + "epoch": 0.9, + "grad_norm": 0.8424079418182373, + "learning_rate": 4.525787962904682e-06, + "loss": 0.7431, + "step": 10700 + }, + { + "epoch": 0.9, + "grad_norm": 0.9980275630950928, + "learning_rate": 4.4864050917486355e-06, + "loss": 0.972, + "step": 10705 + }, + { + "epoch": 0.9, + "grad_norm": 0.8323859572410583, + "learning_rate": 4.447190390471024e-06, + "loss": 0.9221, + "step": 10710 + }, + { + "epoch": 0.91, + "grad_norm": 0.8923182487487793, + "learning_rate": 4.408143928116815e-06, + "loss": 0.9211, + "step": 10715 + }, + { + "epoch": 0.91, + "grad_norm": 0.8041788935661316, + "learning_rate": 4.369265773434739e-06, + "loss": 0.7976, + "step": 10720 + }, + { + "epoch": 0.91, + "grad_norm": 0.7583939433097839, + "learning_rate": 4.330555994877195e-06, + "loss": 0.8283, + "step": 10725 + }, + { + "epoch": 0.91, + "grad_norm": 0.8751514554023743, + "learning_rate": 4.292014660600119e-06, + "loss": 0.8757, + "step": 10730 + }, + { + "epoch": 0.91, + "grad_norm": 0.8682853579521179, + "learning_rate": 4.253641838462852e-06, + "loss": 0.9022, + "step": 10735 + }, + { + "epoch": 0.91, + "grad_norm": 0.7690489292144775, + "learning_rate": 4.2154375960280935e-06, + "loss": 0.8395, + "step": 10740 + }, + { + "epoch": 0.91, + "grad_norm": 0.8148819208145142, + "learning_rate": 4.17740200056167e-06, + "loss": 0.9871, + "step": 10745 + }, + { + "epoch": 0.91, + "grad_norm": 0.759973406791687, + "learning_rate": 4.139535119032501e-06, + "loss": 0.8988, + "step": 10750 + }, + { + "epoch": 0.91, + "grad_norm": 0.7451571822166443, + "learning_rate": 4.1018370181124424e-06, + "loss": 0.9156, + "step": 10755 + }, + { + "epoch": 0.91, + "grad_norm": 0.6766397356987, + "learning_rate": 4.064307764176168e-06, + "loss": 0.8459, + "step": 10760 + }, + { + "epoch": 0.91, + "grad_norm": 0.6828172206878662, + "learning_rate": 4.0269474233010865e-06, + "loss": 0.8727, + "step": 10765 + }, + { + "epoch": 0.91, + "grad_norm": 0.8449336886405945, + "learning_rate": 3.9897560612672136e-06, + "loss": 0.83, + "step": 10770 + }, + { + "epoch": 0.91, + "grad_norm": 0.9624890089035034, + "learning_rate": 3.9527337435570025e-06, + "loss": 0.8118, + "step": 10775 + }, + { + "epoch": 0.91, + "grad_norm": 0.9531586170196533, + "learning_rate": 3.915880535355298e-06, + "loss": 0.9792, + "step": 10780 + }, + { + "epoch": 0.91, + "grad_norm": 1.0197491645812988, + "learning_rate": 3.879196501549209e-06, + "loss": 0.8994, + "step": 10785 + }, + { + "epoch": 0.91, + "grad_norm": 0.8006296157836914, + "learning_rate": 3.842681706727957e-06, + "loss": 0.9336, + "step": 10790 + }, + { + "epoch": 0.91, + "grad_norm": 0.7368683218955994, + "learning_rate": 3.806336215182782e-06, + "loss": 0.7437, + "step": 10795 + }, + { + "epoch": 0.91, + "grad_norm": 0.7810891270637512, + "learning_rate": 3.7701600909068714e-06, + "loss": 0.9383, + "step": 10800 + }, + { + "epoch": 0.91, + "grad_norm": 0.8878799080848694, + "learning_rate": 3.734153397595164e-06, + "loss": 0.8256, + "step": 10805 + }, + { + "epoch": 0.91, + "grad_norm": 0.8740106225013733, + "learning_rate": 3.6983161986443027e-06, + "loss": 0.8673, + "step": 10810 + }, + { + "epoch": 0.91, + "grad_norm": 0.9431246519088745, + "learning_rate": 3.662648557152515e-06, + "loss": 0.8343, + "step": 10815 + }, + { + "epoch": 0.91, + "grad_norm": 0.7177392840385437, + "learning_rate": 3.6271505359194547e-06, + "loss": 0.8108, + "step": 10820 + }, + { + "epoch": 0.91, + "grad_norm": 0.8011447191238403, + "learning_rate": 3.591822197446182e-06, + "loss": 0.8757, + "step": 10825 + }, + { + "epoch": 0.91, + "grad_norm": 0.8898225426673889, + "learning_rate": 3.556663603934951e-06, + "loss": 0.8689, + "step": 10830 + }, + { + "epoch": 0.92, + "grad_norm": 0.8089674115180969, + "learning_rate": 3.5216748172891446e-06, + "loss": 0.9874, + "step": 10835 + }, + { + "epoch": 0.92, + "grad_norm": 0.8062359690666199, + "learning_rate": 3.486855899113217e-06, + "loss": 0.8446, + "step": 10840 + }, + { + "epoch": 0.92, + "grad_norm": 0.7981634140014648, + "learning_rate": 3.4522069107124966e-06, + "loss": 0.871, + "step": 10845 + }, + { + "epoch": 0.92, + "grad_norm": 0.6979275345802307, + "learning_rate": 3.4177279130931163e-06, + "loss": 0.8024, + "step": 10850 + }, + { + "epoch": 0.92, + "grad_norm": 1.0290021896362305, + "learning_rate": 3.3834189669619377e-06, + "loss": 0.8975, + "step": 10855 + }, + { + "epoch": 0.92, + "grad_norm": 0.7928755879402161, + "learning_rate": 3.3492801327263843e-06, + "loss": 0.8094, + "step": 10860 + }, + { + "epoch": 0.92, + "grad_norm": 0.7724294662475586, + "learning_rate": 3.3153114704943756e-06, + "loss": 1.0105, + "step": 10865 + }, + { + "epoch": 0.92, + "grad_norm": 1.1200708150863647, + "learning_rate": 3.2815130400742133e-06, + "loss": 0.8083, + "step": 10870 + }, + { + "epoch": 0.92, + "grad_norm": 0.9042993187904358, + "learning_rate": 3.247884900974474e-06, + "loss": 0.7872, + "step": 10875 + }, + { + "epoch": 0.92, + "grad_norm": 0.6799546480178833, + "learning_rate": 3.214427112403906e-06, + "loss": 0.8935, + "step": 10880 + }, + { + "epoch": 0.92, + "grad_norm": 1.0057941675186157, + "learning_rate": 3.181139733271332e-06, + "loss": 0.8105, + "step": 10885 + }, + { + "epoch": 0.92, + "grad_norm": 0.8763043880462646, + "learning_rate": 3.1480228221854923e-06, + "loss": 0.7507, + "step": 10890 + }, + { + "epoch": 0.92, + "grad_norm": 0.6605707406997681, + "learning_rate": 3.1150764374550443e-06, + "loss": 0.8297, + "step": 10895 + }, + { + "epoch": 0.92, + "grad_norm": 0.8191012740135193, + "learning_rate": 3.0823006370883534e-06, + "loss": 0.8562, + "step": 10900 + }, + { + "epoch": 0.92, + "grad_norm": 0.8007928729057312, + "learning_rate": 3.0496954787934684e-06, + "loss": 0.8344, + "step": 10905 + }, + { + "epoch": 0.92, + "grad_norm": 0.7685137987136841, + "learning_rate": 3.0172610199780017e-06, + "loss": 0.7817, + "step": 10910 + }, + { + "epoch": 0.92, + "grad_norm": 0.9101439714431763, + "learning_rate": 2.984997317748972e-06, + "loss": 0.926, + "step": 10915 + }, + { + "epoch": 0.92, + "grad_norm": 0.7492097020149231, + "learning_rate": 2.9529044289127726e-06, + "loss": 0.6755, + "step": 10920 + }, + { + "epoch": 0.92, + "grad_norm": 0.9658187031745911, + "learning_rate": 2.9209824099750595e-06, + "loss": 0.8649, + "step": 10925 + }, + { + "epoch": 0.92, + "grad_norm": 0.8430820107460022, + "learning_rate": 2.889231317140617e-06, + "loss": 0.9128, + "step": 10930 + }, + { + "epoch": 0.92, + "grad_norm": 0.8035411834716797, + "learning_rate": 2.857651206313305e-06, + "loss": 0.8344, + "step": 10935 + }, + { + "epoch": 0.92, + "grad_norm": 0.7803851366043091, + "learning_rate": 2.8262421330959244e-06, + "loss": 0.9662, + "step": 10940 + }, + { + "epoch": 0.92, + "grad_norm": 0.8594342470169067, + "learning_rate": 2.795004152790115e-06, + "loss": 0.8618, + "step": 10945 + }, + { + "epoch": 0.92, + "grad_norm": 0.937749445438385, + "learning_rate": 2.7639373203963036e-06, + "loss": 0.9277, + "step": 10950 + }, + { + "epoch": 0.93, + "grad_norm": 0.787311315536499, + "learning_rate": 2.7330416906135582e-06, + "loss": 0.8844, + "step": 10955 + }, + { + "epoch": 0.93, + "grad_norm": 0.7901485562324524, + "learning_rate": 2.702317317839531e-06, + "loss": 0.7633, + "step": 10960 + }, + { + "epoch": 0.93, + "grad_norm": 0.7819672226905823, + "learning_rate": 2.6717642561703505e-06, + "loss": 0.8706, + "step": 10965 + }, + { + "epoch": 0.93, + "grad_norm": 0.719790518283844, + "learning_rate": 2.6413825594004625e-06, + "loss": 0.8467, + "step": 10970 + }, + { + "epoch": 0.93, + "grad_norm": 0.7518222332000732, + "learning_rate": 2.611172281022645e-06, + "loss": 0.807, + "step": 10975 + }, + { + "epoch": 0.93, + "grad_norm": 0.837917149066925, + "learning_rate": 2.5811334742278593e-06, + "loss": 0.9382, + "step": 10980 + }, + { + "epoch": 0.93, + "grad_norm": 0.7309818267822266, + "learning_rate": 2.551266191905133e-06, + "loss": 0.8529, + "step": 10985 + }, + { + "epoch": 0.93, + "grad_norm": 0.7382307648658752, + "learning_rate": 2.5215704866415224e-06, + "loss": 0.709, + "step": 10990 + }, + { + "epoch": 0.93, + "grad_norm": 0.8184390068054199, + "learning_rate": 2.492046410721971e-06, + "loss": 0.8557, + "step": 10995 + }, + { + "epoch": 0.93, + "grad_norm": 0.9037685394287109, + "learning_rate": 2.4626940161292187e-06, + "loss": 1.1225, + "step": 11000 + }, + { + "epoch": 0.93, + "grad_norm": 0.8128904104232788, + "learning_rate": 2.4335133545437596e-06, + "loss": 0.8341, + "step": 11005 + }, + { + "epoch": 0.93, + "grad_norm": 0.7361746430397034, + "learning_rate": 2.4045044773437163e-06, + "loss": 0.8179, + "step": 11010 + }, + { + "epoch": 0.93, + "grad_norm": 0.8453391194343567, + "learning_rate": 2.3756674356047338e-06, + "loss": 0.8738, + "step": 11015 + }, + { + "epoch": 0.93, + "grad_norm": 0.8270261287689209, + "learning_rate": 2.3470022800999193e-06, + "loss": 0.8572, + "step": 11020 + }, + { + "epoch": 0.93, + "grad_norm": 0.7221106290817261, + "learning_rate": 2.318509061299745e-06, + "loss": 0.8055, + "step": 11025 + }, + { + "epoch": 0.93, + "grad_norm": 0.8942533731460571, + "learning_rate": 2.2901878293719257e-06, + "loss": 0.933, + "step": 11030 + }, + { + "epoch": 0.93, + "grad_norm": 0.7586672306060791, + "learning_rate": 2.2620386341814182e-06, + "loss": 0.8081, + "step": 11035 + }, + { + "epoch": 0.93, + "grad_norm": 0.9604291915893555, + "learning_rate": 2.234061525290232e-06, + "loss": 0.8651, + "step": 11040 + }, + { + "epoch": 0.93, + "grad_norm": 0.8790186047554016, + "learning_rate": 2.2062565519573865e-06, + "loss": 0.9874, + "step": 11045 + }, + { + "epoch": 0.93, + "grad_norm": 0.8522576093673706, + "learning_rate": 2.1786237631388428e-06, + "loss": 0.9064, + "step": 11050 + }, + { + "epoch": 0.93, + "grad_norm": 0.6957898139953613, + "learning_rate": 2.1511632074873835e-06, + "loss": 0.8559, + "step": 11055 + }, + { + "epoch": 0.93, + "grad_norm": 1.2120394706726074, + "learning_rate": 2.1238749333525543e-06, + "loss": 0.8321, + "step": 11060 + }, + { + "epoch": 0.93, + "grad_norm": 0.7582048177719116, + "learning_rate": 2.096758988780556e-06, + "loss": 0.9462, + "step": 11065 + }, + { + "epoch": 0.94, + "grad_norm": 0.7362944483757019, + "learning_rate": 2.069815421514176e-06, + "loss": 0.7978, + "step": 11070 + }, + { + "epoch": 0.94, + "grad_norm": 0.7996507287025452, + "learning_rate": 2.0430442789927007e-06, + "loss": 0.8505, + "step": 11075 + }, + { + "epoch": 0.94, + "grad_norm": 0.8071786165237427, + "learning_rate": 2.0164456083518246e-06, + "loss": 0.8272, + "step": 11080 + }, + { + "epoch": 0.94, + "grad_norm": 0.9571120738983154, + "learning_rate": 1.990019456423564e-06, + "loss": 0.8708, + "step": 11085 + }, + { + "epoch": 0.94, + "grad_norm": 0.9792050719261169, + "learning_rate": 1.9637658697362003e-06, + "loss": 0.9641, + "step": 11090 + }, + { + "epoch": 0.94, + "grad_norm": 0.6688560247421265, + "learning_rate": 1.93768489451418e-06, + "loss": 0.7361, + "step": 11095 + }, + { + "epoch": 0.94, + "grad_norm": 0.6988593339920044, + "learning_rate": 1.911776576678015e-06, + "loss": 0.806, + "step": 11100 + }, + { + "epoch": 0.94, + "grad_norm": 0.8763333559036255, + "learning_rate": 1.8860409618442488e-06, + "loss": 0.8753, + "step": 11105 + }, + { + "epoch": 0.94, + "grad_norm": 0.8303632736206055, + "learning_rate": 1.8604780953253353e-06, + "loss": 0.8662, + "step": 11110 + }, + { + "epoch": 0.94, + "grad_norm": 0.8031095862388611, + "learning_rate": 1.8350880221295496e-06, + "loss": 0.7915, + "step": 11115 + }, + { + "epoch": 0.94, + "grad_norm": 0.7835865616798401, + "learning_rate": 1.8098707869609654e-06, + "loss": 0.8461, + "step": 11120 + }, + { + "epoch": 0.94, + "grad_norm": 0.7007308006286621, + "learning_rate": 1.7848264342193333e-06, + "loss": 1.0948, + "step": 11125 + }, + { + "epoch": 0.94, + "grad_norm": 0.8703559041023254, + "learning_rate": 1.7599550080000027e-06, + "loss": 0.8635, + "step": 11130 + }, + { + "epoch": 0.94, + "grad_norm": 0.87107914686203, + "learning_rate": 1.7352565520938558e-06, + "loss": 0.8839, + "step": 11135 + }, + { + "epoch": 0.94, + "grad_norm": 0.6888677477836609, + "learning_rate": 1.7107311099872403e-06, + "loss": 0.858, + "step": 11140 + }, + { + "epoch": 0.94, + "grad_norm": 0.7759958505630493, + "learning_rate": 1.6863787248618367e-06, + "loss": 1.0134, + "step": 11145 + }, + { + "epoch": 0.94, + "grad_norm": 0.7978636026382446, + "learning_rate": 1.6621994395946916e-06, + "loss": 0.8704, + "step": 11150 + }, + { + "epoch": 0.94, + "grad_norm": 0.716876208782196, + "learning_rate": 1.6381932967580505e-06, + "loss": 0.9225, + "step": 11155 + }, + { + "epoch": 0.94, + "grad_norm": 0.7568153142929077, + "learning_rate": 1.6143603386192474e-06, + "loss": 0.8241, + "step": 11160 + }, + { + "epoch": 0.94, + "grad_norm": 0.8619349598884583, + "learning_rate": 1.5907006071408049e-06, + "loss": 0.9491, + "step": 11165 + }, + { + "epoch": 0.94, + "grad_norm": 0.7976325750350952, + "learning_rate": 1.5672141439801446e-06, + "loss": 0.8012, + "step": 11170 + }, + { + "epoch": 0.94, + "grad_norm": 0.7966151237487793, + "learning_rate": 1.5439009904896773e-06, + "loss": 0.9116, + "step": 11175 + }, + { + "epoch": 0.94, + "grad_norm": 0.7527498602867126, + "learning_rate": 1.5207611877166573e-06, + "loss": 0.853, + "step": 11180 + }, + { + "epoch": 0.94, + "grad_norm": 0.7213166356086731, + "learning_rate": 1.4977947764031053e-06, + "loss": 0.7906, + "step": 11185 + }, + { + "epoch": 0.95, + "grad_norm": 0.8000203371047974, + "learning_rate": 1.4750017969857643e-06, + "loss": 0.805, + "step": 11190 + }, + { + "epoch": 0.95, + "grad_norm": 0.8495139479637146, + "learning_rate": 1.4523822895960216e-06, + "loss": 0.7696, + "step": 11195 + }, + { + "epoch": 0.95, + "grad_norm": 0.8969099521636963, + "learning_rate": 1.4299362940598194e-06, + "loss": 0.8998, + "step": 11200 + }, + { + "epoch": 0.95, + "grad_norm": 0.8963848948478699, + "learning_rate": 1.4076638498976113e-06, + "loss": 0.8154, + "step": 11205 + }, + { + "epoch": 0.95, + "grad_norm": 0.816973090171814, + "learning_rate": 1.3855649963242957e-06, + "loss": 0.8815, + "step": 11210 + }, + { + "epoch": 0.95, + "grad_norm": 0.6730329990386963, + "learning_rate": 1.3636397722490813e-06, + "loss": 0.8261, + "step": 11215 + }, + { + "epoch": 0.95, + "grad_norm": 0.7854148745536804, + "learning_rate": 1.3418882162755219e-06, + "loss": 0.8484, + "step": 11220 + }, + { + "epoch": 0.95, + "grad_norm": 0.8766478300094604, + "learning_rate": 1.3203103667013827e-06, + "loss": 0.9359, + "step": 11225 + }, + { + "epoch": 0.95, + "grad_norm": 0.9398655891418457, + "learning_rate": 1.298906261518551e-06, + "loss": 0.716, + "step": 11230 + }, + { + "epoch": 0.95, + "grad_norm": 0.9365181922912598, + "learning_rate": 1.2776759384130698e-06, + "loss": 0.9044, + "step": 11235 + }, + { + "epoch": 0.95, + "grad_norm": 0.782059371471405, + "learning_rate": 1.2566194347649385e-06, + "loss": 0.9254, + "step": 11240 + }, + { + "epoch": 0.95, + "grad_norm": 0.7425671815872192, + "learning_rate": 1.2357367876481452e-06, + "loss": 0.8266, + "step": 11245 + }, + { + "epoch": 0.95, + "grad_norm": 0.8430447578430176, + "learning_rate": 1.2150280338305787e-06, + "loss": 0.8314, + "step": 11250 + }, + { + "epoch": 0.95, + "grad_norm": 0.7275057435035706, + "learning_rate": 1.194493209773928e-06, + "loss": 0.7711, + "step": 11255 + }, + { + "epoch": 0.95, + "grad_norm": 0.758685827255249, + "learning_rate": 1.1741323516336832e-06, + "loss": 0.8514, + "step": 11260 + }, + { + "epoch": 0.95, + "grad_norm": 0.6766619086265564, + "learning_rate": 1.1539454952590123e-06, + "loss": 0.835, + "step": 11265 + }, + { + "epoch": 0.95, + "grad_norm": 0.7155946493148804, + "learning_rate": 1.133932676192695e-06, + "loss": 0.8194, + "step": 11270 + }, + { + "epoch": 0.95, + "grad_norm": 0.7995068430900574, + "learning_rate": 1.114093929671145e-06, + "loss": 0.9196, + "step": 11275 + }, + { + "epoch": 0.95, + "grad_norm": 0.7955594062805176, + "learning_rate": 1.0944292906242326e-06, + "loss": 0.8761, + "step": 11280 + }, + { + "epoch": 0.95, + "grad_norm": 0.7788894772529602, + "learning_rate": 1.0749387936753064e-06, + "loss": 0.9241, + "step": 11285 + }, + { + "epoch": 0.95, + "grad_norm": 0.790854275226593, + "learning_rate": 1.0556224731411157e-06, + "loss": 0.8612, + "step": 11290 + }, + { + "epoch": 0.95, + "grad_norm": 0.7776708602905273, + "learning_rate": 1.0364803630316887e-06, + "loss": 0.8391, + "step": 11295 + }, + { + "epoch": 0.95, + "grad_norm": 0.8150311708450317, + "learning_rate": 1.017512497050377e-06, + "loss": 0.7578, + "step": 11300 + }, + { + "epoch": 0.95, + "grad_norm": 0.6092957854270935, + "learning_rate": 9.98718908593732e-07, + "loss": 0.8103, + "step": 11305 + }, + { + "epoch": 0.96, + "grad_norm": 0.9293496012687683, + "learning_rate": 9.8009963075143e-07, + "loss": 0.8924, + "step": 11310 + }, + { + "epoch": 0.96, + "grad_norm": 0.9524035453796387, + "learning_rate": 9.61654696306258e-07, + "loss": 0.8824, + "step": 11315 + }, + { + "epoch": 0.96, + "grad_norm": 0.8459351062774658, + "learning_rate": 9.43384137734038e-07, + "loss": 0.8594, + "step": 11320 + }, + { + "epoch": 0.96, + "grad_norm": 0.8590577840805054, + "learning_rate": 9.252879872035713e-07, + "loss": 0.7377, + "step": 11325 + }, + { + "epoch": 0.96, + "grad_norm": 0.7393367290496826, + "learning_rate": 9.073662765765823e-07, + "loss": 0.7557, + "step": 11330 + }, + { + "epoch": 0.96, + "grad_norm": 0.9165522456169128, + "learning_rate": 8.896190374076518e-07, + "loss": 0.9628, + "step": 11335 + }, + { + "epoch": 0.96, + "grad_norm": 0.8148025870323181, + "learning_rate": 8.720463009441626e-07, + "loss": 0.8027, + "step": 11340 + }, + { + "epoch": 0.96, + "grad_norm": 0.8349050879478455, + "learning_rate": 8.546480981262872e-07, + "loss": 0.9549, + "step": 11345 + }, + { + "epoch": 0.96, + "grad_norm": 0.8241772651672363, + "learning_rate": 8.374244595868664e-07, + "loss": 0.9792, + "step": 11350 + }, + { + "epoch": 0.96, + "grad_norm": 0.815220057964325, + "learning_rate": 8.203754156513865e-07, + "loss": 0.9256, + "step": 11355 + }, + { + "epoch": 0.96, + "grad_norm": 0.7551809549331665, + "learning_rate": 8.03500996337958e-07, + "loss": 0.8234, + "step": 11360 + }, + { + "epoch": 0.96, + "grad_norm": 1.0061076879501343, + "learning_rate": 7.868012313571927e-07, + "loss": 0.8945, + "step": 11365 + }, + { + "epoch": 0.96, + "grad_norm": 0.9475626945495605, + "learning_rate": 7.702761501122147e-07, + "loss": 0.8082, + "step": 11370 + }, + { + "epoch": 0.96, + "grad_norm": 0.9202659726142883, + "learning_rate": 7.539257816985835e-07, + "loss": 0.8168, + "step": 11375 + }, + { + "epoch": 0.96, + "grad_norm": 0.7858511805534363, + "learning_rate": 7.377501549042265e-07, + "loss": 0.8055, + "step": 11380 + }, + { + "epoch": 0.96, + "grad_norm": 0.8680155873298645, + "learning_rate": 7.217492982094176e-07, + "loss": 0.8777, + "step": 11385 + }, + { + "epoch": 0.96, + "grad_norm": 0.7543894648551941, + "learning_rate": 7.059232397867099e-07, + "loss": 0.8607, + "step": 11390 + }, + { + "epoch": 0.96, + "grad_norm": 0.7095973491668701, + "learning_rate": 6.902720075009139e-07, + "loss": 0.8256, + "step": 11395 + }, + { + "epoch": 0.96, + "grad_norm": 0.7466055750846863, + "learning_rate": 6.747956289089863e-07, + "loss": 0.9458, + "step": 11400 + }, + { + "epoch": 0.96, + "grad_norm": 0.7379010319709778, + "learning_rate": 6.594941312600411e-07, + "loss": 0.9594, + "step": 11405 + }, + { + "epoch": 0.96, + "grad_norm": 0.7673978805541992, + "learning_rate": 6.443675414952833e-07, + "loss": 0.9929, + "step": 11410 + }, + { + "epoch": 0.96, + "grad_norm": 1.0383026599884033, + "learning_rate": 6.294158862479527e-07, + "loss": 0.8045, + "step": 11415 + }, + { + "epoch": 0.96, + "grad_norm": 0.7899762988090515, + "learning_rate": 6.146391918433026e-07, + "loss": 0.8108, + "step": 11420 + }, + { + "epoch": 0.97, + "grad_norm": 0.889194667339325, + "learning_rate": 6.000374842984991e-07, + "loss": 1.0163, + "step": 11425 + }, + { + "epoch": 0.97, + "grad_norm": 0.7975661754608154, + "learning_rate": 5.856107893226325e-07, + "loss": 0.8313, + "step": 11430 + }, + { + "epoch": 0.97, + "grad_norm": 0.87502521276474, + "learning_rate": 5.713591323166622e-07, + "loss": 0.8928, + "step": 11435 + }, + { + "epoch": 0.97, + "grad_norm": 1.1591724157333374, + "learning_rate": 5.57282538373316e-07, + "loss": 0.9258, + "step": 11440 + }, + { + "epoch": 0.97, + "grad_norm": 0.7677488923072815, + "learning_rate": 5.433810322771571e-07, + "loss": 0.8604, + "step": 11445 + }, + { + "epoch": 0.97, + "grad_norm": 0.7682551145553589, + "learning_rate": 5.296546385044065e-07, + "loss": 0.8226, + "step": 11450 + }, + { + "epoch": 0.97, + "grad_norm": 0.8181256651878357, + "learning_rate": 5.161033812229987e-07, + "loss": 1.0031, + "step": 11455 + }, + { + "epoch": 0.97, + "grad_norm": 0.7503376007080078, + "learning_rate": 5.027272842925146e-07, + "loss": 0.8191, + "step": 11460 + }, + { + "epoch": 0.97, + "grad_norm": 0.8176915049552917, + "learning_rate": 4.895263712641151e-07, + "loss": 0.9518, + "step": 11465 + }, + { + "epoch": 0.97, + "grad_norm": 0.8447414636611938, + "learning_rate": 4.7650066538051927e-07, + "loss": 0.8723, + "step": 11470 + }, + { + "epoch": 0.97, + "grad_norm": 0.8194155693054199, + "learning_rate": 4.636501895759704e-07, + "loss": 0.8594, + "step": 11475 + }, + { + "epoch": 0.97, + "grad_norm": 0.8020860552787781, + "learning_rate": 4.5097496647616977e-07, + "loss": 0.8447, + "step": 11480 + }, + { + "epoch": 0.97, + "grad_norm": 1.0051360130310059, + "learning_rate": 4.3847501839827666e-07, + "loss": 0.9012, + "step": 11485 + }, + { + "epoch": 0.97, + "grad_norm": 0.7807974219322205, + "learning_rate": 4.261503673508194e-07, + "loss": 0.8135, + "step": 11490 + }, + { + "epoch": 0.97, + "grad_norm": 0.769763708114624, + "learning_rate": 4.1400103503368425e-07, + "loss": 0.8548, + "step": 11495 + }, + { + "epoch": 0.97, + "grad_norm": 1.161005973815918, + "learning_rate": 4.0202704283810456e-07, + "loss": 0.8857, + "step": 11500 + }, + { + "epoch": 0.97, + "grad_norm": 0.8864768743515015, + "learning_rate": 3.9022841184657155e-07, + "loss": 0.7329, + "step": 11505 + }, + { + "epoch": 0.97, + "grad_norm": 0.6769604086875916, + "learning_rate": 3.7860516283282355e-07, + "loss": 0.859, + "step": 11510 + }, + { + "epoch": 0.97, + "grad_norm": 0.6905530095100403, + "learning_rate": 3.6715731626179027e-07, + "loss": 0.8472, + "step": 11515 + }, + { + "epoch": 0.97, + "grad_norm": 0.7534658312797546, + "learning_rate": 3.55884892289593e-07, + "loss": 0.7692, + "step": 11520 + }, + { + "epoch": 0.97, + "grad_norm": 0.6859395503997803, + "learning_rate": 3.447879107634888e-07, + "loss": 0.7706, + "step": 11525 + }, + { + "epoch": 0.97, + "grad_norm": 0.847043514251709, + "learning_rate": 3.338663912218265e-07, + "loss": 0.7983, + "step": 11530 + }, + { + "epoch": 0.97, + "grad_norm": 0.7656576037406921, + "learning_rate": 3.23120352894013e-07, + "loss": 0.9127, + "step": 11535 + }, + { + "epoch": 0.97, + "grad_norm": 0.7100486159324646, + "learning_rate": 3.1254981470049126e-07, + "loss": 0.8365, + "step": 11540 + }, + { + "epoch": 0.98, + "grad_norm": 0.9396764636039734, + "learning_rate": 3.021547952527293e-07, + "loss": 0.822, + "step": 11545 + }, + { + "epoch": 0.98, + "grad_norm": 0.9337235689163208, + "learning_rate": 2.9193531285311993e-07, + "loss": 0.9381, + "step": 11550 + }, + { + "epoch": 0.98, + "grad_norm": 0.9499861001968384, + "learning_rate": 2.818913854950256e-07, + "loss": 0.891, + "step": 11555 + }, + { + "epoch": 0.98, + "grad_norm": 0.6093025803565979, + "learning_rate": 2.720230308626781e-07, + "loss": 0.7486, + "step": 11560 + }, + { + "epoch": 0.98, + "grad_norm": 0.7417643666267395, + "learning_rate": 2.6233026633118994e-07, + "loss": 0.7791, + "step": 11565 + }, + { + "epoch": 0.98, + "grad_norm": 0.7814109921455383, + "learning_rate": 2.528131089665431e-07, + "loss": 0.7943, + "step": 11570 + }, + { + "epoch": 0.98, + "grad_norm": 0.8193375468254089, + "learning_rate": 2.4347157552548907e-07, + "loss": 0.982, + "step": 11575 + }, + { + "epoch": 0.98, + "grad_norm": 0.8868688344955444, + "learning_rate": 2.3430568245558227e-07, + "loss": 1.0064, + "step": 11580 + }, + { + "epoch": 0.98, + "grad_norm": 1.0020816326141357, + "learning_rate": 2.2531544589512454e-07, + "loss": 0.8038, + "step": 11585 + }, + { + "epoch": 0.98, + "grad_norm": 0.6277792453765869, + "learning_rate": 2.1650088167313177e-07, + "loss": 0.6911, + "step": 11590 + }, + { + "epoch": 0.98, + "grad_norm": 0.8221533894538879, + "learning_rate": 2.0786200530933387e-07, + "loss": 0.9594, + "step": 11595 + }, + { + "epoch": 0.98, + "grad_norm": 0.751361072063446, + "learning_rate": 1.9939883201410826e-07, + "loss": 0.8091, + "step": 11600 + }, + { + "epoch": 0.98, + "grad_norm": 0.835889458656311, + "learning_rate": 1.911113766884909e-07, + "loss": 0.803, + "step": 11605 + }, + { + "epoch": 0.98, + "grad_norm": 0.8068159818649292, + "learning_rate": 1.8299965392413187e-07, + "loss": 0.8209, + "step": 11610 + }, + { + "epoch": 0.98, + "grad_norm": 0.8454260230064392, + "learning_rate": 1.7506367800325108e-07, + "loss": 0.8195, + "step": 11615 + }, + { + "epoch": 0.98, + "grad_norm": 0.6896167993545532, + "learning_rate": 1.6730346289864918e-07, + "loss": 0.9279, + "step": 11620 + }, + { + "epoch": 0.98, + "grad_norm": 0.8151208162307739, + "learning_rate": 1.597190222736633e-07, + "loss": 0.8927, + "step": 11625 + }, + { + "epoch": 0.98, + "grad_norm": 0.7260826230049133, + "learning_rate": 1.5231036948215594e-07, + "loss": 0.8088, + "step": 11630 + }, + { + "epoch": 0.98, + "grad_norm": 1.191455364227295, + "learning_rate": 1.4507751756845934e-07, + "loss": 0.9637, + "step": 11635 + }, + { + "epoch": 0.98, + "grad_norm": 0.6957030892372131, + "learning_rate": 1.380204792673867e-07, + "loss": 0.9675, + "step": 11640 + }, + { + "epoch": 0.98, + "grad_norm": 0.9041639566421509, + "learning_rate": 1.3113926700420998e-07, + "loss": 0.9778, + "step": 11645 + }, + { + "epoch": 0.98, + "grad_norm": 0.7494191527366638, + "learning_rate": 1.2443389289460427e-07, + "loss": 0.7918, + "step": 11650 + }, + { + "epoch": 0.98, + "grad_norm": 0.8155191540718079, + "learning_rate": 1.1790436874465904e-07, + "loss": 0.8427, + "step": 11655 + }, + { + "epoch": 0.98, + "grad_norm": 0.8684887886047363, + "learning_rate": 1.1155070605085583e-07, + "loss": 0.85, + "step": 11660 + }, + { + "epoch": 0.99, + "grad_norm": 0.9394787549972534, + "learning_rate": 1.0537291600000165e-07, + "loss": 0.82, + "step": 11665 + }, + { + "epoch": 0.99, + "grad_norm": 0.829444169998169, + "learning_rate": 9.937100946930677e-08, + "loss": 0.739, + "step": 11670 + }, + { + "epoch": 0.99, + "grad_norm": 0.7598903775215149, + "learning_rate": 9.354499702625141e-08, + "loss": 0.7957, + "step": 11675 + }, + { + "epoch": 0.99, + "grad_norm": 0.6364439725875854, + "learning_rate": 8.789488892864129e-08, + "loss": 0.7983, + "step": 11680 + }, + { + "epoch": 0.99, + "grad_norm": 0.8551909923553467, + "learning_rate": 8.242069512456318e-08, + "loss": 0.8569, + "step": 11685 + }, + { + "epoch": 0.99, + "grad_norm": 0.8468892574310303, + "learning_rate": 7.71224252523961e-08, + "loss": 0.8113, + "step": 11690 + }, + { + "epoch": 0.99, + "grad_norm": 0.7935190200805664, + "learning_rate": 7.200008864073349e-08, + "loss": 0.909, + "step": 11695 + }, + { + "epoch": 0.99, + "grad_norm": 0.8427583575248718, + "learning_rate": 6.705369430843878e-08, + "loss": 0.8116, + "step": 11700 + }, + { + "epoch": 0.99, + "grad_norm": 0.6899325847625732, + "learning_rate": 6.228325096457876e-08, + "loss": 0.6711, + "step": 11705 + }, + { + "epoch": 0.99, + "grad_norm": 1.1237952709197998, + "learning_rate": 5.7688767008423627e-08, + "loss": 0.9356, + "step": 11710 + }, + { + "epoch": 0.99, + "grad_norm": 0.6556943655014038, + "learning_rate": 5.327025052943579e-08, + "loss": 0.7901, + "step": 11715 + }, + { + "epoch": 0.99, + "grad_norm": 0.796808660030365, + "learning_rate": 4.902770930725886e-08, + "loss": 0.8344, + "step": 11720 + }, + { + "epoch": 0.99, + "grad_norm": 0.8386369943618774, + "learning_rate": 4.4961150811695384e-08, + "loss": 0.9573, + "step": 11725 + }, + { + "epoch": 0.99, + "grad_norm": 0.8265208005905151, + "learning_rate": 4.107058220270687e-08, + "loss": 0.9486, + "step": 11730 + }, + { + "epoch": 0.99, + "grad_norm": 0.7974259257316589, + "learning_rate": 3.735601033035829e-08, + "loss": 0.8705, + "step": 11735 + }, + { + "epoch": 0.99, + "grad_norm": 0.97637540102005, + "learning_rate": 3.3817441734862455e-08, + "loss": 0.8278, + "step": 11740 + }, + { + "epoch": 0.99, + "grad_norm": 0.6852146983146667, + "learning_rate": 3.045488264656893e-08, + "loss": 0.7963, + "step": 11745 + }, + { + "epoch": 0.99, + "grad_norm": 0.8473973274230957, + "learning_rate": 2.7268338985875218e-08, + "loss": 0.843, + "step": 11750 + }, + { + "epoch": 0.99, + "grad_norm": 0.7919003367424011, + "learning_rate": 2.4257816363326692e-08, + "loss": 0.8883, + "step": 11755 + }, + { + "epoch": 0.99, + "grad_norm": 0.7065966129302979, + "learning_rate": 2.1423320079494435e-08, + "loss": 0.8417, + "step": 11760 + }, + { + "epoch": 0.99, + "grad_norm": 0.7676900625228882, + "learning_rate": 1.8764855125052993e-08, + "loss": 0.9791, + "step": 11765 + }, + { + "epoch": 0.99, + "grad_norm": 0.8119387626647949, + "learning_rate": 1.6282426180758148e-08, + "loss": 0.9255, + "step": 11770 + }, + { + "epoch": 0.99, + "grad_norm": 1.060429573059082, + "learning_rate": 1.3976037617380311e-08, + "loss": 1.0282, + "step": 11775 + }, + { + "epoch": 1.0, + "grad_norm": 0.7375126481056213, + "learning_rate": 1.1845693495760035e-08, + "loss": 0.7632, + "step": 11780 + }, + { + "epoch": 1.0, + "grad_norm": 1.090340495109558, + "learning_rate": 9.891397566774708e-09, + "loss": 0.7868, + "step": 11785 + }, + { + "epoch": 1.0, + "grad_norm": 0.8043749928474426, + "learning_rate": 8.113153271327446e-09, + "loss": 0.8244, + "step": 11790 + }, + { + "epoch": 1.0, + "grad_norm": 0.781341552734375, + "learning_rate": 6.510963740369303e-09, + "loss": 0.7928, + "step": 11795 + }, + { + "epoch": 1.0, + "grad_norm": 0.6983230710029602, + "learning_rate": 5.08483179485486e-09, + "loss": 0.9473, + "step": 11800 + }, + { + "epoch": 1.0, + "grad_norm": 0.7805584669113159, + "learning_rate": 3.83475994575333e-09, + "loss": 0.825, + "step": 11805 + }, + { + "epoch": 1.0, + "grad_norm": 0.7910088300704956, + "learning_rate": 2.7607503940707546e-09, + "loss": 0.8383, + "step": 11810 + }, + { + "epoch": 1.0, + "grad_norm": 0.7440575361251831, + "learning_rate": 1.862805030783399e-09, + "loss": 0.899, + "step": 11815 + }, + { + "epoch": 1.0, + "grad_norm": 0.762624204158783, + "learning_rate": 1.1409254369154632e-09, + "loss": 0.8825, + "step": 11820 + }, + { + "epoch": 1.0, + "grad_norm": 0.8033168315887451, + "learning_rate": 5.951128834613684e-10, + "loss": 0.8778, + "step": 11825 + }, + { + "epoch": 1.0, + "grad_norm": 0.8624210953712463, + "learning_rate": 2.2536833143016467e-10, + "loss": 0.8045, + "step": 11830 + }, + { + "epoch": 1.0, + "grad_norm": 0.7830139398574829, + "learning_rate": 3.169243183442916e-11, + "loss": 1.0231, + "step": 11835 + }, + { + "epoch": 1.0, + "step": 11838, + "total_flos": 6.314593069983334e+16, + "train_loss": 0.0, + "train_runtime": 0.0111, + "train_samples_per_second": 1063397.814, + "train_steps_per_second": 1063397.814 + } + ], + "logging_steps": 5, + "max_steps": 11838, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 6.314593069983334e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}