diff --git "a/checkpoint-2500/trainer_state.json" "b/checkpoint-2500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2500/trainer_state.json" @@ -0,0 +1,4408 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.5125628140703515, + "eval_steps": 500, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004020100502512563, + "grad_norm": 0.68359375, + "learning_rate": 3.9999999999999996e-05, + "loss": 0.6758, + "step": 4 + }, + { + "epoch": 0.008040201005025126, + "grad_norm": 0.6328125, + "learning_rate": 7.999999999999999e-05, + "loss": 0.6607, + "step": 8 + }, + { + "epoch": 0.012060301507537688, + "grad_norm": 0.466796875, + "learning_rate": 0.00011999999999999999, + "loss": 0.6948, + "step": 12 + }, + { + "epoch": 0.016080402010050253, + "grad_norm": 0.40234375, + "learning_rate": 0.00015999999999999999, + "loss": 0.6473, + "step": 16 + }, + { + "epoch": 0.020100502512562814, + "grad_norm": 0.400390625, + "learning_rate": 0.00019999999999999998, + "loss": 0.6259, + "step": 20 + }, + { + "epoch": 0.024120603015075376, + "grad_norm": 0.453125, + "learning_rate": 0.00023999999999999998, + "loss": 0.677, + "step": 24 + }, + { + "epoch": 0.02814070351758794, + "grad_norm": 0.4375, + "learning_rate": 0.00028, + "loss": 0.6508, + "step": 28 + }, + { + "epoch": 0.032160804020100506, + "grad_norm": 0.36328125, + "learning_rate": 0.00029999966091711776, + "loss": 0.649, + "step": 32 + }, + { + "epoch": 0.036180904522613064, + "grad_norm": 0.376953125, + "learning_rate": 0.000299996948263258, + "loss": 0.645, + "step": 36 + }, + { + "epoch": 0.04020100502512563, + "grad_norm": 0.42578125, + "learning_rate": 0.0002999915230045952, + "loss": 0.6616, + "step": 40 + }, + { + "epoch": 0.044221105527638194, + "grad_norm": 0.388671875, + "learning_rate": 0.00029998338523924196, + "loss": 0.6724, + "step": 44 + }, + { + "epoch": 0.04824120603015075, + "grad_norm": 0.388671875, + "learning_rate": 0.0002999725351143648, + "loss": 0.6506, + "step": 48 + }, + { + "epoch": 0.05226130653266332, + "grad_norm": 0.39453125, + "learning_rate": 0.00029995897282618177, + "loss": 0.6499, + "step": 52 + }, + { + "epoch": 0.05628140703517588, + "grad_norm": 0.40234375, + "learning_rate": 0.0002999426986199587, + "loss": 0.6445, + "step": 56 + }, + { + "epoch": 0.06030150753768844, + "grad_norm": 0.439453125, + "learning_rate": 0.00029992371279000487, + "loss": 0.6445, + "step": 60 + }, + { + "epoch": 0.06432160804020101, + "grad_norm": 0.427734375, + "learning_rate": 0.0002999020156796676, + "loss": 0.6495, + "step": 64 + }, + { + "epoch": 0.06834170854271357, + "grad_norm": 0.404296875, + "learning_rate": 0.0002998776076813261, + "loss": 0.6212, + "step": 68 + }, + { + "epoch": 0.07236180904522613, + "grad_norm": 0.390625, + "learning_rate": 0.0002998504892363843, + "loss": 0.6622, + "step": 72 + }, + { + "epoch": 0.0763819095477387, + "grad_norm": 0.35546875, + "learning_rate": 0.0002998206608352632, + "loss": 0.6706, + "step": 76 + }, + { + "epoch": 0.08040201005025126, + "grad_norm": 0.39453125, + "learning_rate": 0.0002997881230173914, + "loss": 0.6146, + "step": 80 + }, + { + "epoch": 0.08442211055276382, + "grad_norm": 0.384765625, + "learning_rate": 0.00029975287637119585, + "loss": 0.6458, + "step": 84 + }, + { + "epoch": 0.08844221105527639, + "grad_norm": 0.40625, + "learning_rate": 0.0002997149215340909, + "loss": 0.6771, + "step": 88 + }, + { + "epoch": 0.09246231155778895, + "grad_norm": 0.419921875, + "learning_rate": 0.0002996742591924671, + "loss": 0.6559, + "step": 92 + }, + { + "epoch": 0.0964824120603015, + "grad_norm": 0.396484375, + "learning_rate": 0.00029963089008167856, + "loss": 0.6041, + "step": 96 + }, + { + "epoch": 0.10050251256281408, + "grad_norm": 0.388671875, + "learning_rate": 0.0002995848149860295, + "loss": 0.6464, + "step": 100 + }, + { + "epoch": 0.10452261306532663, + "grad_norm": 0.373046875, + "learning_rate": 0.0002995360347387604, + "loss": 0.6513, + "step": 104 + }, + { + "epoch": 0.10854271356783919, + "grad_norm": 0.419921875, + "learning_rate": 0.00029948455022203285, + "loss": 0.6714, + "step": 108 + }, + { + "epoch": 0.11256281407035176, + "grad_norm": 0.392578125, + "learning_rate": 0.00029943036236691333, + "loss": 0.623, + "step": 112 + }, + { + "epoch": 0.11658291457286432, + "grad_norm": 0.42578125, + "learning_rate": 0.00029937347215335674, + "loss": 0.6691, + "step": 116 + }, + { + "epoch": 0.12060301507537688, + "grad_norm": 0.369140625, + "learning_rate": 0.00029931388061018845, + "loss": 0.6512, + "step": 120 + }, + { + "epoch": 0.12462311557788945, + "grad_norm": 0.357421875, + "learning_rate": 0.00029925158881508577, + "loss": 0.679, + "step": 124 + }, + { + "epoch": 0.12864321608040202, + "grad_norm": 0.41015625, + "learning_rate": 0.0002991865978945584, + "loss": 0.6164, + "step": 128 + }, + { + "epoch": 0.13266331658291458, + "grad_norm": 0.380859375, + "learning_rate": 0.0002991189090239282, + "loss": 0.6462, + "step": 132 + }, + { + "epoch": 0.13668341708542714, + "grad_norm": 0.3984375, + "learning_rate": 0.00029904852342730774, + "loss": 0.6567, + "step": 136 + }, + { + "epoch": 0.1407035175879397, + "grad_norm": 0.4375, + "learning_rate": 0.0002989754423775783, + "loss": 0.6519, + "step": 140 + }, + { + "epoch": 0.14472361809045226, + "grad_norm": 0.41796875, + "learning_rate": 0.00029889966719636677, + "loss": 0.6049, + "step": 144 + }, + { + "epoch": 0.1487437185929648, + "grad_norm": 0.38671875, + "learning_rate": 0.0002988211992540219, + "loss": 0.6298, + "step": 148 + }, + { + "epoch": 0.1527638190954774, + "grad_norm": 0.3671875, + "learning_rate": 0.0002987400399695893, + "loss": 0.6358, + "step": 152 + }, + { + "epoch": 0.15678391959798996, + "grad_norm": 0.3828125, + "learning_rate": 0.000298656190810786, + "loss": 0.6588, + "step": 156 + }, + { + "epoch": 0.16080402010050251, + "grad_norm": 0.375, + "learning_rate": 0.00029856965329397365, + "loss": 0.6834, + "step": 160 + }, + { + "epoch": 0.16482412060301507, + "grad_norm": 0.404296875, + "learning_rate": 0.0002984804289841313, + "loss": 0.6619, + "step": 164 + }, + { + "epoch": 0.16884422110552763, + "grad_norm": 0.375, + "learning_rate": 0.0002983885194948271, + "loss": 0.6345, + "step": 168 + }, + { + "epoch": 0.1728643216080402, + "grad_norm": 0.392578125, + "learning_rate": 0.000298293926488189, + "loss": 0.6452, + "step": 172 + }, + { + "epoch": 0.17688442211055277, + "grad_norm": 0.37109375, + "learning_rate": 0.0002981966516748748, + "loss": 0.6378, + "step": 176 + }, + { + "epoch": 0.18090452261306533, + "grad_norm": 0.375, + "learning_rate": 0.00029809669681404107, + "loss": 0.6233, + "step": 180 + }, + { + "epoch": 0.1849246231155779, + "grad_norm": 0.4296875, + "learning_rate": 0.00029799406371331153, + "loss": 0.6583, + "step": 184 + }, + { + "epoch": 0.18894472361809045, + "grad_norm": 0.3671875, + "learning_rate": 0.0002978887542287442, + "loss": 0.6488, + "step": 188 + }, + { + "epoch": 0.192964824120603, + "grad_norm": 0.388671875, + "learning_rate": 0.0002977807702647979, + "loss": 0.6394, + "step": 192 + }, + { + "epoch": 0.19698492462311556, + "grad_norm": 0.365234375, + "learning_rate": 0.00029767011377429786, + "loss": 0.6069, + "step": 196 + }, + { + "epoch": 0.20100502512562815, + "grad_norm": 0.359375, + "learning_rate": 0.00029755678675840027, + "loss": 0.6155, + "step": 200 + }, + { + "epoch": 0.2050251256281407, + "grad_norm": 0.3828125, + "learning_rate": 0.0002974407912665563, + "loss": 0.6305, + "step": 204 + }, + { + "epoch": 0.20904522613065327, + "grad_norm": 0.369140625, + "learning_rate": 0.0002973221293964747, + "loss": 0.6617, + "step": 208 + }, + { + "epoch": 0.21306532663316582, + "grad_norm": 0.38671875, + "learning_rate": 0.00029720080329408426, + "loss": 0.6611, + "step": 212 + }, + { + "epoch": 0.21708542713567838, + "grad_norm": 0.3671875, + "learning_rate": 0.0002970768151534947, + "loss": 0.6054, + "step": 216 + }, + { + "epoch": 0.22110552763819097, + "grad_norm": 0.392578125, + "learning_rate": 0.0002969501672169571, + "loss": 0.6981, + "step": 220 + }, + { + "epoch": 0.22512562814070353, + "grad_norm": 0.35546875, + "learning_rate": 0.00029682086177482353, + "loss": 0.6005, + "step": 224 + }, + { + "epoch": 0.22914572864321608, + "grad_norm": 0.40234375, + "learning_rate": 0.00029668890116550526, + "loss": 0.6184, + "step": 228 + }, + { + "epoch": 0.23316582914572864, + "grad_norm": 0.388671875, + "learning_rate": 0.00029655428777543074, + "loss": 0.5997, + "step": 232 + }, + { + "epoch": 0.2371859296482412, + "grad_norm": 0.357421875, + "learning_rate": 0.0002964170240390023, + "loss": 0.6214, + "step": 236 + }, + { + "epoch": 0.24120603015075376, + "grad_norm": 0.375, + "learning_rate": 0.00029627711243855224, + "loss": 0.6562, + "step": 240 + }, + { + "epoch": 0.24522613065326634, + "grad_norm": 0.384765625, + "learning_rate": 0.000296134555504298, + "loss": 0.6193, + "step": 244 + }, + { + "epoch": 0.2492462311557789, + "grad_norm": 0.37890625, + "learning_rate": 0.000295989355814296, + "loss": 0.6486, + "step": 248 + }, + { + "epoch": 0.25326633165829143, + "grad_norm": 0.373046875, + "learning_rate": 0.0002958415159943958, + "loss": 0.6516, + "step": 252 + }, + { + "epoch": 0.25728643216080405, + "grad_norm": 0.380859375, + "learning_rate": 0.0002956910387181916, + "loss": 0.6304, + "step": 256 + }, + { + "epoch": 0.2613065326633166, + "grad_norm": 0.369140625, + "learning_rate": 0.0002955379267069747, + "loss": 0.6423, + "step": 260 + }, + { + "epoch": 0.26532663316582916, + "grad_norm": 0.369140625, + "learning_rate": 0.00029538218272968394, + "loss": 0.6231, + "step": 264 + }, + { + "epoch": 0.2693467336683417, + "grad_norm": 0.400390625, + "learning_rate": 0.00029522380960285573, + "loss": 0.5963, + "step": 268 + }, + { + "epoch": 0.2733668341708543, + "grad_norm": 0.388671875, + "learning_rate": 0.000295062810190573, + "loss": 0.6504, + "step": 272 + }, + { + "epoch": 0.27738693467336684, + "grad_norm": 0.369140625, + "learning_rate": 0.0002948991874044136, + "loss": 0.6141, + "step": 276 + }, + { + "epoch": 0.2814070351758794, + "grad_norm": 0.37109375, + "learning_rate": 0.0002947329442033973, + "loss": 0.6604, + "step": 280 + }, + { + "epoch": 0.28542713567839195, + "grad_norm": 0.369140625, + "learning_rate": 0.00029456408359393275, + "loss": 0.6366, + "step": 284 + }, + { + "epoch": 0.2894472361809045, + "grad_norm": 0.3828125, + "learning_rate": 0.0002943926086297627, + "loss": 0.6538, + "step": 288 + }, + { + "epoch": 0.29346733668341707, + "grad_norm": 0.37109375, + "learning_rate": 0.000294218522411909, + "loss": 0.6312, + "step": 292 + }, + { + "epoch": 0.2974874371859296, + "grad_norm": 0.361328125, + "learning_rate": 0.0002940418280886163, + "loss": 0.6618, + "step": 296 + }, + { + "epoch": 0.3015075376884422, + "grad_norm": 0.37890625, + "learning_rate": 0.0002938625288552957, + "loss": 0.6176, + "step": 300 + }, + { + "epoch": 0.3055276381909548, + "grad_norm": 0.40625, + "learning_rate": 0.000293680627954466, + "loss": 0.626, + "step": 304 + }, + { + "epoch": 0.30954773869346736, + "grad_norm": 0.373046875, + "learning_rate": 0.0002934961286756959, + "loss": 0.6636, + "step": 308 + }, + { + "epoch": 0.3135678391959799, + "grad_norm": 0.361328125, + "learning_rate": 0.0002933090343555442, + "loss": 0.6565, + "step": 312 + }, + { + "epoch": 0.31758793969849247, + "grad_norm": 0.40625, + "learning_rate": 0.0002931193483774993, + "loss": 0.6049, + "step": 316 + }, + { + "epoch": 0.32160804020100503, + "grad_norm": 0.39453125, + "learning_rate": 0.00029292707417191845, + "loss": 0.6412, + "step": 320 + }, + { + "epoch": 0.3256281407035176, + "grad_norm": 0.37890625, + "learning_rate": 0.0002927322152159652, + "loss": 0.6503, + "step": 324 + }, + { + "epoch": 0.32964824120603015, + "grad_norm": 0.36328125, + "learning_rate": 0.00029253477503354684, + "loss": 0.6133, + "step": 328 + }, + { + "epoch": 0.3336683417085427, + "grad_norm": 0.357421875, + "learning_rate": 0.0002923347571952506, + "loss": 0.6265, + "step": 332 + }, + { + "epoch": 0.33768844221105526, + "grad_norm": 0.365234375, + "learning_rate": 0.00029213216531827905, + "loss": 0.6366, + "step": 336 + }, + { + "epoch": 0.3417085427135678, + "grad_norm": 0.3515625, + "learning_rate": 0.00029192700306638475, + "loss": 0.6065, + "step": 340 + }, + { + "epoch": 0.3457286432160804, + "grad_norm": 0.392578125, + "learning_rate": 0.0002917192741498039, + "loss": 0.6875, + "step": 344 + }, + { + "epoch": 0.349748743718593, + "grad_norm": 0.36328125, + "learning_rate": 0.0002915089823251893, + "loss": 0.6538, + "step": 348 + }, + { + "epoch": 0.35376884422110555, + "grad_norm": 0.3515625, + "learning_rate": 0.00029129613139554237, + "loss": 0.637, + "step": 352 + }, + { + "epoch": 0.3577889447236181, + "grad_norm": 0.390625, + "learning_rate": 0.0002910807252101446, + "loss": 0.6136, + "step": 356 + }, + { + "epoch": 0.36180904522613067, + "grad_norm": 0.404296875, + "learning_rate": 0.0002908627676644874, + "loss": 0.6581, + "step": 360 + }, + { + "epoch": 0.3658291457286432, + "grad_norm": 0.345703125, + "learning_rate": 0.00029064226270020233, + "loss": 0.6421, + "step": 364 + }, + { + "epoch": 0.3698492462311558, + "grad_norm": 0.34765625, + "learning_rate": 0.0002904192143049893, + "loss": 0.5957, + "step": 368 + }, + { + "epoch": 0.37386934673366834, + "grad_norm": 0.37109375, + "learning_rate": 0.0002901936265125448, + "loss": 0.6291, + "step": 372 + }, + { + "epoch": 0.3778894472361809, + "grad_norm": 0.39453125, + "learning_rate": 0.0002899655034024885, + "loss": 0.6493, + "step": 376 + }, + { + "epoch": 0.38190954773869346, + "grad_norm": 0.357421875, + "learning_rate": 0.0002897348491002901, + "loss": 0.597, + "step": 380 + }, + { + "epoch": 0.385929648241206, + "grad_norm": 0.388671875, + "learning_rate": 0.0002895016677771942, + "loss": 0.6323, + "step": 384 + }, + { + "epoch": 0.38994974874371857, + "grad_norm": 0.404296875, + "learning_rate": 0.0002892659636501452, + "loss": 0.6468, + "step": 388 + }, + { + "epoch": 0.39396984924623113, + "grad_norm": 0.35546875, + "learning_rate": 0.0002890277409817107, + "loss": 0.6733, + "step": 392 + }, + { + "epoch": 0.39798994974874374, + "grad_norm": 0.34765625, + "learning_rate": 0.00028878700408000466, + "loss": 0.6245, + "step": 396 + }, + { + "epoch": 0.4020100502512563, + "grad_norm": 0.357421875, + "learning_rate": 0.0002885437572986096, + "loss": 0.6804, + "step": 400 + }, + { + "epoch": 0.40603015075376886, + "grad_norm": 0.412109375, + "learning_rate": 0.0002882980050364976, + "loss": 0.6546, + "step": 404 + }, + { + "epoch": 0.4100502512562814, + "grad_norm": 0.365234375, + "learning_rate": 0.0002880497517379508, + "loss": 0.6428, + "step": 408 + }, + { + "epoch": 0.414070351758794, + "grad_norm": 0.353515625, + "learning_rate": 0.00028779900189248117, + "loss": 0.6349, + "step": 412 + }, + { + "epoch": 0.41809045226130653, + "grad_norm": 0.400390625, + "learning_rate": 0.0002875457600347492, + "loss": 0.6171, + "step": 416 + }, + { + "epoch": 0.4221105527638191, + "grad_norm": 0.380859375, + "learning_rate": 0.00028729003074448193, + "loss": 0.6148, + "step": 420 + }, + { + "epoch": 0.42613065326633165, + "grad_norm": 0.3671875, + "learning_rate": 0.0002870318186463901, + "loss": 0.6469, + "step": 424 + }, + { + "epoch": 0.4301507537688442, + "grad_norm": 0.3515625, + "learning_rate": 0.0002867711284100846, + "loss": 0.5852, + "step": 428 + }, + { + "epoch": 0.43417085427135677, + "grad_norm": 0.357421875, + "learning_rate": 0.0002865079647499919, + "loss": 0.6327, + "step": 432 + }, + { + "epoch": 0.4381909547738693, + "grad_norm": 0.34765625, + "learning_rate": 0.00028624233242526887, + "loss": 0.652, + "step": 436 + }, + { + "epoch": 0.44221105527638194, + "grad_norm": 0.361328125, + "learning_rate": 0.00028597423623971674, + "loss": 0.6432, + "step": 440 + }, + { + "epoch": 0.4462311557788945, + "grad_norm": 0.353515625, + "learning_rate": 0.00028570368104169407, + "loss": 0.6091, + "step": 444 + }, + { + "epoch": 0.45025125628140705, + "grad_norm": 0.359375, + "learning_rate": 0.0002854306717240294, + "loss": 0.6278, + "step": 448 + }, + { + "epoch": 0.4542713567839196, + "grad_norm": 0.34375, + "learning_rate": 0.00028515521322393237, + "loss": 0.5679, + "step": 452 + }, + { + "epoch": 0.45829145728643217, + "grad_norm": 0.380859375, + "learning_rate": 0.0002848773105229046, + "loss": 0.6386, + "step": 456 + }, + { + "epoch": 0.4623115577889447, + "grad_norm": 0.36328125, + "learning_rate": 0.0002845969686466498, + "loss": 0.605, + "step": 460 + }, + { + "epoch": 0.4663316582914573, + "grad_norm": 0.375, + "learning_rate": 0.0002843141926649824, + "loss": 0.5858, + "step": 464 + }, + { + "epoch": 0.47035175879396984, + "grad_norm": 0.3671875, + "learning_rate": 0.00028402898769173653, + "loss": 0.6038, + "step": 468 + }, + { + "epoch": 0.4743718592964824, + "grad_norm": 0.361328125, + "learning_rate": 0.00028374135888467296, + "loss": 0.6276, + "step": 472 + }, + { + "epoch": 0.47839195979899496, + "grad_norm": 0.33984375, + "learning_rate": 0.00028345131144538597, + "loss": 0.6003, + "step": 476 + }, + { + "epoch": 0.4824120603015075, + "grad_norm": 0.390625, + "learning_rate": 0.00028315885061920955, + "loss": 0.6336, + "step": 480 + }, + { + "epoch": 0.4864321608040201, + "grad_norm": 0.361328125, + "learning_rate": 0.0002828639816951222, + "loss": 0.6344, + "step": 484 + }, + { + "epoch": 0.4904522613065327, + "grad_norm": 0.35546875, + "learning_rate": 0.0002825667100056515, + "loss": 0.6498, + "step": 488 + }, + { + "epoch": 0.49447236180904525, + "grad_norm": 0.328125, + "learning_rate": 0.0002822670409267776, + "loss": 0.6236, + "step": 492 + }, + { + "epoch": 0.4984924623115578, + "grad_norm": 0.39453125, + "learning_rate": 0.0002819649798778359, + "loss": 0.5804, + "step": 496 + }, + { + "epoch": 0.5025125628140703, + "grad_norm": 0.35546875, + "learning_rate": 0.0002816605323214193, + "loss": 0.5991, + "step": 500 + }, + { + "epoch": 0.5065326633165829, + "grad_norm": 0.34765625, + "learning_rate": 0.0002813537037632791, + "loss": 0.5647, + "step": 504 + }, + { + "epoch": 0.5105527638190954, + "grad_norm": 0.390625, + "learning_rate": 0.0002810444997522257, + "loss": 0.6434, + "step": 508 + }, + { + "epoch": 0.5145728643216081, + "grad_norm": 0.357421875, + "learning_rate": 0.0002807329258800281, + "loss": 0.6258, + "step": 512 + }, + { + "epoch": 0.5185929648241207, + "grad_norm": 0.349609375, + "learning_rate": 0.0002804189877813128, + "loss": 0.6212, + "step": 516 + }, + { + "epoch": 0.5226130653266332, + "grad_norm": 0.376953125, + "learning_rate": 0.000280102691133462, + "loss": 0.6603, + "step": 520 + }, + { + "epoch": 0.5266331658291458, + "grad_norm": 0.38671875, + "learning_rate": 0.00027978404165651064, + "loss": 0.6936, + "step": 524 + }, + { + "epoch": 0.5306532663316583, + "grad_norm": 0.36328125, + "learning_rate": 0.00027946304511304343, + "loss": 0.6549, + "step": 528 + }, + { + "epoch": 0.5346733668341709, + "grad_norm": 0.375, + "learning_rate": 0.0002791397073080902, + "loss": 0.6177, + "step": 532 + }, + { + "epoch": 0.5386934673366834, + "grad_norm": 0.37109375, + "learning_rate": 0.00027881403408902116, + "loss": 0.6376, + "step": 536 + }, + { + "epoch": 0.542713567839196, + "grad_norm": 0.37109375, + "learning_rate": 0.00027848603134544104, + "loss": 0.6017, + "step": 540 + }, + { + "epoch": 0.5467336683417086, + "grad_norm": 0.369140625, + "learning_rate": 0.00027815570500908256, + "loss": 0.6401, + "step": 544 + }, + { + "epoch": 0.5507537688442211, + "grad_norm": 0.384765625, + "learning_rate": 0.00027782306105369944, + "loss": 0.6471, + "step": 548 + }, + { + "epoch": 0.5547738693467337, + "grad_norm": 0.375, + "learning_rate": 0.0002774881054949579, + "loss": 0.6027, + "step": 552 + }, + { + "epoch": 0.5587939698492462, + "grad_norm": 0.376953125, + "learning_rate": 0.00027715084439032826, + "loss": 0.6497, + "step": 556 + }, + { + "epoch": 0.5628140703517588, + "grad_norm": 0.345703125, + "learning_rate": 0.00027681128383897524, + "loss": 0.6139, + "step": 560 + }, + { + "epoch": 0.5668341708542713, + "grad_norm": 0.365234375, + "learning_rate": 0.0002764694299816477, + "loss": 0.6408, + "step": 564 + }, + { + "epoch": 0.5708542713567839, + "grad_norm": 0.365234375, + "learning_rate": 0.0002761252890005674, + "loss": 0.6232, + "step": 568 + }, + { + "epoch": 0.5748743718592965, + "grad_norm": 0.357421875, + "learning_rate": 0.0002757788671193176, + "loss": 0.6059, + "step": 572 + }, + { + "epoch": 0.578894472361809, + "grad_norm": 0.39453125, + "learning_rate": 0.00027543017060273003, + "loss": 0.5653, + "step": 576 + }, + { + "epoch": 0.5829145728643216, + "grad_norm": 0.365234375, + "learning_rate": 0.0002750792057567721, + "loss": 0.6294, + "step": 580 + }, + { + "epoch": 0.5869346733668341, + "grad_norm": 0.326171875, + "learning_rate": 0.00027472597892843226, + "loss": 0.6528, + "step": 584 + }, + { + "epoch": 0.5909547738693467, + "grad_norm": 0.349609375, + "learning_rate": 0.00027437049650560596, + "loss": 0.6278, + "step": 588 + }, + { + "epoch": 0.5949748743718593, + "grad_norm": 0.365234375, + "learning_rate": 0.00027401276491697933, + "loss": 0.641, + "step": 592 + }, + { + "epoch": 0.5989949748743718, + "grad_norm": 0.36328125, + "learning_rate": 0.0002736527906319136, + "loss": 0.6252, + "step": 596 + }, + { + "epoch": 0.6030150753768844, + "grad_norm": 0.3515625, + "learning_rate": 0.0002732905801603277, + "loss": 0.6359, + "step": 600 + }, + { + "epoch": 0.607035175879397, + "grad_norm": 0.33203125, + "learning_rate": 0.0002729261400525806, + "loss": 0.6323, + "step": 604 + }, + { + "epoch": 0.6110552763819096, + "grad_norm": 0.373046875, + "learning_rate": 0.000272559476899353, + "loss": 0.6002, + "step": 608 + }, + { + "epoch": 0.6150753768844222, + "grad_norm": 0.365234375, + "learning_rate": 0.00027219059733152805, + "loss": 0.6447, + "step": 612 + }, + { + "epoch": 0.6190954773869347, + "grad_norm": 0.392578125, + "learning_rate": 0.00027181950802007134, + "loss": 0.6009, + "step": 616 + }, + { + "epoch": 0.6231155778894473, + "grad_norm": 0.359375, + "learning_rate": 0.0002714462156759104, + "loss": 0.6515, + "step": 620 + }, + { + "epoch": 0.6271356783919598, + "grad_norm": 0.375, + "learning_rate": 0.00027107072704981325, + "loss": 0.5963, + "step": 624 + }, + { + "epoch": 0.6311557788944724, + "grad_norm": 0.333984375, + "learning_rate": 0.00027069304893226646, + "loss": 0.6148, + "step": 628 + }, + { + "epoch": 0.6351758793969849, + "grad_norm": 0.36328125, + "learning_rate": 0.0002703131881533521, + "loss": 0.6235, + "step": 632 + }, + { + "epoch": 0.6391959798994975, + "grad_norm": 0.38671875, + "learning_rate": 0.00026993115158262444, + "loss": 0.6813, + "step": 636 + }, + { + "epoch": 0.6432160804020101, + "grad_norm": 0.34765625, + "learning_rate": 0.0002695469461289856, + "loss": 0.6219, + "step": 640 + }, + { + "epoch": 0.6472361809045226, + "grad_norm": 0.341796875, + "learning_rate": 0.00026916057874056063, + "loss": 0.6044, + "step": 644 + }, + { + "epoch": 0.6512562814070352, + "grad_norm": 0.384765625, + "learning_rate": 0.00026877205640457195, + "loss": 0.5938, + "step": 648 + }, + { + "epoch": 0.6552763819095477, + "grad_norm": 0.33984375, + "learning_rate": 0.00026838138614721294, + "loss": 0.6097, + "step": 652 + }, + { + "epoch": 0.6592964824120603, + "grad_norm": 0.34765625, + "learning_rate": 0.0002679885750335207, + "loss": 0.6163, + "step": 656 + }, + { + "epoch": 0.6633165829145728, + "grad_norm": 0.3359375, + "learning_rate": 0.0002675936301672485, + "loss": 0.6465, + "step": 660 + }, + { + "epoch": 0.6673366834170854, + "grad_norm": 0.349609375, + "learning_rate": 0.0002671965586907373, + "loss": 0.6238, + "step": 664 + }, + { + "epoch": 0.671356783919598, + "grad_norm": 0.376953125, + "learning_rate": 0.0002667973677847865, + "loss": 0.6, + "step": 668 + }, + { + "epoch": 0.6753768844221105, + "grad_norm": 0.37109375, + "learning_rate": 0.000266396064668524, + "loss": 0.6435, + "step": 672 + }, + { + "epoch": 0.6793969849246231, + "grad_norm": 0.3828125, + "learning_rate": 0.00026599265659927603, + "loss": 0.6182, + "step": 676 + }, + { + "epoch": 0.6834170854271356, + "grad_norm": 0.37109375, + "learning_rate": 0.0002655871508724353, + "loss": 0.6139, + "step": 680 + }, + { + "epoch": 0.6874371859296482, + "grad_norm": 0.33984375, + "learning_rate": 0.00026517955482132955, + "loss": 0.6341, + "step": 684 + }, + { + "epoch": 0.6914572864321608, + "grad_norm": 0.37109375, + "learning_rate": 0.0002647698758170889, + "loss": 0.6273, + "step": 688 + }, + { + "epoch": 0.6954773869346733, + "grad_norm": 0.3828125, + "learning_rate": 0.00026435812126851223, + "loss": 0.6282, + "step": 692 + }, + { + "epoch": 0.699497487437186, + "grad_norm": 0.349609375, + "learning_rate": 0.0002639442986219335, + "loss": 0.6421, + "step": 696 + }, + { + "epoch": 0.7035175879396985, + "grad_norm": 0.353515625, + "learning_rate": 0.000263528415361087, + "loss": 0.6504, + "step": 700 + }, + { + "epoch": 0.7075376884422111, + "grad_norm": 0.34765625, + "learning_rate": 0.0002631104790069719, + "loss": 0.615, + "step": 704 + }, + { + "epoch": 0.7115577889447237, + "grad_norm": 0.345703125, + "learning_rate": 0.00026269049711771634, + "loss": 0.6939, + "step": 708 + }, + { + "epoch": 0.7155778894472362, + "grad_norm": 0.326171875, + "learning_rate": 0.00026226847728844083, + "loss": 0.5338, + "step": 712 + }, + { + "epoch": 0.7195979899497488, + "grad_norm": 0.380859375, + "learning_rate": 0.00026184442715112074, + "loss": 0.6489, + "step": 716 + }, + { + "epoch": 0.7236180904522613, + "grad_norm": 0.3515625, + "learning_rate": 0.0002614183543744484, + "loss": 0.6212, + "step": 720 + }, + { + "epoch": 0.7276381909547739, + "grad_norm": 0.337890625, + "learning_rate": 0.0002609902666636942, + "loss": 0.607, + "step": 724 + }, + { + "epoch": 0.7316582914572864, + "grad_norm": 0.3359375, + "learning_rate": 0.0002605601717605676, + "loss": 0.5979, + "step": 728 + }, + { + "epoch": 0.735678391959799, + "grad_norm": 0.353515625, + "learning_rate": 0.0002601280774430768, + "loss": 0.6397, + "step": 732 + }, + { + "epoch": 0.7396984924623116, + "grad_norm": 0.369140625, + "learning_rate": 0.00025969399152538824, + "loss": 0.6519, + "step": 736 + }, + { + "epoch": 0.7437185929648241, + "grad_norm": 0.353515625, + "learning_rate": 0.0002592579218576853, + "loss": 0.6131, + "step": 740 + }, + { + "epoch": 0.7477386934673367, + "grad_norm": 0.357421875, + "learning_rate": 0.00025881987632602626, + "loss": 0.6793, + "step": 744 + }, + { + "epoch": 0.7517587939698492, + "grad_norm": 0.3984375, + "learning_rate": 0.00025837986285220173, + "loss": 0.6477, + "step": 748 + }, + { + "epoch": 0.7557788944723618, + "grad_norm": 0.326171875, + "learning_rate": 0.0002579378893935913, + "loss": 0.6347, + "step": 752 + }, + { + "epoch": 0.7597989949748744, + "grad_norm": 0.328125, + "learning_rate": 0.0002574939639430198, + "loss": 0.6188, + "step": 756 + }, + { + "epoch": 0.7638190954773869, + "grad_norm": 0.353515625, + "learning_rate": 0.00025704809452861254, + "loss": 0.6099, + "step": 760 + }, + { + "epoch": 0.7678391959798995, + "grad_norm": 0.357421875, + "learning_rate": 0.0002566002892136505, + "loss": 0.6358, + "step": 764 + }, + { + "epoch": 0.771859296482412, + "grad_norm": 0.3515625, + "learning_rate": 0.00025615055609642387, + "loss": 0.6048, + "step": 768 + }, + { + "epoch": 0.7758793969849246, + "grad_norm": 0.349609375, + "learning_rate": 0.0002556989033100864, + "loss": 0.653, + "step": 772 + }, + { + "epoch": 0.7798994974874371, + "grad_norm": 0.375, + "learning_rate": 0.0002552453390225076, + "loss": 0.6311, + "step": 776 + }, + { + "epoch": 0.7839195979899497, + "grad_norm": 0.359375, + "learning_rate": 0.0002547898714361255, + "loss": 0.6403, + "step": 780 + }, + { + "epoch": 0.7879396984924623, + "grad_norm": 0.359375, + "learning_rate": 0.0002543325087877981, + "loss": 0.6325, + "step": 784 + }, + { + "epoch": 0.7919597989949749, + "grad_norm": 0.38671875, + "learning_rate": 0.0002538732593486545, + "loss": 0.6364, + "step": 788 + }, + { + "epoch": 0.7959798994974875, + "grad_norm": 0.353515625, + "learning_rate": 0.00025341213142394514, + "loss": 0.643, + "step": 792 + }, + { + "epoch": 0.8, + "grad_norm": 0.359375, + "learning_rate": 0.0002529491333528918, + "loss": 0.6523, + "step": 796 + }, + { + "epoch": 0.8040201005025126, + "grad_norm": 0.337890625, + "learning_rate": 0.00025248427350853687, + "loss": 0.6685, + "step": 800 + }, + { + "epoch": 0.8080402010050252, + "grad_norm": 0.37109375, + "learning_rate": 0.0002520175602975917, + "loss": 0.6533, + "step": 804 + }, + { + "epoch": 0.8120603015075377, + "grad_norm": 0.373046875, + "learning_rate": 0.00025154900216028465, + "loss": 0.5855, + "step": 808 + }, + { + "epoch": 0.8160804020100503, + "grad_norm": 0.3359375, + "learning_rate": 0.00025107860757020835, + "loss": 0.617, + "step": 812 + }, + { + "epoch": 0.8201005025125628, + "grad_norm": 0.388671875, + "learning_rate": 0.0002506063850341669, + "loss": 0.6098, + "step": 816 + }, + { + "epoch": 0.8241206030150754, + "grad_norm": 0.369140625, + "learning_rate": 0.00025013234309202134, + "loss": 0.631, + "step": 820 + }, + { + "epoch": 0.828140703517588, + "grad_norm": 0.337890625, + "learning_rate": 0.0002496564903165358, + "loss": 0.5969, + "step": 824 + }, + { + "epoch": 0.8321608040201005, + "grad_norm": 0.353515625, + "learning_rate": 0.0002491788353132222, + "loss": 0.6147, + "step": 828 + }, + { + "epoch": 0.8361809045226131, + "grad_norm": 0.36328125, + "learning_rate": 0.00024869938672018464, + "loss": 0.6408, + "step": 832 + }, + { + "epoch": 0.8402010050251256, + "grad_norm": 0.35546875, + "learning_rate": 0.00024821815320796327, + "loss": 0.6386, + "step": 836 + }, + { + "epoch": 0.8442211055276382, + "grad_norm": 0.33984375, + "learning_rate": 0.00024773514347937726, + "loss": 0.6396, + "step": 840 + }, + { + "epoch": 0.8482412060301507, + "grad_norm": 0.375, + "learning_rate": 0.0002472503662693679, + "loss": 0.6181, + "step": 844 + }, + { + "epoch": 0.8522613065326633, + "grad_norm": 0.3828125, + "learning_rate": 0.00024676383034484003, + "loss": 0.6569, + "step": 848 + }, + { + "epoch": 0.8562814070351759, + "grad_norm": 0.375, + "learning_rate": 0.00024627554450450394, + "loss": 0.6382, + "step": 852 + }, + { + "epoch": 0.8603015075376884, + "grad_norm": 0.37109375, + "learning_rate": 0.0002457855175787161, + "loss": 0.6108, + "step": 856 + }, + { + "epoch": 0.864321608040201, + "grad_norm": 0.37109375, + "learning_rate": 0.00024529375842931924, + "loss": 0.6555, + "step": 860 + }, + { + "epoch": 0.8683417085427135, + "grad_norm": 0.345703125, + "learning_rate": 0.00024480027594948265, + "loss": 0.5887, + "step": 864 + }, + { + "epoch": 0.8723618090452261, + "grad_norm": 0.349609375, + "learning_rate": 0.0002443050790635408, + "loss": 0.6581, + "step": 868 + }, + { + "epoch": 0.8763819095477386, + "grad_norm": 0.36328125, + "learning_rate": 0.00024380817672683234, + "loss": 0.6448, + "step": 872 + }, + { + "epoch": 0.8804020100502512, + "grad_norm": 0.353515625, + "learning_rate": 0.0002433095779255377, + "loss": 0.6277, + "step": 876 + }, + { + "epoch": 0.8844221105527639, + "grad_norm": 0.337890625, + "learning_rate": 0.0002428092916765171, + "loss": 0.5841, + "step": 880 + }, + { + "epoch": 0.8884422110552764, + "grad_norm": 0.375, + "learning_rate": 0.00024230732702714718, + "loss": 0.619, + "step": 884 + }, + { + "epoch": 0.892462311557789, + "grad_norm": 0.353515625, + "learning_rate": 0.00024180369305515733, + "loss": 0.5848, + "step": 888 + }, + { + "epoch": 0.8964824120603015, + "grad_norm": 0.380859375, + "learning_rate": 0.00024129839886846582, + "loss": 0.625, + "step": 892 + }, + { + "epoch": 0.9005025125628141, + "grad_norm": 0.341796875, + "learning_rate": 0.00024079145360501473, + "loss": 0.5922, + "step": 896 + }, + { + "epoch": 0.9045226130653267, + "grad_norm": 0.36328125, + "learning_rate": 0.00024028286643260503, + "loss": 0.6346, + "step": 900 + }, + { + "epoch": 0.9085427135678392, + "grad_norm": 0.365234375, + "learning_rate": 0.00023977264654873048, + "loss": 0.6594, + "step": 904 + }, + { + "epoch": 0.9125628140703518, + "grad_norm": 0.357421875, + "learning_rate": 0.0002392608031804116, + "loss": 0.6271, + "step": 908 + }, + { + "epoch": 0.9165829145728643, + "grad_norm": 0.3671875, + "learning_rate": 0.0002387473455840285, + "loss": 0.6102, + "step": 912 + }, + { + "epoch": 0.9206030150753769, + "grad_norm": 0.361328125, + "learning_rate": 0.00023823228304515373, + "loss": 0.6448, + "step": 916 + }, + { + "epoch": 0.9246231155778895, + "grad_norm": 0.345703125, + "learning_rate": 0.00023771562487838425, + "loss": 0.5876, + "step": 920 + }, + { + "epoch": 0.928643216080402, + "grad_norm": 0.345703125, + "learning_rate": 0.00023719738042717297, + "loss": 0.6417, + "step": 924 + }, + { + "epoch": 0.9326633165829146, + "grad_norm": 0.349609375, + "learning_rate": 0.00023667755906365984, + "loss": 0.6126, + "step": 928 + }, + { + "epoch": 0.9366834170854271, + "grad_norm": 0.333984375, + "learning_rate": 0.00023615617018850232, + "loss": 0.5917, + "step": 932 + }, + { + "epoch": 0.9407035175879397, + "grad_norm": 0.40234375, + "learning_rate": 0.00023563322323070528, + "loss": 0.627, + "step": 936 + }, + { + "epoch": 0.9447236180904522, + "grad_norm": 0.390625, + "learning_rate": 0.0002351087276474507, + "loss": 0.6283, + "step": 940 + }, + { + "epoch": 0.9487437185929648, + "grad_norm": 0.36328125, + "learning_rate": 0.0002345826929239265, + "loss": 0.6519, + "step": 944 + }, + { + "epoch": 0.9527638190954774, + "grad_norm": 0.337890625, + "learning_rate": 0.00023405512857315494, + "loss": 0.5933, + "step": 948 + }, + { + "epoch": 0.9567839195979899, + "grad_norm": 0.35546875, + "learning_rate": 0.00023352604413582074, + "loss": 0.604, + "step": 952 + }, + { + "epoch": 0.9608040201005025, + "grad_norm": 0.341796875, + "learning_rate": 0.00023299544918009858, + "loss": 0.6275, + "step": 956 + }, + { + "epoch": 0.964824120603015, + "grad_norm": 0.34765625, + "learning_rate": 0.0002324633533014797, + "loss": 0.61, + "step": 960 + }, + { + "epoch": 0.9688442211055276, + "grad_norm": 0.337890625, + "learning_rate": 0.0002319297661225989, + "loss": 0.6548, + "step": 964 + }, + { + "epoch": 0.9728643216080402, + "grad_norm": 0.3828125, + "learning_rate": 0.00023139469729306007, + "loss": 0.6309, + "step": 968 + }, + { + "epoch": 0.9768844221105528, + "grad_norm": 0.33203125, + "learning_rate": 0.00023085815648926194, + "loss": 0.5972, + "step": 972 + }, + { + "epoch": 0.9809045226130654, + "grad_norm": 0.341796875, + "learning_rate": 0.00023032015341422295, + "loss": 0.6526, + "step": 976 + }, + { + "epoch": 0.9849246231155779, + "grad_norm": 0.3515625, + "learning_rate": 0.00022978069779740597, + "loss": 0.6492, + "step": 980 + }, + { + "epoch": 0.9889447236180905, + "grad_norm": 0.353515625, + "learning_rate": 0.00022923979939454202, + "loss": 0.6393, + "step": 984 + }, + { + "epoch": 0.992964824120603, + "grad_norm": 0.361328125, + "learning_rate": 0.00022869746798745425, + "loss": 0.5946, + "step": 988 + }, + { + "epoch": 0.9969849246231156, + "grad_norm": 0.357421875, + "learning_rate": 0.00022815371338388062, + "loss": 0.6341, + "step": 992 + }, + { + "epoch": 1.0010050251256282, + "grad_norm": 0.333984375, + "learning_rate": 0.00022760854541729693, + "loss": 0.5532, + "step": 996 + }, + { + "epoch": 1.0050251256281406, + "grad_norm": 0.341796875, + "learning_rate": 0.00022706197394673874, + "loss": 0.5303, + "step": 1000 + }, + { + "epoch": 1.0090452261306533, + "grad_norm": 0.345703125, + "learning_rate": 0.0002265140088566231, + "loss": 0.5229, + "step": 1004 + }, + { + "epoch": 1.0130653266331657, + "grad_norm": 0.36328125, + "learning_rate": 0.00022596466005656983, + "loss": 0.528, + "step": 1008 + }, + { + "epoch": 1.0170854271356784, + "grad_norm": 0.33984375, + "learning_rate": 0.00022541393748122234, + "loss": 0.4928, + "step": 1012 + }, + { + "epoch": 1.0211055276381908, + "grad_norm": 0.345703125, + "learning_rate": 0.00022486185109006797, + "loss": 0.5461, + "step": 1016 + }, + { + "epoch": 1.0251256281407035, + "grad_norm": 0.341796875, + "learning_rate": 0.0002243084108672578, + "loss": 0.5323, + "step": 1020 + }, + { + "epoch": 1.0291457286432162, + "grad_norm": 0.37109375, + "learning_rate": 0.00022375362682142618, + "loss": 0.5196, + "step": 1024 + }, + { + "epoch": 1.0331658291457286, + "grad_norm": 0.33203125, + "learning_rate": 0.00022319750898550962, + "loss": 0.5131, + "step": 1028 + }, + { + "epoch": 1.0371859296482413, + "grad_norm": 0.326171875, + "learning_rate": 0.0002226400674165656, + "loss": 0.5072, + "step": 1032 + }, + { + "epoch": 1.0412060301507537, + "grad_norm": 0.34765625, + "learning_rate": 0.00022208131219559032, + "loss": 0.4853, + "step": 1036 + }, + { + "epoch": 1.0452261306532664, + "grad_norm": 0.34375, + "learning_rate": 0.00022152125342733673, + "loss": 0.505, + "step": 1040 + }, + { + "epoch": 1.0492462311557789, + "grad_norm": 0.3515625, + "learning_rate": 0.00022095990124013147, + "loss": 0.5292, + "step": 1044 + }, + { + "epoch": 1.0532663316582915, + "grad_norm": 0.369140625, + "learning_rate": 0.00022039726578569212, + "loss": 0.5244, + "step": 1048 + }, + { + "epoch": 1.057286432160804, + "grad_norm": 0.357421875, + "learning_rate": 0.0002198333572389432, + "loss": 0.4877, + "step": 1052 + }, + { + "epoch": 1.0613065326633166, + "grad_norm": 0.36328125, + "learning_rate": 0.0002192681857978324, + "loss": 0.5056, + "step": 1056 + }, + { + "epoch": 1.065326633165829, + "grad_norm": 0.349609375, + "learning_rate": 0.0002187017616831461, + "loss": 0.5349, + "step": 1060 + }, + { + "epoch": 1.0693467336683418, + "grad_norm": 0.333984375, + "learning_rate": 0.00021813409513832464, + "loss": 0.4781, + "step": 1064 + }, + { + "epoch": 1.0733668341708542, + "grad_norm": 0.353515625, + "learning_rate": 0.00021756519642927665, + "loss": 0.523, + "step": 1068 + }, + { + "epoch": 1.0773869346733669, + "grad_norm": 0.3671875, + "learning_rate": 0.0002169950758441941, + "loss": 0.5209, + "step": 1072 + }, + { + "epoch": 1.0814070351758793, + "grad_norm": 0.37109375, + "learning_rate": 0.00021642374369336558, + "loss": 0.52, + "step": 1076 + }, + { + "epoch": 1.085427135678392, + "grad_norm": 0.359375, + "learning_rate": 0.00021585121030899014, + "loss": 0.5171, + "step": 1080 + }, + { + "epoch": 1.0894472361809044, + "grad_norm": 0.38671875, + "learning_rate": 0.00021527748604499062, + "loss": 0.525, + "step": 1084 + }, + { + "epoch": 1.0934673366834171, + "grad_norm": 0.357421875, + "learning_rate": 0.000214702581276826, + "loss": 0.5494, + "step": 1088 + }, + { + "epoch": 1.0974874371859296, + "grad_norm": 0.36328125, + "learning_rate": 0.00021412650640130409, + "loss": 0.5679, + "step": 1092 + }, + { + "epoch": 1.1015075376884422, + "grad_norm": 0.359375, + "learning_rate": 0.00021354927183639326, + "loss": 0.5263, + "step": 1096 + }, + { + "epoch": 1.1055276381909547, + "grad_norm": 0.3671875, + "learning_rate": 0.00021297088802103427, + "loss": 0.5461, + "step": 1100 + }, + { + "epoch": 1.1095477386934673, + "grad_norm": 0.37109375, + "learning_rate": 0.00021239136541495137, + "loss": 0.4837, + "step": 1104 + }, + { + "epoch": 1.11356783919598, + "grad_norm": 0.416015625, + "learning_rate": 0.0002118107144984632, + "loss": 0.5085, + "step": 1108 + }, + { + "epoch": 1.1175879396984925, + "grad_norm": 0.392578125, + "learning_rate": 0.00021122894577229307, + "loss": 0.5327, + "step": 1112 + }, + { + "epoch": 1.121608040201005, + "grad_norm": 0.345703125, + "learning_rate": 0.00021064606975737933, + "loss": 0.5378, + "step": 1116 + }, + { + "epoch": 1.1256281407035176, + "grad_norm": 0.375, + "learning_rate": 0.000210062096994685, + "loss": 0.5782, + "step": 1120 + }, + { + "epoch": 1.1296482412060302, + "grad_norm": 0.361328125, + "learning_rate": 0.000209477038045007, + "loss": 0.5379, + "step": 1124 + }, + { + "epoch": 1.1336683417085427, + "grad_norm": 0.365234375, + "learning_rate": 0.0002088909034887854, + "loss": 0.5295, + "step": 1128 + }, + { + "epoch": 1.1376884422110554, + "grad_norm": 0.3515625, + "learning_rate": 0.00020830370392591201, + "loss": 0.4973, + "step": 1132 + }, + { + "epoch": 1.1417085427135678, + "grad_norm": 0.36328125, + "learning_rate": 0.0002077154499755384, + "loss": 0.5354, + "step": 1136 + }, + { + "epoch": 1.1457286432160805, + "grad_norm": 0.361328125, + "learning_rate": 0.00020712615227588447, + "loss": 0.553, + "step": 1140 + }, + { + "epoch": 1.149748743718593, + "grad_norm": 0.36328125, + "learning_rate": 0.00020653582148404538, + "loss": 0.4842, + "step": 1144 + }, + { + "epoch": 1.1537688442211056, + "grad_norm": 0.369140625, + "learning_rate": 0.00020594446827579935, + "loss": 0.5194, + "step": 1148 + }, + { + "epoch": 1.157788944723618, + "grad_norm": 0.333984375, + "learning_rate": 0.0002053521033454142, + "loss": 0.5314, + "step": 1152 + }, + { + "epoch": 1.1618090452261307, + "grad_norm": 0.359375, + "learning_rate": 0.00020475873740545444, + "loss": 0.4998, + "step": 1156 + }, + { + "epoch": 1.1658291457286432, + "grad_norm": 0.375, + "learning_rate": 0.0002041643811865868, + "loss": 0.5061, + "step": 1160 + }, + { + "epoch": 1.1698492462311558, + "grad_norm": 0.35546875, + "learning_rate": 0.000203569045437387, + "loss": 0.5047, + "step": 1164 + }, + { + "epoch": 1.1738693467336683, + "grad_norm": 0.375, + "learning_rate": 0.00020297274092414484, + "loss": 0.5115, + "step": 1168 + }, + { + "epoch": 1.177889447236181, + "grad_norm": 0.34375, + "learning_rate": 0.0002023754784306695, + "loss": 0.5287, + "step": 1172 + }, + { + "epoch": 1.1819095477386934, + "grad_norm": 0.349609375, + "learning_rate": 0.00020177726875809498, + "loss": 0.553, + "step": 1176 + }, + { + "epoch": 1.185929648241206, + "grad_norm": 0.353515625, + "learning_rate": 0.00020117812272468408, + "loss": 0.5212, + "step": 1180 + }, + { + "epoch": 1.1899497487437185, + "grad_norm": 0.37109375, + "learning_rate": 0.0002005780511656333, + "loss": 0.5433, + "step": 1184 + }, + { + "epoch": 1.1939698492462312, + "grad_norm": 0.373046875, + "learning_rate": 0.00019997706493287686, + "loss": 0.5462, + "step": 1188 + }, + { + "epoch": 1.1979899497487438, + "grad_norm": 0.357421875, + "learning_rate": 0.00019937517489489008, + "loss": 0.5251, + "step": 1192 + }, + { + "epoch": 1.2020100502512563, + "grad_norm": 0.384765625, + "learning_rate": 0.00019877239193649303, + "loss": 0.522, + "step": 1196 + }, + { + "epoch": 1.2060301507537687, + "grad_norm": 0.37890625, + "learning_rate": 0.0001981687269586539, + "loss": 0.5207, + "step": 1200 + }, + { + "epoch": 1.2100502512562814, + "grad_norm": 0.34375, + "learning_rate": 0.00019756419087829161, + "loss": 0.5119, + "step": 1204 + }, + { + "epoch": 1.214070351758794, + "grad_norm": 0.345703125, + "learning_rate": 0.00019695879462807835, + "loss": 0.5216, + "step": 1208 + }, + { + "epoch": 1.2180904522613065, + "grad_norm": 0.365234375, + "learning_rate": 0.0001963525491562421, + "loss": 0.5006, + "step": 1212 + }, + { + "epoch": 1.2221105527638192, + "grad_norm": 0.380859375, + "learning_rate": 0.0001957454654263684, + "loss": 0.5248, + "step": 1216 + }, + { + "epoch": 1.2261306532663316, + "grad_norm": 0.3671875, + "learning_rate": 0.0001951375544172022, + "loss": 0.5105, + "step": 1220 + }, + { + "epoch": 1.2301507537688443, + "grad_norm": 0.3828125, + "learning_rate": 0.00019452882712244935, + "loss": 0.5096, + "step": 1224 + }, + { + "epoch": 1.2341708542713568, + "grad_norm": 0.380859375, + "learning_rate": 0.00019391929455057772, + "loss": 0.5608, + "step": 1228 + }, + { + "epoch": 1.2381909547738694, + "grad_norm": 0.37109375, + "learning_rate": 0.00019330896772461813, + "loss": 0.5667, + "step": 1232 + }, + { + "epoch": 1.2422110552763819, + "grad_norm": 0.38671875, + "learning_rate": 0.0001926978576819649, + "loss": 0.5181, + "step": 1236 + }, + { + "epoch": 1.2462311557788945, + "grad_norm": 0.392578125, + "learning_rate": 0.0001920859754741766, + "loss": 0.5102, + "step": 1240 + }, + { + "epoch": 1.250251256281407, + "grad_norm": 0.37890625, + "learning_rate": 0.0001914733321667757, + "loss": 0.5188, + "step": 1244 + }, + { + "epoch": 1.2542713567839197, + "grad_norm": 0.359375, + "learning_rate": 0.00019085993883904878, + "loss": 0.539, + "step": 1248 + }, + { + "epoch": 1.258291457286432, + "grad_norm": 0.375, + "learning_rate": 0.00019024580658384612, + "loss": 0.5216, + "step": 1252 + }, + { + "epoch": 1.2623115577889448, + "grad_norm": 0.396484375, + "learning_rate": 0.0001896309465073811, + "loss": 0.5534, + "step": 1256 + }, + { + "epoch": 1.2663316582914572, + "grad_norm": 0.3828125, + "learning_rate": 0.00018901536972902922, + "loss": 0.5503, + "step": 1260 + }, + { + "epoch": 1.2703517587939699, + "grad_norm": 0.369140625, + "learning_rate": 0.00018839908738112714, + "loss": 0.4965, + "step": 1264 + }, + { + "epoch": 1.2743718592964823, + "grad_norm": 0.349609375, + "learning_rate": 0.00018778211060877127, + "loss": 0.519, + "step": 1268 + }, + { + "epoch": 1.278391959798995, + "grad_norm": 0.3828125, + "learning_rate": 0.00018716445056961634, + "loss": 0.5164, + "step": 1272 + }, + { + "epoch": 1.2824120603015077, + "grad_norm": 0.384765625, + "learning_rate": 0.0001865461184336736, + "loss": 0.5148, + "step": 1276 + }, + { + "epoch": 1.2864321608040201, + "grad_norm": 0.341796875, + "learning_rate": 0.00018592712538310864, + "loss": 0.508, + "step": 1280 + }, + { + "epoch": 1.2904522613065326, + "grad_norm": 0.37890625, + "learning_rate": 0.00018530748261203934, + "loss": 0.5248, + "step": 1284 + }, + { + "epoch": 1.2944723618090452, + "grad_norm": 0.361328125, + "learning_rate": 0.00018468720132633337, + "loss": 0.5287, + "step": 1288 + }, + { + "epoch": 1.298492462311558, + "grad_norm": 0.39453125, + "learning_rate": 0.00018406629274340564, + "loss": 0.5527, + "step": 1292 + }, + { + "epoch": 1.3025125628140704, + "grad_norm": 0.357421875, + "learning_rate": 0.0001834447680920153, + "loss": 0.4996, + "step": 1296 + }, + { + "epoch": 1.3065326633165828, + "grad_norm": 0.3828125, + "learning_rate": 0.00018282263861206266, + "loss": 0.4831, + "step": 1300 + }, + { + "epoch": 1.3105527638190955, + "grad_norm": 0.35546875, + "learning_rate": 0.0001821999155543861, + "loss": 0.5409, + "step": 1304 + }, + { + "epoch": 1.3145728643216081, + "grad_norm": 0.33984375, + "learning_rate": 0.00018157661018055842, + "loss": 0.5095, + "step": 1308 + }, + { + "epoch": 1.3185929648241206, + "grad_norm": 0.353515625, + "learning_rate": 0.00018095273376268333, + "loss": 0.5683, + "step": 1312 + }, + { + "epoch": 1.322613065326633, + "grad_norm": 0.369140625, + "learning_rate": 0.00018032829758319146, + "loss": 0.4956, + "step": 1316 + }, + { + "epoch": 1.3266331658291457, + "grad_norm": 0.36328125, + "learning_rate": 0.00017970331293463643, + "loss": 0.5346, + "step": 1320 + }, + { + "epoch": 1.3306532663316584, + "grad_norm": 0.375, + "learning_rate": 0.00017907779111949054, + "loss": 0.5211, + "step": 1324 + }, + { + "epoch": 1.3346733668341708, + "grad_norm": 0.365234375, + "learning_rate": 0.0001784517434499405, + "loss": 0.5293, + "step": 1328 + }, + { + "epoch": 1.3386934673366835, + "grad_norm": 0.36328125, + "learning_rate": 0.00017782518124768282, + "loss": 0.5131, + "step": 1332 + }, + { + "epoch": 1.342713567839196, + "grad_norm": 0.3671875, + "learning_rate": 0.00017719811584371886, + "loss": 0.5012, + "step": 1336 + }, + { + "epoch": 1.3467336683417086, + "grad_norm": 0.419921875, + "learning_rate": 0.00017657055857815018, + "loss": 0.513, + "step": 1340 + }, + { + "epoch": 1.350753768844221, + "grad_norm": 0.3984375, + "learning_rate": 0.0001759425207999734, + "loss": 0.5691, + "step": 1344 + }, + { + "epoch": 1.3547738693467337, + "grad_norm": 0.376953125, + "learning_rate": 0.00017531401386687492, + "loss": 0.5173, + "step": 1348 + }, + { + "epoch": 1.3587939698492462, + "grad_norm": 0.390625, + "learning_rate": 0.00017468504914502542, + "loss": 0.4847, + "step": 1352 + }, + { + "epoch": 1.3628140703517588, + "grad_norm": 0.392578125, + "learning_rate": 0.0001740556380088745, + "loss": 0.5451, + "step": 1356 + }, + { + "epoch": 1.3668341708542713, + "grad_norm": 0.369140625, + "learning_rate": 0.0001734257918409449, + "loss": 0.5408, + "step": 1360 + }, + { + "epoch": 1.370854271356784, + "grad_norm": 0.33984375, + "learning_rate": 0.0001727955220316265, + "loss": 0.4905, + "step": 1364 + }, + { + "epoch": 1.3748743718592964, + "grad_norm": 0.36328125, + "learning_rate": 0.0001721648399789708, + "loss": 0.4882, + "step": 1368 + }, + { + "epoch": 1.378894472361809, + "grad_norm": 0.375, + "learning_rate": 0.00017153375708848422, + "loss": 0.5087, + "step": 1372 + }, + { + "epoch": 1.3829145728643217, + "grad_norm": 0.3828125, + "learning_rate": 0.00017090228477292202, + "loss": 0.5281, + "step": 1376 + }, + { + "epoch": 1.3869346733668342, + "grad_norm": 0.353515625, + "learning_rate": 0.00017027043445208225, + "loss": 0.5353, + "step": 1380 + }, + { + "epoch": 1.3909547738693466, + "grad_norm": 0.35546875, + "learning_rate": 0.0001696382175525988, + "loss": 0.5069, + "step": 1384 + }, + { + "epoch": 1.3949748743718593, + "grad_norm": 0.359375, + "learning_rate": 0.0001690056455077349, + "loss": 0.496, + "step": 1388 + }, + { + "epoch": 1.398994974874372, + "grad_norm": 0.359375, + "learning_rate": 0.00016837272975717642, + "loss": 0.5481, + "step": 1392 + }, + { + "epoch": 1.4030150753768844, + "grad_norm": 0.388671875, + "learning_rate": 0.0001677394817468249, + "loss": 0.5071, + "step": 1396 + }, + { + "epoch": 1.4070351758793969, + "grad_norm": 0.3515625, + "learning_rate": 0.00016710591292859063, + "loss": 0.51, + "step": 1400 + }, + { + "epoch": 1.4110552763819095, + "grad_norm": 0.37109375, + "learning_rate": 0.0001664720347601855, + "loss": 0.5155, + "step": 1404 + }, + { + "epoch": 1.4150753768844222, + "grad_norm": 0.373046875, + "learning_rate": 0.00016583785870491588, + "loss": 0.5221, + "step": 1408 + }, + { + "epoch": 1.4190954773869346, + "grad_norm": 0.3671875, + "learning_rate": 0.00016520339623147517, + "loss": 0.542, + "step": 1412 + }, + { + "epoch": 1.4231155778894473, + "grad_norm": 0.34375, + "learning_rate": 0.0001645686588137365, + "loss": 0.4669, + "step": 1416 + }, + { + "epoch": 1.4271356783919598, + "grad_norm": 0.3671875, + "learning_rate": 0.0001639336579305451, + "loss": 0.5211, + "step": 1420 + }, + { + "epoch": 1.4311557788944724, + "grad_norm": 0.38671875, + "learning_rate": 0.00016329840506551098, + "loss": 0.4785, + "step": 1424 + }, + { + "epoch": 1.4351758793969849, + "grad_norm": 0.3671875, + "learning_rate": 0.0001626629117068011, + "loss": 0.5316, + "step": 1428 + }, + { + "epoch": 1.4391959798994975, + "grad_norm": 0.404296875, + "learning_rate": 0.00016202718934693134, + "loss": 0.523, + "step": 1432 + }, + { + "epoch": 1.44321608040201, + "grad_norm": 0.44921875, + "learning_rate": 0.00016139124948255925, + "loss": 0.5512, + "step": 1436 + }, + { + "epoch": 1.4472361809045227, + "grad_norm": 0.373046875, + "learning_rate": 0.00016075510361427564, + "loss": 0.5319, + "step": 1440 + }, + { + "epoch": 1.451256281407035, + "grad_norm": 0.388671875, + "learning_rate": 0.0001601187632463968, + "loss": 0.4716, + "step": 1444 + }, + { + "epoch": 1.4552763819095478, + "grad_norm": 0.39453125, + "learning_rate": 0.00015948223988675644, + "loss": 0.5194, + "step": 1448 + }, + { + "epoch": 1.4592964824120602, + "grad_norm": 0.34765625, + "learning_rate": 0.00015884554504649764, + "loss": 0.5121, + "step": 1452 + }, + { + "epoch": 1.463316582914573, + "grad_norm": 0.375, + "learning_rate": 0.00015820869023986444, + "loss": 0.51, + "step": 1456 + }, + { + "epoch": 1.4673366834170856, + "grad_norm": 0.373046875, + "learning_rate": 0.00015757168698399387, + "loss": 0.5532, + "step": 1460 + }, + { + "epoch": 1.471356783919598, + "grad_norm": 0.384765625, + "learning_rate": 0.00015693454679870772, + "loss": 0.4899, + "step": 1464 + }, + { + "epoch": 1.4753768844221105, + "grad_norm": 0.37890625, + "learning_rate": 0.00015629728120630378, + "loss": 0.5007, + "step": 1468 + }, + { + "epoch": 1.4793969849246231, + "grad_norm": 0.33984375, + "learning_rate": 0.00015565990173134792, + "loss": 0.5182, + "step": 1472 + }, + { + "epoch": 1.4834170854271358, + "grad_norm": 0.376953125, + "learning_rate": 0.00015502241990046547, + "loss": 0.5502, + "step": 1476 + }, + { + "epoch": 1.4874371859296482, + "grad_norm": 0.35546875, + "learning_rate": 0.00015438484724213287, + "loss": 0.5105, + "step": 1480 + }, + { + "epoch": 1.4914572864321607, + "grad_norm": 0.400390625, + "learning_rate": 0.00015374719528646907, + "loss": 0.5128, + "step": 1484 + }, + { + "epoch": 1.4954773869346734, + "grad_norm": 0.388671875, + "learning_rate": 0.00015310947556502702, + "loss": 0.5355, + "step": 1488 + }, + { + "epoch": 1.499497487437186, + "grad_norm": 0.37890625, + "learning_rate": 0.00015247169961058523, + "loss": 0.5161, + "step": 1492 + }, + { + "epoch": 1.5035175879396985, + "grad_norm": 0.392578125, + "learning_rate": 0.00015183387895693911, + "loss": 0.5628, + "step": 1496 + }, + { + "epoch": 1.507537688442211, + "grad_norm": 0.369140625, + "learning_rate": 0.00015119602513869249, + "loss": 0.5296, + "step": 1500 + }, + { + "epoch": 1.5115577889447236, + "grad_norm": 0.384765625, + "learning_rate": 0.00015055814969104893, + "loss": 0.5178, + "step": 1504 + }, + { + "epoch": 1.5155778894472363, + "grad_norm": 0.375, + "learning_rate": 0.00014992026414960313, + "loss": 0.53, + "step": 1508 + }, + { + "epoch": 1.5195979899497487, + "grad_norm": 0.36328125, + "learning_rate": 0.0001492823800501323, + "loss": 0.5137, + "step": 1512 + }, + { + "epoch": 1.5236180904522612, + "grad_norm": 0.369140625, + "learning_rate": 0.0001486445089283877, + "loss": 0.4953, + "step": 1516 + }, + { + "epoch": 1.5276381909547738, + "grad_norm": 0.365234375, + "learning_rate": 0.00014800666231988574, + "loss": 0.5631, + "step": 1520 + }, + { + "epoch": 1.5316582914572865, + "grad_norm": 0.39453125, + "learning_rate": 0.0001473688517596996, + "loss": 0.5299, + "step": 1524 + }, + { + "epoch": 1.5356783919597992, + "grad_norm": 0.3671875, + "learning_rate": 0.0001467310887822506, + "loss": 0.509, + "step": 1528 + }, + { + "epoch": 1.5396984924623116, + "grad_norm": 0.37109375, + "learning_rate": 0.00014609338492109944, + "loss": 0.5143, + "step": 1532 + }, + { + "epoch": 1.543718592964824, + "grad_norm": 0.375, + "learning_rate": 0.00014545575170873777, + "loss": 0.5166, + "step": 1536 + }, + { + "epoch": 1.5477386934673367, + "grad_norm": 0.36328125, + "learning_rate": 0.00014481820067637966, + "loss": 0.5479, + "step": 1540 + }, + { + "epoch": 1.5517587939698494, + "grad_norm": 0.408203125, + "learning_rate": 0.00014418074335375297, + "loss": 0.4979, + "step": 1544 + }, + { + "epoch": 1.5557788944723618, + "grad_norm": 0.37890625, + "learning_rate": 0.00014354339126889084, + "loss": 0.5331, + "step": 1548 + }, + { + "epoch": 1.5597989949748743, + "grad_norm": 0.375, + "learning_rate": 0.00014290615594792335, + "loss": 0.5257, + "step": 1552 + }, + { + "epoch": 1.563819095477387, + "grad_norm": 0.359375, + "learning_rate": 0.00014226904891486878, + "loss": 0.5104, + "step": 1556 + }, + { + "epoch": 1.5678391959798996, + "grad_norm": 0.357421875, + "learning_rate": 0.0001416320816914256, + "loss": 0.5514, + "step": 1560 + }, + { + "epoch": 1.571859296482412, + "grad_norm": 0.35546875, + "learning_rate": 0.00014099526579676387, + "loss": 0.49, + "step": 1564 + }, + { + "epoch": 1.5758793969849245, + "grad_norm": 0.369140625, + "learning_rate": 0.0001403586127473168, + "loss": 0.5235, + "step": 1568 + }, + { + "epoch": 1.5798994974874372, + "grad_norm": 0.390625, + "learning_rate": 0.0001397221340565729, + "loss": 0.5119, + "step": 1572 + }, + { + "epoch": 1.5839195979899499, + "grad_norm": 0.3984375, + "learning_rate": 0.00013908584123486736, + "loss": 0.4947, + "step": 1576 + }, + { + "epoch": 1.5879396984924623, + "grad_norm": 0.353515625, + "learning_rate": 0.00013844974578917395, + "loss": 0.4872, + "step": 1580 + }, + { + "epoch": 1.5919597989949748, + "grad_norm": 0.37890625, + "learning_rate": 0.0001378138592228971, + "loss": 0.547, + "step": 1584 + }, + { + "epoch": 1.5959798994974874, + "grad_norm": 0.369140625, + "learning_rate": 0.0001371781930356639, + "loss": 0.5132, + "step": 1588 + }, + { + "epoch": 1.6, + "grad_norm": 0.392578125, + "learning_rate": 0.00013654275872311588, + "loss": 0.5251, + "step": 1592 + }, + { + "epoch": 1.6040201005025125, + "grad_norm": 0.375, + "learning_rate": 0.00013590756777670133, + "loss": 0.506, + "step": 1596 + }, + { + "epoch": 1.608040201005025, + "grad_norm": 0.37109375, + "learning_rate": 0.00013527263168346725, + "loss": 0.4885, + "step": 1600 + }, + { + "epoch": 1.6120603015075377, + "grad_norm": 0.369140625, + "learning_rate": 0.00013463796192585197, + "loss": 0.5728, + "step": 1604 + }, + { + "epoch": 1.6160804020100503, + "grad_norm": 0.353515625, + "learning_rate": 0.0001340035699814772, + "loss": 0.4981, + "step": 1608 + }, + { + "epoch": 1.6201005025125628, + "grad_norm": 0.396484375, + "learning_rate": 0.0001333694673229406, + "loss": 0.5385, + "step": 1612 + }, + { + "epoch": 1.6241206030150752, + "grad_norm": 0.34765625, + "learning_rate": 0.0001327356654176082, + "loss": 0.4765, + "step": 1616 + }, + { + "epoch": 1.6281407035175879, + "grad_norm": 0.34765625, + "learning_rate": 0.00013210217572740725, + "loss": 0.517, + "step": 1620 + }, + { + "epoch": 1.6321608040201006, + "grad_norm": 0.365234375, + "learning_rate": 0.00013146900970861856, + "loss": 0.5199, + "step": 1624 + }, + { + "epoch": 1.6361809045226132, + "grad_norm": 0.396484375, + "learning_rate": 0.00013083617881166971, + "loss": 0.5128, + "step": 1628 + }, + { + "epoch": 1.6402010050251257, + "grad_norm": 0.39453125, + "learning_rate": 0.0001302036944809277, + "loss": 0.5482, + "step": 1632 + }, + { + "epoch": 1.6442211055276381, + "grad_norm": 0.349609375, + "learning_rate": 0.00012957156815449216, + "loss": 0.4917, + "step": 1636 + }, + { + "epoch": 1.6482412060301508, + "grad_norm": 0.37890625, + "learning_rate": 0.00012893981126398837, + "loss": 0.515, + "step": 1640 + }, + { + "epoch": 1.6522613065326635, + "grad_norm": 0.3671875, + "learning_rate": 0.00012830843523436064, + "loss": 0.4891, + "step": 1644 + }, + { + "epoch": 1.656281407035176, + "grad_norm": 0.388671875, + "learning_rate": 0.00012767745148366556, + "loss": 0.5319, + "step": 1648 + }, + { + "epoch": 1.6603015075376883, + "grad_norm": 0.41796875, + "learning_rate": 0.00012704687142286563, + "loss": 0.4826, + "step": 1652 + }, + { + "epoch": 1.664321608040201, + "grad_norm": 0.35546875, + "learning_rate": 0.00012641670645562294, + "loss": 0.5107, + "step": 1656 + }, + { + "epoch": 1.6683417085427137, + "grad_norm": 0.416015625, + "learning_rate": 0.00012578696797809266, + "loss": 0.4988, + "step": 1660 + }, + { + "epoch": 1.6723618090452261, + "grad_norm": 0.35546875, + "learning_rate": 0.00012515766737871743, + "loss": 0.4736, + "step": 1664 + }, + { + "epoch": 1.6763819095477386, + "grad_norm": 0.357421875, + "learning_rate": 0.00012452881603802095, + "loss": 0.5087, + "step": 1668 + }, + { + "epoch": 1.6804020100502512, + "grad_norm": 0.375, + "learning_rate": 0.0001239004253284023, + "loss": 0.5101, + "step": 1672 + }, + { + "epoch": 1.684422110552764, + "grad_norm": 0.396484375, + "learning_rate": 0.00012327250661393037, + "loss": 0.5208, + "step": 1676 + }, + { + "epoch": 1.6884422110552764, + "grad_norm": 0.365234375, + "learning_rate": 0.0001226450712501384, + "loss": 0.5393, + "step": 1680 + }, + { + "epoch": 1.6924623115577888, + "grad_norm": 0.359375, + "learning_rate": 0.00012201813058381845, + "loss": 0.5202, + "step": 1684 + }, + { + "epoch": 1.6964824120603015, + "grad_norm": 0.36328125, + "learning_rate": 0.00012139169595281624, + "loss": 0.4888, + "step": 1688 + }, + { + "epoch": 1.7005025125628142, + "grad_norm": 0.3515625, + "learning_rate": 0.00012076577868582623, + "loss": 0.4949, + "step": 1692 + }, + { + "epoch": 1.7045226130653266, + "grad_norm": 0.37109375, + "learning_rate": 0.00012014039010218651, + "loss": 0.4858, + "step": 1696 + }, + { + "epoch": 1.708542713567839, + "grad_norm": 0.384765625, + "learning_rate": 0.00011951554151167443, + "loss": 0.5161, + "step": 1700 + }, + { + "epoch": 1.7125628140703517, + "grad_norm": 0.392578125, + "learning_rate": 0.00011889124421430179, + "loss": 0.4953, + "step": 1704 + }, + { + "epoch": 1.7165829145728644, + "grad_norm": 0.369140625, + "learning_rate": 0.00011826750950011057, + "loss": 0.5068, + "step": 1708 + }, + { + "epoch": 1.7206030150753768, + "grad_norm": 0.34765625, + "learning_rate": 0.00011764434864896884, + "loss": 0.5539, + "step": 1712 + }, + { + "epoch": 1.7246231155778895, + "grad_norm": 0.388671875, + "learning_rate": 0.00011702177293036667, + "loss": 0.5099, + "step": 1716 + }, + { + "epoch": 1.728643216080402, + "grad_norm": 0.369140625, + "learning_rate": 0.0001163997936032123, + "loss": 0.5331, + "step": 1720 + }, + { + "epoch": 1.7326633165829146, + "grad_norm": 0.341796875, + "learning_rate": 0.00011577842191562864, + "loss": 0.5269, + "step": 1724 + }, + { + "epoch": 1.7366834170854273, + "grad_norm": 0.380859375, + "learning_rate": 0.00011515766910474989, + "loss": 0.4833, + "step": 1728 + }, + { + "epoch": 1.7407035175879397, + "grad_norm": 0.39453125, + "learning_rate": 0.00011453754639651804, + "loss": 0.5658, + "step": 1732 + }, + { + "epoch": 1.7447236180904522, + "grad_norm": 0.37890625, + "learning_rate": 0.00011391806500548021, + "loss": 0.5175, + "step": 1736 + }, + { + "epoch": 1.7487437185929648, + "grad_norm": 0.375, + "learning_rate": 0.00011329923613458571, + "loss": 0.5106, + "step": 1740 + }, + { + "epoch": 1.7527638190954775, + "grad_norm": 0.353515625, + "learning_rate": 0.00011268107097498322, + "loss": 0.4766, + "step": 1744 + }, + { + "epoch": 1.75678391959799, + "grad_norm": 0.3671875, + "learning_rate": 0.00011206358070581876, + "loss": 0.4809, + "step": 1748 + }, + { + "epoch": 1.7608040201005024, + "grad_norm": 0.36328125, + "learning_rate": 0.00011144677649403329, + "loss": 0.4951, + "step": 1752 + }, + { + "epoch": 1.764824120603015, + "grad_norm": 0.384765625, + "learning_rate": 0.00011083066949416092, + "loss": 0.5347, + "step": 1756 + }, + { + "epoch": 1.7688442211055277, + "grad_norm": 0.373046875, + "learning_rate": 0.00011021527084812704, + "loss": 0.5442, + "step": 1760 + }, + { + "epoch": 1.7728643216080402, + "grad_norm": 0.375, + "learning_rate": 0.00010960059168504694, + "loss": 0.5528, + "step": 1764 + }, + { + "epoch": 1.7768844221105526, + "grad_norm": 0.37890625, + "learning_rate": 0.00010898664312102425, + "loss": 0.5639, + "step": 1768 + }, + { + "epoch": 1.7809045226130653, + "grad_norm": 0.392578125, + "learning_rate": 0.00010837343625895054, + "loss": 0.5303, + "step": 1772 + }, + { + "epoch": 1.784924623115578, + "grad_norm": 0.3828125, + "learning_rate": 0.00010776098218830389, + "loss": 0.5272, + "step": 1776 + }, + { + "epoch": 1.7889447236180904, + "grad_norm": 0.3828125, + "learning_rate": 0.00010714929198494866, + "loss": 0.5386, + "step": 1780 + }, + { + "epoch": 1.7929648241206029, + "grad_norm": 0.380859375, + "learning_rate": 0.00010653837671093511, + "loss": 0.5112, + "step": 1784 + }, + { + "epoch": 1.7969849246231155, + "grad_norm": 0.369140625, + "learning_rate": 0.00010592824741429945, + "loss": 0.5354, + "step": 1788 + }, + { + "epoch": 1.8010050251256282, + "grad_norm": 0.365234375, + "learning_rate": 0.00010531891512886384, + "loss": 0.5214, + "step": 1792 + }, + { + "epoch": 1.8050251256281407, + "grad_norm": 0.40625, + "learning_rate": 0.00010471039087403705, + "loss": 0.5422, + "step": 1796 + }, + { + "epoch": 1.809045226130653, + "grad_norm": 0.3515625, + "learning_rate": 0.00010410268565461506, + "loss": 0.5214, + "step": 1800 + }, + { + "epoch": 1.8130653266331658, + "grad_norm": 0.36328125, + "learning_rate": 0.00010349581046058215, + "loss": 0.5413, + "step": 1804 + }, + { + "epoch": 1.8170854271356784, + "grad_norm": 0.3671875, + "learning_rate": 0.00010288977626691202, + "loss": 0.5185, + "step": 1808 + }, + { + "epoch": 1.8211055276381911, + "grad_norm": 0.37109375, + "learning_rate": 0.00010228459403336941, + "loss": 0.538, + "step": 1812 + }, + { + "epoch": 1.8251256281407036, + "grad_norm": 0.38671875, + "learning_rate": 0.00010168027470431189, + "loss": 0.5167, + "step": 1816 + }, + { + "epoch": 1.829145728643216, + "grad_norm": 0.38671875, + "learning_rate": 0.00010107682920849185, + "loss": 0.5226, + "step": 1820 + }, + { + "epoch": 1.8331658291457287, + "grad_norm": 0.34765625, + "learning_rate": 0.00010047426845885903, + "loss": 0.4987, + "step": 1824 + }, + { + "epoch": 1.8371859296482413, + "grad_norm": 0.39453125, + "learning_rate": 9.987260335236297e-05, + "loss": 0.5597, + "step": 1828 + }, + { + "epoch": 1.8412060301507538, + "grad_norm": 0.375, + "learning_rate": 9.92718447697562e-05, + "loss": 0.5334, + "step": 1832 + }, + { + "epoch": 1.8452261306532662, + "grad_norm": 0.384765625, + "learning_rate": 9.867200357539708e-05, + "loss": 0.4883, + "step": 1836 + }, + { + "epoch": 1.849246231155779, + "grad_norm": 0.35546875, + "learning_rate": 9.807309061705372e-05, + "loss": 0.5091, + "step": 1840 + }, + { + "epoch": 1.8532663316582916, + "grad_norm": 0.37890625, + "learning_rate": 9.747511672570755e-05, + "loss": 0.5562, + "step": 1844 + }, + { + "epoch": 1.857286432160804, + "grad_norm": 0.36328125, + "learning_rate": 9.687809271535762e-05, + "loss": 0.5292, + "step": 1848 + }, + { + "epoch": 1.8613065326633165, + "grad_norm": 0.365234375, + "learning_rate": 9.628202938282493e-05, + "loss": 0.4938, + "step": 1852 + }, + { + "epoch": 1.8653266331658291, + "grad_norm": 0.376953125, + "learning_rate": 9.568693750755723e-05, + "loss": 0.5205, + "step": 1856 + }, + { + "epoch": 1.8693467336683418, + "grad_norm": 0.357421875, + "learning_rate": 9.50928278514338e-05, + "loss": 0.5144, + "step": 1860 + }, + { + "epoch": 1.8733668341708543, + "grad_norm": 0.388671875, + "learning_rate": 9.449971115857143e-05, + "loss": 0.5513, + "step": 1864 + }, + { + "epoch": 1.8773869346733667, + "grad_norm": 0.3828125, + "learning_rate": 9.390759815512959e-05, + "loss": 0.5286, + "step": 1868 + }, + { + "epoch": 1.8814070351758794, + "grad_norm": 0.388671875, + "learning_rate": 9.331649954911662e-05, + "loss": 0.5448, + "step": 1872 + }, + { + "epoch": 1.885427135678392, + "grad_norm": 0.373046875, + "learning_rate": 9.272642603019611e-05, + "loss": 0.5206, + "step": 1876 + }, + { + "epoch": 1.8894472361809045, + "grad_norm": 0.359375, + "learning_rate": 9.213738826949364e-05, + "loss": 0.5171, + "step": 1880 + }, + { + "epoch": 1.893467336683417, + "grad_norm": 0.38671875, + "learning_rate": 9.154939691940357e-05, + "loss": 0.5161, + "step": 1884 + }, + { + "epoch": 1.8974874371859296, + "grad_norm": 0.369140625, + "learning_rate": 9.096246261339669e-05, + "loss": 0.5415, + "step": 1888 + }, + { + "epoch": 1.9015075376884423, + "grad_norm": 0.373046875, + "learning_rate": 9.03765959658277e-05, + "loss": 0.5354, + "step": 1892 + }, + { + "epoch": 1.9055276381909547, + "grad_norm": 0.3671875, + "learning_rate": 8.979180757174341e-05, + "loss": 0.5186, + "step": 1896 + }, + { + "epoch": 1.9095477386934674, + "grad_norm": 0.37890625, + "learning_rate": 8.920810800669098e-05, + "loss": 0.5099, + "step": 1900 + }, + { + "epoch": 1.9135678391959798, + "grad_norm": 0.380859375, + "learning_rate": 8.862550782652686e-05, + "loss": 0.4908, + "step": 1904 + }, + { + "epoch": 1.9175879396984925, + "grad_norm": 0.3828125, + "learning_rate": 8.804401756722564e-05, + "loss": 0.5253, + "step": 1908 + }, + { + "epoch": 1.9216080402010052, + "grad_norm": 0.37890625, + "learning_rate": 8.746364774468973e-05, + "loss": 0.5318, + "step": 1912 + }, + { + "epoch": 1.9256281407035176, + "grad_norm": 0.37890625, + "learning_rate": 8.688440885455922e-05, + "loss": 0.516, + "step": 1916 + }, + { + "epoch": 1.92964824120603, + "grad_norm": 0.392578125, + "learning_rate": 8.630631137202172e-05, + "loss": 0.5279, + "step": 1920 + }, + { + "epoch": 1.9336683417085427, + "grad_norm": 0.36328125, + "learning_rate": 8.572936575162345e-05, + "loss": 0.522, + "step": 1924 + }, + { + "epoch": 1.9376884422110554, + "grad_norm": 0.384765625, + "learning_rate": 8.515358242707971e-05, + "loss": 0.5256, + "step": 1928 + }, + { + "epoch": 1.9417085427135679, + "grad_norm": 0.3671875, + "learning_rate": 8.457897181108633e-05, + "loss": 0.5411, + "step": 1932 + }, + { + "epoch": 1.9457286432160803, + "grad_norm": 0.390625, + "learning_rate": 8.400554429513164e-05, + "loss": 0.5088, + "step": 1936 + }, + { + "epoch": 1.949748743718593, + "grad_norm": 0.37109375, + "learning_rate": 8.343331024930805e-05, + "loss": 0.5169, + "step": 1940 + }, + { + "epoch": 1.9537688442211056, + "grad_norm": 0.37890625, + "learning_rate": 8.286228002212506e-05, + "loss": 0.5431, + "step": 1944 + }, + { + "epoch": 1.957788944723618, + "grad_norm": 0.359375, + "learning_rate": 8.229246394032151e-05, + "loss": 0.4912, + "step": 1948 + }, + { + "epoch": 1.9618090452261305, + "grad_norm": 0.359375, + "learning_rate": 8.172387230867946e-05, + "loss": 0.504, + "step": 1952 + }, + { + "epoch": 1.9658291457286432, + "grad_norm": 0.37109375, + "learning_rate": 8.115651540983735e-05, + "loss": 0.5055, + "step": 1956 + }, + { + "epoch": 1.9698492462311559, + "grad_norm": 0.357421875, + "learning_rate": 8.059040350410414e-05, + "loss": 0.4912, + "step": 1960 + }, + { + "epoch": 1.9738693467336683, + "grad_norm": 0.40625, + "learning_rate": 8.00255468292741e-05, + "loss": 0.519, + "step": 1964 + }, + { + "epoch": 1.9778894472361808, + "grad_norm": 0.38671875, + "learning_rate": 7.946195560044113e-05, + "loss": 0.5418, + "step": 1968 + }, + { + "epoch": 1.9819095477386934, + "grad_norm": 0.37890625, + "learning_rate": 7.889964000981446e-05, + "loss": 0.5355, + "step": 1972 + }, + { + "epoch": 1.985929648241206, + "grad_norm": 0.359375, + "learning_rate": 7.833861022653428e-05, + "loss": 0.4856, + "step": 1976 + }, + { + "epoch": 1.9899497487437185, + "grad_norm": 0.375, + "learning_rate": 7.777887639648728e-05, + "loss": 0.5165, + "step": 1980 + }, + { + "epoch": 1.993969849246231, + "grad_norm": 0.400390625, + "learning_rate": 7.722044864212408e-05, + "loss": 0.5453, + "step": 1984 + }, + { + "epoch": 1.9979899497487437, + "grad_norm": 0.3359375, + "learning_rate": 7.666333706227556e-05, + "loss": 0.5159, + "step": 1988 + }, + { + "epoch": 2.0020100502512563, + "grad_norm": 0.337890625, + "learning_rate": 7.610755173197023e-05, + "loss": 0.4954, + "step": 1992 + }, + { + "epoch": 2.006030150753769, + "grad_norm": 0.328125, + "learning_rate": 7.555310270225238e-05, + "loss": 0.4156, + "step": 1996 + }, + { + "epoch": 2.0100502512562812, + "grad_norm": 0.353515625, + "learning_rate": 7.500000000000002e-05, + "loss": 0.4518, + "step": 2000 + }, + { + "epoch": 2.014070351758794, + "grad_norm": 0.345703125, + "learning_rate": 7.444825362774351e-05, + "loss": 0.4622, + "step": 2004 + }, + { + "epoch": 2.0180904522613066, + "grad_norm": 0.361328125, + "learning_rate": 7.389787356348506e-05, + "loss": 0.502, + "step": 2008 + }, + { + "epoch": 2.0221105527638192, + "grad_norm": 0.359375, + "learning_rate": 7.334886976051775e-05, + "loss": 0.4602, + "step": 2012 + }, + { + "epoch": 2.0261306532663315, + "grad_norm": 0.353515625, + "learning_rate": 7.280125214724605e-05, + "loss": 0.4535, + "step": 2016 + }, + { + "epoch": 2.030150753768844, + "grad_norm": 0.3359375, + "learning_rate": 7.22550306270058e-05, + "loss": 0.4341, + "step": 2020 + }, + { + "epoch": 2.034170854271357, + "grad_norm": 0.361328125, + "learning_rate": 7.171021507788542e-05, + "loss": 0.3997, + "step": 2024 + }, + { + "epoch": 2.0381909547738695, + "grad_norm": 0.37890625, + "learning_rate": 7.116681535254728e-05, + "loss": 0.4226, + "step": 2028 + }, + { + "epoch": 2.0422110552763817, + "grad_norm": 0.34375, + "learning_rate": 7.062484127804927e-05, + "loss": 0.4244, + "step": 2032 + }, + { + "epoch": 2.0462311557788944, + "grad_norm": 0.369140625, + "learning_rate": 7.008430265566738e-05, + "loss": 0.4473, + "step": 2036 + }, + { + "epoch": 2.050251256281407, + "grad_norm": 0.345703125, + "learning_rate": 6.954520926071815e-05, + "loss": 0.4669, + "step": 2040 + }, + { + "epoch": 2.0542713567839197, + "grad_norm": 0.373046875, + "learning_rate": 6.900757084238225e-05, + "loss": 0.4699, + "step": 2044 + }, + { + "epoch": 2.0582914572864324, + "grad_norm": 0.369140625, + "learning_rate": 6.847139712352783e-05, + "loss": 0.3894, + "step": 2048 + }, + { + "epoch": 2.0623115577889446, + "grad_norm": 0.349609375, + "learning_rate": 6.793669780053477e-05, + "loss": 0.43, + "step": 2052 + }, + { + "epoch": 2.0663316582914573, + "grad_norm": 0.35546875, + "learning_rate": 6.740348254311956e-05, + "loss": 0.4588, + "step": 2056 + }, + { + "epoch": 2.07035175879397, + "grad_norm": 0.361328125, + "learning_rate": 6.687176099416022e-05, + "loss": 0.477, + "step": 2060 + }, + { + "epoch": 2.0743718592964826, + "grad_norm": 0.3671875, + "learning_rate": 6.634154276952179e-05, + "loss": 0.4173, + "step": 2064 + }, + { + "epoch": 2.078391959798995, + "grad_norm": 0.365234375, + "learning_rate": 6.581283745788287e-05, + "loss": 0.4395, + "step": 2068 + }, + { + "epoch": 2.0824120603015075, + "grad_norm": 0.36328125, + "learning_rate": 6.528565462056154e-05, + "loss": 0.455, + "step": 2072 + }, + { + "epoch": 2.08643216080402, + "grad_norm": 0.3828125, + "learning_rate": 6.47600037913432e-05, + "loss": 0.4537, + "step": 2076 + }, + { + "epoch": 2.090452261306533, + "grad_norm": 0.3671875, + "learning_rate": 6.423589447630772e-05, + "loss": 0.4805, + "step": 2080 + }, + { + "epoch": 2.094472361809045, + "grad_norm": 0.35546875, + "learning_rate": 6.371333615365747e-05, + "loss": 0.447, + "step": 2084 + }, + { + "epoch": 2.0984924623115577, + "grad_norm": 0.392578125, + "learning_rate": 6.319233827354623e-05, + "loss": 0.4475, + "step": 2088 + }, + { + "epoch": 2.1025125628140704, + "grad_norm": 0.37109375, + "learning_rate": 6.267291025790803e-05, + "loss": 0.4904, + "step": 2092 + }, + { + "epoch": 2.106532663316583, + "grad_norm": 0.37109375, + "learning_rate": 6.215506150028676e-05, + "loss": 0.4489, + "step": 2096 + }, + { + "epoch": 2.1105527638190953, + "grad_norm": 0.34765625, + "learning_rate": 6.163880136566658e-05, + "loss": 0.4131, + "step": 2100 + }, + { + "epoch": 2.114572864321608, + "grad_norm": 0.373046875, + "learning_rate": 6.112413919030214e-05, + "loss": 0.4356, + "step": 2104 + }, + { + "epoch": 2.1185929648241206, + "grad_norm": 0.369140625, + "learning_rate": 6.061108428155014e-05, + "loss": 0.4512, + "step": 2108 + }, + { + "epoch": 2.1226130653266333, + "grad_norm": 0.361328125, + "learning_rate": 6.0099645917700655e-05, + "loss": 0.4485, + "step": 2112 + }, + { + "epoch": 2.1266331658291455, + "grad_norm": 0.384765625, + "learning_rate": 5.9589833347809726e-05, + "loss": 0.4486, + "step": 2116 + }, + { + "epoch": 2.130653266331658, + "grad_norm": 0.375, + "learning_rate": 5.908165579153169e-05, + "loss": 0.4717, + "step": 2120 + }, + { + "epoch": 2.134673366834171, + "grad_norm": 0.37890625, + "learning_rate": 5.857512243895267e-05, + "loss": 0.4452, + "step": 2124 + }, + { + "epoch": 2.1386934673366835, + "grad_norm": 0.3828125, + "learning_rate": 5.8070242450424495e-05, + "loss": 0.4552, + "step": 2128 + }, + { + "epoch": 2.1427135678391958, + "grad_norm": 0.37890625, + "learning_rate": 5.756702495639871e-05, + "loss": 0.4634, + "step": 2132 + }, + { + "epoch": 2.1467336683417084, + "grad_norm": 0.369140625, + "learning_rate": 5.706547905726178e-05, + "loss": 0.4308, + "step": 2136 + }, + { + "epoch": 2.150753768844221, + "grad_norm": 0.359375, + "learning_rate": 5.656561382317047e-05, + "loss": 0.4589, + "step": 2140 + }, + { + "epoch": 2.1547738693467338, + "grad_norm": 0.376953125, + "learning_rate": 5.6067438293887346e-05, + "loss": 0.4743, + "step": 2144 + }, + { + "epoch": 2.1587939698492464, + "grad_norm": 0.375, + "learning_rate": 5.557096147861804e-05, + "loss": 0.4762, + "step": 2148 + }, + { + "epoch": 2.1628140703517587, + "grad_norm": 0.3984375, + "learning_rate": 5.50761923558479e-05, + "loss": 0.4836, + "step": 2152 + }, + { + "epoch": 2.1668341708542713, + "grad_norm": 0.38671875, + "learning_rate": 5.458313987317952e-05, + "loss": 0.4698, + "step": 2156 + }, + { + "epoch": 2.170854271356784, + "grad_norm": 0.3828125, + "learning_rate": 5.4091812947171285e-05, + "loss": 0.4198, + "step": 2160 + }, + { + "epoch": 2.1748743718592967, + "grad_norm": 0.373046875, + "learning_rate": 5.3602220463175784e-05, + "loss": 0.4556, + "step": 2164 + }, + { + "epoch": 2.178894472361809, + "grad_norm": 0.404296875, + "learning_rate": 5.3114371275179254e-05, + "loss": 0.4487, + "step": 2168 + }, + { + "epoch": 2.1829145728643216, + "grad_norm": 0.390625, + "learning_rate": 5.262827420564162e-05, + "loss": 0.4466, + "step": 2172 + }, + { + "epoch": 2.1869346733668342, + "grad_norm": 0.36328125, + "learning_rate": 5.214393804533662e-05, + "loss": 0.4432, + "step": 2176 + }, + { + "epoch": 2.190954773869347, + "grad_norm": 0.384765625, + "learning_rate": 5.166137155319317e-05, + "loss": 0.4661, + "step": 2180 + }, + { + "epoch": 2.194974874371859, + "grad_norm": 0.373046875, + "learning_rate": 5.118058345613661e-05, + "loss": 0.4527, + "step": 2184 + }, + { + "epoch": 2.198994974874372, + "grad_norm": 0.357421875, + "learning_rate": 5.0701582448931284e-05, + "loss": 0.4467, + "step": 2188 + }, + { + "epoch": 2.2030150753768845, + "grad_norm": 0.37890625, + "learning_rate": 5.0224377194022936e-05, + "loss": 0.4384, + "step": 2192 + }, + { + "epoch": 2.207035175879397, + "grad_norm": 0.375, + "learning_rate": 4.974897632138219e-05, + "loss": 0.4337, + "step": 2196 + }, + { + "epoch": 2.2110552763819094, + "grad_norm": 0.361328125, + "learning_rate": 4.927538842834865e-05, + "loss": 0.427, + "step": 2200 + }, + { + "epoch": 2.215075376884422, + "grad_norm": 0.357421875, + "learning_rate": 4.880362207947512e-05, + "loss": 0.4358, + "step": 2204 + }, + { + "epoch": 2.2190954773869347, + "grad_norm": 0.373046875, + "learning_rate": 4.8333685806373014e-05, + "loss": 0.4364, + "step": 2208 + }, + { + "epoch": 2.2231155778894474, + "grad_norm": 0.369140625, + "learning_rate": 4.7865588107557773e-05, + "loss": 0.4491, + "step": 2212 + }, + { + "epoch": 2.22713567839196, + "grad_norm": 0.3671875, + "learning_rate": 4.7399337448295386e-05, + "loss": 0.4603, + "step": 2216 + }, + { + "epoch": 2.2311557788944723, + "grad_norm": 0.3984375, + "learning_rate": 4.6934942260449314e-05, + "loss": 0.4182, + "step": 2220 + }, + { + "epoch": 2.235175879396985, + "grad_norm": 0.365234375, + "learning_rate": 4.6472410942327806e-05, + "loss": 0.4323, + "step": 2224 + }, + { + "epoch": 2.2391959798994976, + "grad_norm": 0.353515625, + "learning_rate": 4.601175185853222e-05, + "loss": 0.434, + "step": 2228 + }, + { + "epoch": 2.24321608040201, + "grad_norm": 0.369140625, + "learning_rate": 4.5552973339805775e-05, + "loss": 0.435, + "step": 2232 + }, + { + "epoch": 2.2472361809045225, + "grad_norm": 0.36328125, + "learning_rate": 4.509608368288249e-05, + "loss": 0.4539, + "step": 2236 + }, + { + "epoch": 2.251256281407035, + "grad_norm": 0.38671875, + "learning_rate": 4.4641091150337774e-05, + "loss": 0.454, + "step": 2240 + }, + { + "epoch": 2.255276381909548, + "grad_norm": 0.40234375, + "learning_rate": 4.418800397043857e-05, + "loss": 0.4908, + "step": 2244 + }, + { + "epoch": 2.2592964824120605, + "grad_norm": 0.369140625, + "learning_rate": 4.373683033699459e-05, + "loss": 0.4652, + "step": 2248 + }, + { + "epoch": 2.2633165829145727, + "grad_norm": 0.365234375, + "learning_rate": 4.328757840921033e-05, + "loss": 0.4229, + "step": 2252 + }, + { + "epoch": 2.2673366834170854, + "grad_norm": 0.396484375, + "learning_rate": 4.2840256311537305e-05, + "loss": 0.462, + "step": 2256 + }, + { + "epoch": 2.271356783919598, + "grad_norm": 0.38671875, + "learning_rate": 4.239487213352716e-05, + "loss": 0.449, + "step": 2260 + }, + { + "epoch": 2.2753768844221107, + "grad_norm": 0.40625, + "learning_rate": 4.195143392968563e-05, + "loss": 0.4464, + "step": 2264 + }, + { + "epoch": 2.279396984924623, + "grad_norm": 0.373046875, + "learning_rate": 4.150994971932643e-05, + "loss": 0.4218, + "step": 2268 + }, + { + "epoch": 2.2834170854271356, + "grad_norm": 0.38671875, + "learning_rate": 4.1070427486426674e-05, + "loss": 0.4373, + "step": 2272 + }, + { + "epoch": 2.2874371859296483, + "grad_norm": 0.357421875, + "learning_rate": 4.0632875179482114e-05, + "loss": 0.4874, + "step": 2276 + }, + { + "epoch": 2.291457286432161, + "grad_norm": 0.359375, + "learning_rate": 4.019730071136379e-05, + "loss": 0.4531, + "step": 2280 + }, + { + "epoch": 2.295477386934673, + "grad_norm": 0.369140625, + "learning_rate": 3.97637119591745e-05, + "loss": 0.4166, + "step": 2284 + }, + { + "epoch": 2.299497487437186, + "grad_norm": 0.39453125, + "learning_rate": 3.933211676410664e-05, + "loss": 0.4153, + "step": 2288 + }, + { + "epoch": 2.3035175879396985, + "grad_norm": 0.392578125, + "learning_rate": 3.8902522931300416e-05, + "loss": 0.4459, + "step": 2292 + }, + { + "epoch": 2.307537688442211, + "grad_norm": 0.369140625, + "learning_rate": 3.847493822970241e-05, + "loss": 0.4454, + "step": 2296 + }, + { + "epoch": 2.3115577889447234, + "grad_norm": 0.3671875, + "learning_rate": 3.8049370391925424e-05, + "loss": 0.4405, + "step": 2300 + }, + { + "epoch": 2.315577889447236, + "grad_norm": 0.35546875, + "learning_rate": 3.7625827114108533e-05, + "loss": 0.4672, + "step": 2304 + }, + { + "epoch": 2.3195979899497488, + "grad_norm": 0.376953125, + "learning_rate": 3.72043160557776e-05, + "loss": 0.4154, + "step": 2308 + }, + { + "epoch": 2.3236180904522614, + "grad_norm": 0.365234375, + "learning_rate": 3.678484483970731e-05, + "loss": 0.4802, + "step": 2312 + }, + { + "epoch": 2.327638190954774, + "grad_norm": 0.37890625, + "learning_rate": 3.636742105178281e-05, + "loss": 0.4569, + "step": 2316 + }, + { + "epoch": 2.3316582914572863, + "grad_norm": 0.39453125, + "learning_rate": 3.5952052240862885e-05, + "loss": 0.4539, + "step": 2320 + }, + { + "epoch": 2.335678391959799, + "grad_norm": 0.361328125, + "learning_rate": 3.553874591864325e-05, + "loss": 0.4037, + "step": 2324 + }, + { + "epoch": 2.3396984924623117, + "grad_norm": 0.353515625, + "learning_rate": 3.5127509559520715e-05, + "loss": 0.4787, + "step": 2328 + }, + { + "epoch": 2.343718592964824, + "grad_norm": 0.3515625, + "learning_rate": 3.471835060045804e-05, + "loss": 0.3874, + "step": 2332 + }, + { + "epoch": 2.3477386934673365, + "grad_norm": 0.3828125, + "learning_rate": 3.431127644084953e-05, + "loss": 0.4441, + "step": 2336 + }, + { + "epoch": 2.351758793969849, + "grad_norm": 0.369140625, + "learning_rate": 3.390629444238704e-05, + "loss": 0.4572, + "step": 2340 + }, + { + "epoch": 2.355778894472362, + "grad_norm": 0.376953125, + "learning_rate": 3.350341192892708e-05, + "loss": 0.4298, + "step": 2344 + }, + { + "epoch": 2.3597989949748746, + "grad_norm": 0.345703125, + "learning_rate": 3.310263618635807e-05, + "loss": 0.4108, + "step": 2348 + }, + { + "epoch": 2.363819095477387, + "grad_norm": 0.388671875, + "learning_rate": 3.2703974462468896e-05, + "loss": 0.4497, + "step": 2352 + }, + { + "epoch": 2.3678391959798994, + "grad_norm": 0.365234375, + "learning_rate": 3.2307433966817594e-05, + "loss": 0.4525, + "step": 2356 + }, + { + "epoch": 2.371859296482412, + "grad_norm": 0.37890625, + "learning_rate": 3.191302187060103e-05, + "loss": 0.4586, + "step": 2360 + }, + { + "epoch": 2.375879396984925, + "grad_norm": 0.376953125, + "learning_rate": 3.152074530652539e-05, + "loss": 0.4659, + "step": 2364 + }, + { + "epoch": 2.379899497487437, + "grad_norm": 0.353515625, + "learning_rate": 3.113061136867685e-05, + "loss": 0.4744, + "step": 2368 + }, + { + "epoch": 2.3839195979899497, + "grad_norm": 0.345703125, + "learning_rate": 3.074262711239367e-05, + "loss": 0.4177, + "step": 2372 + }, + { + "epoch": 2.3879396984924623, + "grad_norm": 0.359375, + "learning_rate": 3.0356799554138256e-05, + "loss": 0.4621, + "step": 2376 + }, + { + "epoch": 2.391959798994975, + "grad_norm": 0.357421875, + "learning_rate": 2.9973135671370452e-05, + "loss": 0.4114, + "step": 2380 + }, + { + "epoch": 2.3959798994974877, + "grad_norm": 0.337890625, + "learning_rate": 2.959164240242145e-05, + "loss": 0.4193, + "step": 2384 + }, + { + "epoch": 2.4, + "grad_norm": 0.365234375, + "learning_rate": 2.9212326646367995e-05, + "loss": 0.4469, + "step": 2388 + }, + { + "epoch": 2.4040201005025126, + "grad_norm": 0.357421875, + "learning_rate": 2.883519526290798e-05, + "loss": 0.4398, + "step": 2392 + }, + { + "epoch": 2.4080402010050252, + "grad_norm": 0.39453125, + "learning_rate": 2.8460255072236226e-05, + "loss": 0.4424, + "step": 2396 + }, + { + "epoch": 2.4120603015075375, + "grad_norm": 0.373046875, + "learning_rate": 2.8087512854921044e-05, + "loss": 0.4459, + "step": 2400 + }, + { + "epoch": 2.41608040201005, + "grad_norm": 0.37890625, + "learning_rate": 2.7716975351781772e-05, + "loss": 0.4363, + "step": 2404 + }, + { + "epoch": 2.420100502512563, + "grad_norm": 0.376953125, + "learning_rate": 2.734864926376677e-05, + "loss": 0.4646, + "step": 2408 + }, + { + "epoch": 2.4241206030150755, + "grad_norm": 0.357421875, + "learning_rate": 2.698254125183236e-05, + "loss": 0.4419, + "step": 2412 + }, + { + "epoch": 2.428140703517588, + "grad_norm": 0.353515625, + "learning_rate": 2.6618657936822308e-05, + "loss": 0.421, + "step": 2416 + }, + { + "epoch": 2.4321608040201004, + "grad_norm": 0.369140625, + "learning_rate": 2.6257005899347962e-05, + "loss": 0.4533, + "step": 2420 + }, + { + "epoch": 2.436180904522613, + "grad_norm": 0.357421875, + "learning_rate": 2.5897591679669367e-05, + "loss": 0.4531, + "step": 2424 + }, + { + "epoch": 2.4402010050251257, + "grad_norm": 0.341796875, + "learning_rate": 2.5540421777577114e-05, + "loss": 0.3943, + "step": 2428 + }, + { + "epoch": 2.4442211055276384, + "grad_norm": 0.361328125, + "learning_rate": 2.5185502652274475e-05, + "loss": 0.4607, + "step": 2432 + }, + { + "epoch": 2.4482412060301506, + "grad_norm": 0.388671875, + "learning_rate": 2.4832840722260915e-05, + "loss": 0.4372, + "step": 2436 + }, + { + "epoch": 2.4522613065326633, + "grad_norm": 0.36328125, + "learning_rate": 2.4482442365215788e-05, + "loss": 0.4316, + "step": 2440 + }, + { + "epoch": 2.456281407035176, + "grad_norm": 0.392578125, + "learning_rate": 2.413431391788317e-05, + "loss": 0.4207, + "step": 2444 + }, + { + "epoch": 2.4603015075376886, + "grad_norm": 0.369140625, + "learning_rate": 2.3788461675957094e-05, + "loss": 0.4368, + "step": 2448 + }, + { + "epoch": 2.464321608040201, + "grad_norm": 0.37890625, + "learning_rate": 2.3444891893967804e-05, + "loss": 0.4642, + "step": 2452 + }, + { + "epoch": 2.4683417085427135, + "grad_norm": 0.37890625, + "learning_rate": 2.3103610785168714e-05, + "loss": 0.4541, + "step": 2456 + }, + { + "epoch": 2.472361809045226, + "grad_norm": 0.375, + "learning_rate": 2.2764624521423824e-05, + "loss": 0.4488, + "step": 2460 + }, + { + "epoch": 2.476381909547739, + "grad_norm": 0.36328125, + "learning_rate": 2.2427939233096355e-05, + "loss": 0.446, + "step": 2464 + }, + { + "epoch": 2.480402010050251, + "grad_norm": 0.34375, + "learning_rate": 2.2093561008937703e-05, + "loss": 0.4293, + "step": 2468 + }, + { + "epoch": 2.4844221105527637, + "grad_norm": 0.369140625, + "learning_rate": 2.1761495895977372e-05, + "loss": 0.4381, + "step": 2472 + }, + { + "epoch": 2.4884422110552764, + "grad_norm": 0.373046875, + "learning_rate": 2.1431749899413726e-05, + "loss": 0.4362, + "step": 2476 + }, + { + "epoch": 2.492462311557789, + "grad_norm": 0.359375, + "learning_rate": 2.1104328982505185e-05, + "loss": 0.407, + "step": 2480 + }, + { + "epoch": 2.4964824120603017, + "grad_norm": 0.3828125, + "learning_rate": 2.0779239066462595e-05, + "loss": 0.4392, + "step": 2484 + }, + { + "epoch": 2.500502512562814, + "grad_norm": 0.392578125, + "learning_rate": 2.0456486030342057e-05, + "loss": 0.4642, + "step": 2488 + }, + { + "epoch": 2.5045226130653266, + "grad_norm": 0.384765625, + "learning_rate": 2.013607571093852e-05, + "loss": 0.4557, + "step": 2492 + }, + { + "epoch": 2.5085427135678393, + "grad_norm": 0.380859375, + "learning_rate": 1.981801390268034e-05, + "loss": 0.4448, + "step": 2496 + }, + { + "epoch": 2.5125628140703515, + "grad_norm": 0.376953125, + "learning_rate": 1.9502306357524443e-05, + "loss": 0.4698, + "step": 2500 + } + ], + "logging_steps": 4, + "max_steps": 2985, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.6357403205555978e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}