{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5088, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00019654088050314466, "grad_norm": 2.328125, "learning_rate": 1.9646365422396855e-07, "loss": 2.5088, "step": 1 }, { "epoch": 0.0009827044025157233, "grad_norm": 2.3125, "learning_rate": 9.823182711198429e-07, "loss": 2.5011, "step": 5 }, { "epoch": 0.0019654088050314465, "grad_norm": 2.34375, "learning_rate": 1.9646365422396858e-06, "loss": 2.507, "step": 10 }, { "epoch": 0.00294811320754717, "grad_norm": 2.28125, "learning_rate": 2.946954813359529e-06, "loss": 2.495, "step": 15 }, { "epoch": 0.003930817610062893, "grad_norm": 2.25, "learning_rate": 3.9292730844793715e-06, "loss": 2.4677, "step": 20 }, { "epoch": 0.004913522012578616, "grad_norm": 2.109375, "learning_rate": 4.911591355599214e-06, "loss": 2.4155, "step": 25 }, { "epoch": 0.00589622641509434, "grad_norm": 1.9609375, "learning_rate": 5.893909626719058e-06, "loss": 2.3711, "step": 30 }, { "epoch": 0.006878930817610063, "grad_norm": 1.78125, "learning_rate": 6.876227897838901e-06, "loss": 2.3188, "step": 35 }, { "epoch": 0.007861635220125786, "grad_norm": 1.6328125, "learning_rate": 7.858546168958743e-06, "loss": 2.2165, "step": 40 }, { "epoch": 0.00884433962264151, "grad_norm": 1.3125, "learning_rate": 8.840864440078587e-06, "loss": 2.1073, "step": 45 }, { "epoch": 0.009827044025157232, "grad_norm": 1.046875, "learning_rate": 9.823182711198428e-06, "loss": 2.0433, "step": 50 }, { "epoch": 0.010809748427672955, "grad_norm": 0.921875, "learning_rate": 1.0805500982318271e-05, "loss": 1.9535, "step": 55 }, { "epoch": 0.01179245283018868, "grad_norm": 0.80859375, "learning_rate": 1.1787819253438115e-05, "loss": 1.8931, "step": 60 }, { "epoch": 0.012775157232704403, "grad_norm": 0.6875, "learning_rate": 1.2770137524557958e-05, "loss": 1.8378, "step": 65 }, { "epoch": 0.013757861635220126, "grad_norm": 0.546875, "learning_rate": 1.3752455795677802e-05, "loss": 1.7701, "step": 70 }, { "epoch": 0.01474056603773585, "grad_norm": 0.4609375, "learning_rate": 1.4734774066797644e-05, "loss": 1.7103, "step": 75 }, { "epoch": 0.015723270440251572, "grad_norm": 0.447265625, "learning_rate": 1.5717092337917486e-05, "loss": 1.6636, "step": 80 }, { "epoch": 0.016705974842767295, "grad_norm": 0.427734375, "learning_rate": 1.669941060903733e-05, "loss": 1.6086, "step": 85 }, { "epoch": 0.01768867924528302, "grad_norm": 0.384765625, "learning_rate": 1.7681728880157174e-05, "loss": 1.5668, "step": 90 }, { "epoch": 0.01867138364779874, "grad_norm": 0.36328125, "learning_rate": 1.8664047151277013e-05, "loss": 1.5144, "step": 95 }, { "epoch": 0.019654088050314465, "grad_norm": 0.365234375, "learning_rate": 1.9646365422396855e-05, "loss": 1.4948, "step": 100 }, { "epoch": 0.020636792452830188, "grad_norm": 0.375, "learning_rate": 2.06286836935167e-05, "loss": 1.4615, "step": 105 }, { "epoch": 0.02161949685534591, "grad_norm": 0.41015625, "learning_rate": 2.1611001964636543e-05, "loss": 1.4433, "step": 110 }, { "epoch": 0.022602201257861634, "grad_norm": 0.5703125, "learning_rate": 2.2593320235756385e-05, "loss": 1.4125, "step": 115 }, { "epoch": 0.02358490566037736, "grad_norm": 0.337890625, "learning_rate": 2.357563850687623e-05, "loss": 1.408, "step": 120 }, { "epoch": 0.024567610062893083, "grad_norm": 0.50390625, "learning_rate": 2.4557956777996073e-05, "loss": 1.4023, "step": 125 }, { "epoch": 0.025550314465408806, "grad_norm": 0.24609375, "learning_rate": 2.5540275049115915e-05, "loss": 1.3823, "step": 130 }, { "epoch": 0.02653301886792453, "grad_norm": 0.2314453125, "learning_rate": 2.6522593320235754e-05, "loss": 1.3891, "step": 135 }, { "epoch": 0.027515723270440252, "grad_norm": 0.2236328125, "learning_rate": 2.7504911591355603e-05, "loss": 1.356, "step": 140 }, { "epoch": 0.028498427672955975, "grad_norm": 0.2177734375, "learning_rate": 2.8487229862475445e-05, "loss": 1.3275, "step": 145 }, { "epoch": 0.0294811320754717, "grad_norm": 0.216796875, "learning_rate": 2.9469548133595288e-05, "loss": 1.3491, "step": 150 }, { "epoch": 0.03046383647798742, "grad_norm": 0.23828125, "learning_rate": 3.045186640471513e-05, "loss": 1.3364, "step": 155 }, { "epoch": 0.031446540880503145, "grad_norm": 0.2275390625, "learning_rate": 3.143418467583497e-05, "loss": 1.3052, "step": 160 }, { "epoch": 0.03242924528301887, "grad_norm": 0.228515625, "learning_rate": 3.241650294695481e-05, "loss": 1.3164, "step": 165 }, { "epoch": 0.03341194968553459, "grad_norm": 0.2421875, "learning_rate": 3.339882121807466e-05, "loss": 1.3071, "step": 170 }, { "epoch": 0.034394654088050314, "grad_norm": 0.228515625, "learning_rate": 3.43811394891945e-05, "loss": 1.3032, "step": 175 }, { "epoch": 0.03537735849056604, "grad_norm": 0.259765625, "learning_rate": 3.536345776031435e-05, "loss": 1.2914, "step": 180 }, { "epoch": 0.03636006289308176, "grad_norm": 0.21875, "learning_rate": 3.634577603143419e-05, "loss": 1.2698, "step": 185 }, { "epoch": 0.03734276729559748, "grad_norm": 0.263671875, "learning_rate": 3.7328094302554026e-05, "loss": 1.2984, "step": 190 }, { "epoch": 0.038325471698113206, "grad_norm": 0.326171875, "learning_rate": 3.831041257367387e-05, "loss": 1.2854, "step": 195 }, { "epoch": 0.03930817610062893, "grad_norm": 0.37109375, "learning_rate": 3.929273084479371e-05, "loss": 1.2781, "step": 200 }, { "epoch": 0.04029088050314465, "grad_norm": 0.29296875, "learning_rate": 4.027504911591356e-05, "loss": 1.285, "step": 205 }, { "epoch": 0.041273584905660375, "grad_norm": 0.283203125, "learning_rate": 4.12573673870334e-05, "loss": 1.2705, "step": 210 }, { "epoch": 0.0422562893081761, "grad_norm": 0.291015625, "learning_rate": 4.223968565815325e-05, "loss": 1.2603, "step": 215 }, { "epoch": 0.04323899371069182, "grad_norm": 0.3515625, "learning_rate": 4.3222003929273086e-05, "loss": 1.2613, "step": 220 }, { "epoch": 0.044221698113207544, "grad_norm": 0.361328125, "learning_rate": 4.4204322200392925e-05, "loss": 1.2453, "step": 225 }, { "epoch": 0.04520440251572327, "grad_norm": 0.345703125, "learning_rate": 4.518664047151277e-05, "loss": 1.2662, "step": 230 }, { "epoch": 0.04618710691823899, "grad_norm": 0.275390625, "learning_rate": 4.6168958742632616e-05, "loss": 1.2483, "step": 235 }, { "epoch": 0.04716981132075472, "grad_norm": 0.3515625, "learning_rate": 4.715127701375246e-05, "loss": 1.2561, "step": 240 }, { "epoch": 0.04815251572327044, "grad_norm": 0.37890625, "learning_rate": 4.81335952848723e-05, "loss": 1.2218, "step": 245 }, { "epoch": 0.049135220125786166, "grad_norm": 0.412109375, "learning_rate": 4.9115913555992146e-05, "loss": 1.244, "step": 250 }, { "epoch": 0.05011792452830189, "grad_norm": 0.47265625, "learning_rate": 5.0098231827111985e-05, "loss": 1.2447, "step": 255 }, { "epoch": 0.05110062893081761, "grad_norm": 0.396484375, "learning_rate": 5.108055009823183e-05, "loss": 1.2392, "step": 260 }, { "epoch": 0.052083333333333336, "grad_norm": 0.306640625, "learning_rate": 5.206286836935167e-05, "loss": 1.2235, "step": 265 }, { "epoch": 0.05306603773584906, "grad_norm": 0.4375, "learning_rate": 5.304518664047151e-05, "loss": 1.2133, "step": 270 }, { "epoch": 0.05404874213836478, "grad_norm": 0.33203125, "learning_rate": 5.4027504911591354e-05, "loss": 1.2414, "step": 275 }, { "epoch": 0.055031446540880505, "grad_norm": 0.400390625, "learning_rate": 5.5009823182711206e-05, "loss": 1.203, "step": 280 }, { "epoch": 0.05601415094339623, "grad_norm": 0.62890625, "learning_rate": 5.5992141453831045e-05, "loss": 1.2206, "step": 285 }, { "epoch": 0.05699685534591195, "grad_norm": 0.73046875, "learning_rate": 5.697445972495089e-05, "loss": 1.2233, "step": 290 }, { "epoch": 0.057979559748427674, "grad_norm": 0.32421875, "learning_rate": 5.795677799607073e-05, "loss": 1.1989, "step": 295 }, { "epoch": 0.0589622641509434, "grad_norm": 0.5859375, "learning_rate": 5.8939096267190575e-05, "loss": 1.1879, "step": 300 }, { "epoch": 0.05994496855345912, "grad_norm": 0.390625, "learning_rate": 5.9921414538310414e-05, "loss": 1.1961, "step": 305 }, { "epoch": 0.06092767295597484, "grad_norm": 0.53125, "learning_rate": 6.090373280943026e-05, "loss": 1.2021, "step": 310 }, { "epoch": 0.061910377358490566, "grad_norm": 0.3203125, "learning_rate": 6.18860510805501e-05, "loss": 1.2153, "step": 315 }, { "epoch": 0.06289308176100629, "grad_norm": 0.353515625, "learning_rate": 6.286836935166994e-05, "loss": 1.1961, "step": 320 }, { "epoch": 0.06387578616352202, "grad_norm": 0.67578125, "learning_rate": 6.385068762278978e-05, "loss": 1.1765, "step": 325 }, { "epoch": 0.06485849056603774, "grad_norm": 0.3359375, "learning_rate": 6.483300589390962e-05, "loss": 1.1868, "step": 330 }, { "epoch": 0.06584119496855347, "grad_norm": 0.5234375, "learning_rate": 6.581532416502947e-05, "loss": 1.1748, "step": 335 }, { "epoch": 0.06682389937106918, "grad_norm": 1.1328125, "learning_rate": 6.679764243614931e-05, "loss": 1.1659, "step": 340 }, { "epoch": 0.06780660377358491, "grad_norm": 0.71875, "learning_rate": 6.777996070726917e-05, "loss": 1.1648, "step": 345 }, { "epoch": 0.06878930817610063, "grad_norm": 0.353515625, "learning_rate": 6.8762278978389e-05, "loss": 1.1573, "step": 350 }, { "epoch": 0.06977201257861636, "grad_norm": 0.353515625, "learning_rate": 6.974459724950884e-05, "loss": 1.1894, "step": 355 }, { "epoch": 0.07075471698113207, "grad_norm": 0.359375, "learning_rate": 7.07269155206287e-05, "loss": 1.1512, "step": 360 }, { "epoch": 0.0717374213836478, "grad_norm": 0.349609375, "learning_rate": 7.170923379174853e-05, "loss": 1.1746, "step": 365 }, { "epoch": 0.07272012578616352, "grad_norm": 0.37109375, "learning_rate": 7.269155206286837e-05, "loss": 1.1589, "step": 370 }, { "epoch": 0.07370283018867925, "grad_norm": 0.984375, "learning_rate": 7.367387033398821e-05, "loss": 1.1567, "step": 375 }, { "epoch": 0.07468553459119497, "grad_norm": 0.4609375, "learning_rate": 7.465618860510805e-05, "loss": 1.1611, "step": 380 }, { "epoch": 0.0756682389937107, "grad_norm": 0.337890625, "learning_rate": 7.56385068762279e-05, "loss": 1.1563, "step": 385 }, { "epoch": 0.07665094339622641, "grad_norm": 0.47265625, "learning_rate": 7.662082514734774e-05, "loss": 1.1603, "step": 390 }, { "epoch": 0.07763364779874214, "grad_norm": 0.5859375, "learning_rate": 7.760314341846758e-05, "loss": 1.1534, "step": 395 }, { "epoch": 0.07861635220125786, "grad_norm": 0.349609375, "learning_rate": 7.858546168958742e-05, "loss": 1.1575, "step": 400 }, { "epoch": 0.07959905660377359, "grad_norm": 0.75390625, "learning_rate": 7.956777996070727e-05, "loss": 1.154, "step": 405 }, { "epoch": 0.0805817610062893, "grad_norm": 0.375, "learning_rate": 8.055009823182712e-05, "loss": 1.1613, "step": 410 }, { "epoch": 0.08156446540880503, "grad_norm": 0.62890625, "learning_rate": 8.153241650294696e-05, "loss": 1.1335, "step": 415 }, { "epoch": 0.08254716981132075, "grad_norm": 0.4375, "learning_rate": 8.25147347740668e-05, "loss": 1.1369, "step": 420 }, { "epoch": 0.08352987421383648, "grad_norm": 0.71484375, "learning_rate": 8.349705304518664e-05, "loss": 1.1388, "step": 425 }, { "epoch": 0.0845125786163522, "grad_norm": 0.380859375, "learning_rate": 8.44793713163065e-05, "loss": 1.1228, "step": 430 }, { "epoch": 0.08549528301886793, "grad_norm": 0.58203125, "learning_rate": 8.546168958742633e-05, "loss": 1.1552, "step": 435 }, { "epoch": 0.08647798742138364, "grad_norm": 0.388671875, "learning_rate": 8.644400785854617e-05, "loss": 1.1317, "step": 440 }, { "epoch": 0.08746069182389937, "grad_norm": 0.392578125, "learning_rate": 8.742632612966601e-05, "loss": 1.1379, "step": 445 }, { "epoch": 0.08844339622641509, "grad_norm": 0.8203125, "learning_rate": 8.840864440078585e-05, "loss": 1.1342, "step": 450 }, { "epoch": 0.08942610062893082, "grad_norm": 0.66796875, "learning_rate": 8.93909626719057e-05, "loss": 1.1154, "step": 455 }, { "epoch": 0.09040880503144653, "grad_norm": 0.52734375, "learning_rate": 9.037328094302554e-05, "loss": 1.126, "step": 460 }, { "epoch": 0.09139150943396226, "grad_norm": 0.76953125, "learning_rate": 9.135559921414538e-05, "loss": 1.1165, "step": 465 }, { "epoch": 0.09237421383647798, "grad_norm": 0.44921875, "learning_rate": 9.233791748526523e-05, "loss": 1.1268, "step": 470 }, { "epoch": 0.09335691823899371, "grad_norm": 0.447265625, "learning_rate": 9.332023575638507e-05, "loss": 1.113, "step": 475 }, { "epoch": 0.09433962264150944, "grad_norm": 0.423828125, "learning_rate": 9.430255402750492e-05, "loss": 1.0768, "step": 480 }, { "epoch": 0.09532232704402516, "grad_norm": 0.71875, "learning_rate": 9.528487229862476e-05, "loss": 1.109, "step": 485 }, { "epoch": 0.09630503144654089, "grad_norm": 0.5390625, "learning_rate": 9.62671905697446e-05, "loss": 1.1147, "step": 490 }, { "epoch": 0.0972877358490566, "grad_norm": 0.376953125, "learning_rate": 9.724950884086444e-05, "loss": 1.1194, "step": 495 }, { "epoch": 0.09827044025157233, "grad_norm": 0.353515625, "learning_rate": 9.823182711198429e-05, "loss": 1.1196, "step": 500 }, { "epoch": 0.09925314465408805, "grad_norm": 0.384765625, "learning_rate": 9.921414538310413e-05, "loss": 1.0912, "step": 505 }, { "epoch": 0.10023584905660378, "grad_norm": 0.369140625, "learning_rate": 0.00010019646365422397, "loss": 1.0998, "step": 510 }, { "epoch": 0.1012185534591195, "grad_norm": 0.87890625, "learning_rate": 0.00010117878192534382, "loss": 1.1117, "step": 515 }, { "epoch": 0.10220125786163523, "grad_norm": 0.46484375, "learning_rate": 0.00010216110019646366, "loss": 1.0974, "step": 520 }, { "epoch": 0.10318396226415094, "grad_norm": 0.396484375, "learning_rate": 0.00010314341846758351, "loss": 1.1172, "step": 525 }, { "epoch": 0.10416666666666667, "grad_norm": 0.345703125, "learning_rate": 0.00010412573673870334, "loss": 1.1188, "step": 530 }, { "epoch": 0.10514937106918239, "grad_norm": 0.36328125, "learning_rate": 0.00010510805500982319, "loss": 1.102, "step": 535 }, { "epoch": 0.10613207547169812, "grad_norm": 0.35546875, "learning_rate": 0.00010609037328094302, "loss": 1.0926, "step": 540 }, { "epoch": 0.10711477987421383, "grad_norm": 0.66796875, "learning_rate": 0.00010707269155206288, "loss": 1.0811, "step": 545 }, { "epoch": 0.10809748427672956, "grad_norm": 0.75, "learning_rate": 0.00010805500982318271, "loss": 1.1066, "step": 550 }, { "epoch": 0.10908018867924528, "grad_norm": 0.447265625, "learning_rate": 0.00010903732809430256, "loss": 1.0843, "step": 555 }, { "epoch": 0.11006289308176101, "grad_norm": 0.70703125, "learning_rate": 0.00011001964636542241, "loss": 1.0953, "step": 560 }, { "epoch": 0.11104559748427673, "grad_norm": 0.69140625, "learning_rate": 0.00011100196463654224, "loss": 1.0918, "step": 565 }, { "epoch": 0.11202830188679246, "grad_norm": 0.73046875, "learning_rate": 0.00011198428290766209, "loss": 1.0728, "step": 570 }, { "epoch": 0.11301100628930817, "grad_norm": 0.33984375, "learning_rate": 0.00011296660117878193, "loss": 1.0745, "step": 575 }, { "epoch": 0.1139937106918239, "grad_norm": 0.431640625, "learning_rate": 0.00011394891944990178, "loss": 1.0784, "step": 580 }, { "epoch": 0.11497641509433962, "grad_norm": 0.392578125, "learning_rate": 0.00011493123772102161, "loss": 1.0865, "step": 585 }, { "epoch": 0.11595911949685535, "grad_norm": 0.330078125, "learning_rate": 0.00011591355599214146, "loss": 1.0642, "step": 590 }, { "epoch": 0.11694182389937106, "grad_norm": 0.3828125, "learning_rate": 0.0001168958742632613, "loss": 1.0752, "step": 595 }, { "epoch": 0.1179245283018868, "grad_norm": 0.359375, "learning_rate": 0.00011787819253438115, "loss": 1.0718, "step": 600 }, { "epoch": 0.11890723270440251, "grad_norm": 0.41015625, "learning_rate": 0.00011886051080550098, "loss": 1.082, "step": 605 }, { "epoch": 0.11988993710691824, "grad_norm": 0.37890625, "learning_rate": 0.00011984282907662083, "loss": 1.0514, "step": 610 }, { "epoch": 0.12087264150943396, "grad_norm": 0.8125, "learning_rate": 0.00012082514734774067, "loss": 1.0718, "step": 615 }, { "epoch": 0.12185534591194969, "grad_norm": 0.60546875, "learning_rate": 0.00012180746561886052, "loss": 1.0721, "step": 620 }, { "epoch": 0.1228380503144654, "grad_norm": 0.33984375, "learning_rate": 0.00012278978388998037, "loss": 1.087, "step": 625 }, { "epoch": 0.12382075471698113, "grad_norm": 0.333984375, "learning_rate": 0.0001237721021611002, "loss": 1.0613, "step": 630 }, { "epoch": 0.12480345911949685, "grad_norm": 0.3984375, "learning_rate": 0.00012475442043222005, "loss": 1.0848, "step": 635 }, { "epoch": 0.12578616352201258, "grad_norm": 0.7890625, "learning_rate": 0.0001257367387033399, "loss": 1.076, "step": 640 }, { "epoch": 0.1267688679245283, "grad_norm": 0.376953125, "learning_rate": 0.00012671905697445973, "loss": 1.0684, "step": 645 }, { "epoch": 0.12775157232704404, "grad_norm": 0.35546875, "learning_rate": 0.00012770137524557957, "loss": 1.0788, "step": 650 }, { "epoch": 0.12873427672955975, "grad_norm": 0.40234375, "learning_rate": 0.00012868369351669943, "loss": 1.0672, "step": 655 }, { "epoch": 0.12971698113207547, "grad_norm": 0.5234375, "learning_rate": 0.00012966601178781924, "loss": 1.0795, "step": 660 }, { "epoch": 0.1306996855345912, "grad_norm": 0.326171875, "learning_rate": 0.0001306483300589391, "loss": 1.0481, "step": 665 }, { "epoch": 0.13168238993710693, "grad_norm": 0.56640625, "learning_rate": 0.00013163064833005895, "loss": 1.0537, "step": 670 }, { "epoch": 0.13266509433962265, "grad_norm": 0.33203125, "learning_rate": 0.0001326129666011788, "loss": 1.0627, "step": 675 }, { "epoch": 0.13364779874213836, "grad_norm": 0.322265625, "learning_rate": 0.00013359528487229863, "loss": 1.0436, "step": 680 }, { "epoch": 0.13463050314465408, "grad_norm": 0.78125, "learning_rate": 0.00013457760314341847, "loss": 1.048, "step": 685 }, { "epoch": 0.13561320754716982, "grad_norm": 0.5390625, "learning_rate": 0.00013555992141453833, "loss": 1.0611, "step": 690 }, { "epoch": 0.13659591194968554, "grad_norm": 0.5625, "learning_rate": 0.00013654223968565817, "loss": 1.0509, "step": 695 }, { "epoch": 0.13757861635220126, "grad_norm": 0.39453125, "learning_rate": 0.000137524557956778, "loss": 1.054, "step": 700 }, { "epoch": 0.13856132075471697, "grad_norm": 0.369140625, "learning_rate": 0.00013850687622789785, "loss": 1.0321, "step": 705 }, { "epoch": 0.13954402515723272, "grad_norm": 0.5859375, "learning_rate": 0.0001394891944990177, "loss": 1.0389, "step": 710 }, { "epoch": 0.14052672955974843, "grad_norm": 0.390625, "learning_rate": 0.00014047151277013753, "loss": 1.0537, "step": 715 }, { "epoch": 0.14150943396226415, "grad_norm": 0.66015625, "learning_rate": 0.0001414538310412574, "loss": 1.0514, "step": 720 }, { "epoch": 0.14249213836477986, "grad_norm": 0.32421875, "learning_rate": 0.0001424361493123772, "loss": 1.0085, "step": 725 }, { "epoch": 0.1434748427672956, "grad_norm": 0.478515625, "learning_rate": 0.00014341846758349707, "loss": 1.0316, "step": 730 }, { "epoch": 0.14445754716981132, "grad_norm": 0.54296875, "learning_rate": 0.00014440078585461688, "loss": 1.0503, "step": 735 }, { "epoch": 0.14544025157232704, "grad_norm": 0.310546875, "learning_rate": 0.00014538310412573675, "loss": 1.0632, "step": 740 }, { "epoch": 0.14642295597484276, "grad_norm": 0.427734375, "learning_rate": 0.0001463654223968566, "loss": 1.0317, "step": 745 }, { "epoch": 0.1474056603773585, "grad_norm": 0.365234375, "learning_rate": 0.00014734774066797642, "loss": 1.0498, "step": 750 }, { "epoch": 0.14838836477987422, "grad_norm": 0.37890625, "learning_rate": 0.0001483300589390963, "loss": 1.0312, "step": 755 }, { "epoch": 0.14937106918238993, "grad_norm": 0.484375, "learning_rate": 0.0001493123772102161, "loss": 1.0421, "step": 760 }, { "epoch": 0.15035377358490565, "grad_norm": 0.306640625, "learning_rate": 0.00015029469548133597, "loss": 1.0416, "step": 765 }, { "epoch": 0.1513364779874214, "grad_norm": 0.314453125, "learning_rate": 0.0001512770137524558, "loss": 1.0296, "step": 770 }, { "epoch": 0.1523191823899371, "grad_norm": 0.875, "learning_rate": 0.00015225933202357565, "loss": 1.0351, "step": 775 }, { "epoch": 0.15330188679245282, "grad_norm": 0.51171875, "learning_rate": 0.00015324165029469548, "loss": 1.0265, "step": 780 }, { "epoch": 0.15428459119496854, "grad_norm": 0.3046875, "learning_rate": 0.00015422396856581532, "loss": 1.0243, "step": 785 }, { "epoch": 0.15526729559748428, "grad_norm": 0.427734375, "learning_rate": 0.00015520628683693516, "loss": 1.0341, "step": 790 }, { "epoch": 0.15625, "grad_norm": 0.38671875, "learning_rate": 0.00015618860510805503, "loss": 1.0214, "step": 795 }, { "epoch": 0.15723270440251572, "grad_norm": 0.470703125, "learning_rate": 0.00015717092337917484, "loss": 1.0199, "step": 800 }, { "epoch": 0.15821540880503146, "grad_norm": 0.322265625, "learning_rate": 0.0001581532416502947, "loss": 1.0078, "step": 805 }, { "epoch": 0.15919811320754718, "grad_norm": 0.298828125, "learning_rate": 0.00015913555992141455, "loss": 1.0247, "step": 810 }, { "epoch": 0.1601808176100629, "grad_norm": 0.337890625, "learning_rate": 0.00016011787819253438, "loss": 1.0397, "step": 815 }, { "epoch": 0.1611635220125786, "grad_norm": 0.390625, "learning_rate": 0.00016110019646365425, "loss": 1.0195, "step": 820 }, { "epoch": 0.16214622641509435, "grad_norm": 0.4140625, "learning_rate": 0.00016208251473477406, "loss": 1.0129, "step": 825 }, { "epoch": 0.16312893081761007, "grad_norm": 0.341796875, "learning_rate": 0.00016306483300589393, "loss": 1.0171, "step": 830 }, { "epoch": 0.16411163522012578, "grad_norm": 0.333984375, "learning_rate": 0.00016404715127701377, "loss": 1.0045, "step": 835 }, { "epoch": 0.1650943396226415, "grad_norm": 0.361328125, "learning_rate": 0.0001650294695481336, "loss": 0.9911, "step": 840 }, { "epoch": 0.16607704402515724, "grad_norm": 0.478515625, "learning_rate": 0.00016601178781925344, "loss": 0.9963, "step": 845 }, { "epoch": 0.16705974842767296, "grad_norm": 0.625, "learning_rate": 0.00016699410609037328, "loss": 1.0181, "step": 850 }, { "epoch": 0.16804245283018868, "grad_norm": 0.314453125, "learning_rate": 0.00016797642436149312, "loss": 1.0027, "step": 855 }, { "epoch": 0.1690251572327044, "grad_norm": 0.51171875, "learning_rate": 0.000168958742632613, "loss": 1.0185, "step": 860 }, { "epoch": 0.17000786163522014, "grad_norm": 0.431640625, "learning_rate": 0.0001699410609037328, "loss": 1.0021, "step": 865 }, { "epoch": 0.17099056603773585, "grad_norm": 0.353515625, "learning_rate": 0.00017092337917485267, "loss": 1.0071, "step": 870 }, { "epoch": 0.17197327044025157, "grad_norm": 0.453125, "learning_rate": 0.0001719056974459725, "loss": 1.0065, "step": 875 }, { "epoch": 0.17295597484276728, "grad_norm": 0.5078125, "learning_rate": 0.00017288801571709234, "loss": 1.0237, "step": 880 }, { "epoch": 0.17393867924528303, "grad_norm": 0.375, "learning_rate": 0.0001738703339882122, "loss": 1.0162, "step": 885 }, { "epoch": 0.17492138364779874, "grad_norm": 0.3671875, "learning_rate": 0.00017485265225933202, "loss": 0.9945, "step": 890 }, { "epoch": 0.17590408805031446, "grad_norm": 0.404296875, "learning_rate": 0.0001758349705304519, "loss": 1.016, "step": 895 }, { "epoch": 0.17688679245283018, "grad_norm": 0.283203125, "learning_rate": 0.0001768172888015717, "loss": 1.0337, "step": 900 }, { "epoch": 0.17786949685534592, "grad_norm": 0.310546875, "learning_rate": 0.00017779960707269156, "loss": 1.0119, "step": 905 }, { "epoch": 0.17885220125786164, "grad_norm": 0.4609375, "learning_rate": 0.0001787819253438114, "loss": 1.0119, "step": 910 }, { "epoch": 0.17983490566037735, "grad_norm": 0.296875, "learning_rate": 0.00017976424361493124, "loss": 0.9935, "step": 915 }, { "epoch": 0.18081761006289307, "grad_norm": 0.33203125, "learning_rate": 0.00018074656188605108, "loss": 1.0026, "step": 920 }, { "epoch": 0.1818003144654088, "grad_norm": 0.369140625, "learning_rate": 0.00018172888015717092, "loss": 0.9855, "step": 925 }, { "epoch": 0.18278301886792453, "grad_norm": 0.298828125, "learning_rate": 0.00018271119842829076, "loss": 1.0135, "step": 930 }, { "epoch": 0.18376572327044025, "grad_norm": 0.30078125, "learning_rate": 0.00018369351669941062, "loss": 1.0028, "step": 935 }, { "epoch": 0.18474842767295596, "grad_norm": 0.375, "learning_rate": 0.00018467583497053046, "loss": 1.0078, "step": 940 }, { "epoch": 0.1857311320754717, "grad_norm": 0.49609375, "learning_rate": 0.0001856581532416503, "loss": 0.9931, "step": 945 }, { "epoch": 0.18671383647798742, "grad_norm": 0.455078125, "learning_rate": 0.00018664047151277014, "loss": 0.9898, "step": 950 }, { "epoch": 0.18769654088050314, "grad_norm": 0.39453125, "learning_rate": 0.00018762278978388998, "loss": 0.9982, "step": 955 }, { "epoch": 0.18867924528301888, "grad_norm": 0.291015625, "learning_rate": 0.00018860510805500985, "loss": 0.9974, "step": 960 }, { "epoch": 0.1896619496855346, "grad_norm": 0.3515625, "learning_rate": 0.00018958742632612966, "loss": 0.9886, "step": 965 }, { "epoch": 0.1906446540880503, "grad_norm": 0.341796875, "learning_rate": 0.00019056974459724952, "loss": 0.9926, "step": 970 }, { "epoch": 0.19162735849056603, "grad_norm": 0.341796875, "learning_rate": 0.00019155206286836936, "loss": 1.0024, "step": 975 }, { "epoch": 0.19261006289308177, "grad_norm": 0.30078125, "learning_rate": 0.0001925343811394892, "loss": 0.9963, "step": 980 }, { "epoch": 0.1935927672955975, "grad_norm": 0.427734375, "learning_rate": 0.00019351669941060904, "loss": 1.0055, "step": 985 }, { "epoch": 0.1945754716981132, "grad_norm": 0.453125, "learning_rate": 0.00019449901768172888, "loss": 0.9818, "step": 990 }, { "epoch": 0.19555817610062892, "grad_norm": 0.53125, "learning_rate": 0.00019548133595284875, "loss": 1.0134, "step": 995 }, { "epoch": 0.19654088050314467, "grad_norm": 0.35546875, "learning_rate": 0.00019646365422396858, "loss": 0.9932, "step": 1000 }, { "epoch": 0.19752358490566038, "grad_norm": 0.31640625, "learning_rate": 0.00019744597249508842, "loss": 0.9802, "step": 1005 }, { "epoch": 0.1985062893081761, "grad_norm": 0.310546875, "learning_rate": 0.00019842829076620826, "loss": 0.99, "step": 1010 }, { "epoch": 0.19948899371069181, "grad_norm": 0.353515625, "learning_rate": 0.0001994106090373281, "loss": 0.999, "step": 1015 }, { "epoch": 0.20047169811320756, "grad_norm": 0.515625, "learning_rate": 0.00019999997646422815, "loss": 0.9685, "step": 1020 }, { "epoch": 0.20145440251572327, "grad_norm": 0.326171875, "learning_rate": 0.00019999971168692198, "loss": 0.9788, "step": 1025 }, { "epoch": 0.202437106918239, "grad_norm": 0.294921875, "learning_rate": 0.00019999915271337634, "loss": 0.9736, "step": 1030 }, { "epoch": 0.2034198113207547, "grad_norm": 0.29296875, "learning_rate": 0.00019999829954523573, "loss": 0.9811, "step": 1035 }, { "epoch": 0.20440251572327045, "grad_norm": 0.318359375, "learning_rate": 0.00019999715218501016, "loss": 0.9989, "step": 1040 }, { "epoch": 0.20538522012578617, "grad_norm": 0.3125, "learning_rate": 0.00019999571063607512, "loss": 0.976, "step": 1045 }, { "epoch": 0.20636792452830188, "grad_norm": 0.5078125, "learning_rate": 0.00019999397490267162, "loss": 0.9551, "step": 1050 }, { "epoch": 0.2073506289308176, "grad_norm": 0.298828125, "learning_rate": 0.00019999194498990613, "loss": 0.9894, "step": 1055 }, { "epoch": 0.20833333333333334, "grad_norm": 0.314453125, "learning_rate": 0.0001999896209037506, "loss": 0.9754, "step": 1060 }, { "epoch": 0.20931603773584906, "grad_norm": 0.4921875, "learning_rate": 0.00019998700265104238, "loss": 0.961, "step": 1065 }, { "epoch": 0.21029874213836477, "grad_norm": 0.51953125, "learning_rate": 0.00019998409023948432, "loss": 0.9761, "step": 1070 }, { "epoch": 0.2112814465408805, "grad_norm": 0.55078125, "learning_rate": 0.00019998088367764467, "loss": 0.9644, "step": 1075 }, { "epoch": 0.21226415094339623, "grad_norm": 0.458984375, "learning_rate": 0.00019997738297495703, "loss": 0.9669, "step": 1080 }, { "epoch": 0.21324685534591195, "grad_norm": 0.3515625, "learning_rate": 0.00019997358814172035, "loss": 0.9543, "step": 1085 }, { "epoch": 0.21422955974842767, "grad_norm": 0.43359375, "learning_rate": 0.00019996949918909897, "loss": 0.9814, "step": 1090 }, { "epoch": 0.21521226415094338, "grad_norm": 0.390625, "learning_rate": 0.0001999651161291224, "loss": 0.9599, "step": 1095 }, { "epoch": 0.21619496855345913, "grad_norm": 0.359375, "learning_rate": 0.00019996043897468552, "loss": 0.9461, "step": 1100 }, { "epoch": 0.21717767295597484, "grad_norm": 0.44921875, "learning_rate": 0.00019995546773954835, "loss": 0.9528, "step": 1105 }, { "epoch": 0.21816037735849056, "grad_norm": 0.361328125, "learning_rate": 0.00019995020243833615, "loss": 0.9865, "step": 1110 }, { "epoch": 0.2191430817610063, "grad_norm": 0.3203125, "learning_rate": 0.00019994464308653926, "loss": 0.9853, "step": 1115 }, { "epoch": 0.22012578616352202, "grad_norm": 0.345703125, "learning_rate": 0.00019993878970051316, "loss": 0.9863, "step": 1120 }, { "epoch": 0.22110849056603774, "grad_norm": 0.4609375, "learning_rate": 0.00019993264229747833, "loss": 0.9778, "step": 1125 }, { "epoch": 0.22209119496855345, "grad_norm": 0.30859375, "learning_rate": 0.0001999262008955202, "loss": 0.9896, "step": 1130 }, { "epoch": 0.2230738993710692, "grad_norm": 0.3515625, "learning_rate": 0.00019991946551358925, "loss": 0.978, "step": 1135 }, { "epoch": 0.2240566037735849, "grad_norm": 0.396484375, "learning_rate": 0.00019991243617150078, "loss": 0.9714, "step": 1140 }, { "epoch": 0.22503930817610063, "grad_norm": 0.357421875, "learning_rate": 0.00019990511288993485, "loss": 0.9976, "step": 1145 }, { "epoch": 0.22602201257861634, "grad_norm": 0.2734375, "learning_rate": 0.00019989749569043638, "loss": 0.9473, "step": 1150 }, { "epoch": 0.2270047169811321, "grad_norm": 0.361328125, "learning_rate": 0.00019988958459541501, "loss": 0.9745, "step": 1155 }, { "epoch": 0.2279874213836478, "grad_norm": 0.33203125, "learning_rate": 0.00019988137962814482, "loss": 0.9726, "step": 1160 }, { "epoch": 0.22897012578616352, "grad_norm": 0.330078125, "learning_rate": 0.00019987288081276468, "loss": 0.9601, "step": 1165 }, { "epoch": 0.22995283018867924, "grad_norm": 0.51953125, "learning_rate": 0.0001998640881742778, "loss": 0.9626, "step": 1170 }, { "epoch": 0.23093553459119498, "grad_norm": 0.28125, "learning_rate": 0.00019985500173855196, "loss": 0.929, "step": 1175 }, { "epoch": 0.2319182389937107, "grad_norm": 0.404296875, "learning_rate": 0.00019984562153231908, "loss": 0.9573, "step": 1180 }, { "epoch": 0.2329009433962264, "grad_norm": 0.279296875, "learning_rate": 0.00019983594758317551, "loss": 0.9549, "step": 1185 }, { "epoch": 0.23388364779874213, "grad_norm": 0.328125, "learning_rate": 0.00019982597991958172, "loss": 0.9626, "step": 1190 }, { "epoch": 0.23486635220125787, "grad_norm": 0.3046875, "learning_rate": 0.0001998157185708623, "loss": 0.9584, "step": 1195 }, { "epoch": 0.2358490566037736, "grad_norm": 0.28515625, "learning_rate": 0.00019980516356720576, "loss": 0.9428, "step": 1200 }, { "epoch": 0.2368317610062893, "grad_norm": 0.76171875, "learning_rate": 0.00019979431493966473, "loss": 0.9302, "step": 1205 }, { "epoch": 0.23781446540880502, "grad_norm": 0.283203125, "learning_rate": 0.0001997831727201555, "loss": 0.9765, "step": 1210 }, { "epoch": 0.23879716981132076, "grad_norm": 0.30859375, "learning_rate": 0.00019977173694145812, "loss": 0.9454, "step": 1215 }, { "epoch": 0.23977987421383648, "grad_norm": 0.4765625, "learning_rate": 0.00019976000763721635, "loss": 0.9604, "step": 1220 }, { "epoch": 0.2407625786163522, "grad_norm": 0.484375, "learning_rate": 0.0001997479848419375, "loss": 0.9584, "step": 1225 }, { "epoch": 0.2417452830188679, "grad_norm": 0.369140625, "learning_rate": 0.00019973566859099226, "loss": 0.9473, "step": 1230 }, { "epoch": 0.24272798742138366, "grad_norm": 0.294921875, "learning_rate": 0.00019972305892061466, "loss": 0.9486, "step": 1235 }, { "epoch": 0.24371069182389937, "grad_norm": 0.376953125, "learning_rate": 0.00019971015586790197, "loss": 0.9477, "step": 1240 }, { "epoch": 0.2446933962264151, "grad_norm": 0.27734375, "learning_rate": 0.00019969695947081464, "loss": 0.9394, "step": 1245 }, { "epoch": 0.2456761006289308, "grad_norm": 0.43359375, "learning_rate": 0.00019968346976817608, "loss": 0.9466, "step": 1250 }, { "epoch": 0.24665880503144655, "grad_norm": 0.484375, "learning_rate": 0.00019966968679967256, "loss": 0.9538, "step": 1255 }, { "epoch": 0.24764150943396226, "grad_norm": 0.37109375, "learning_rate": 0.0001996556106058532, "loss": 0.9581, "step": 1260 }, { "epoch": 0.24862421383647798, "grad_norm": 0.3671875, "learning_rate": 0.00019964124122812975, "loss": 0.9568, "step": 1265 }, { "epoch": 0.2496069182389937, "grad_norm": 0.34375, "learning_rate": 0.0001996265787087765, "loss": 0.9465, "step": 1270 }, { "epoch": 0.2505896226415094, "grad_norm": 0.5078125, "learning_rate": 0.00019961162309093018, "loss": 0.9562, "step": 1275 }, { "epoch": 0.25157232704402516, "grad_norm": 0.33984375, "learning_rate": 0.00019959637441858977, "loss": 0.9467, "step": 1280 }, { "epoch": 0.2525550314465409, "grad_norm": 0.373046875, "learning_rate": 0.00019958083273661638, "loss": 0.9524, "step": 1285 }, { "epoch": 0.2535377358490566, "grad_norm": 0.4453125, "learning_rate": 0.00019956499809073322, "loss": 0.9494, "step": 1290 }, { "epoch": 0.25452044025157233, "grad_norm": 0.3203125, "learning_rate": 0.00019954887052752536, "loss": 0.9512, "step": 1295 }, { "epoch": 0.2555031446540881, "grad_norm": 0.28125, "learning_rate": 0.0001995324500944396, "loss": 0.9496, "step": 1300 }, { "epoch": 0.25648584905660377, "grad_norm": 0.279296875, "learning_rate": 0.0001995157368397844, "loss": 0.9479, "step": 1305 }, { "epoch": 0.2574685534591195, "grad_norm": 0.291015625, "learning_rate": 0.00019949873081272966, "loss": 0.955, "step": 1310 }, { "epoch": 0.2584512578616352, "grad_norm": 0.27734375, "learning_rate": 0.0001994814320633066, "loss": 0.9335, "step": 1315 }, { "epoch": 0.25943396226415094, "grad_norm": 0.4375, "learning_rate": 0.00019946384064240767, "loss": 0.9458, "step": 1320 }, { "epoch": 0.2604166666666667, "grad_norm": 0.55078125, "learning_rate": 0.00019944595660178628, "loss": 0.9421, "step": 1325 }, { "epoch": 0.2613993710691824, "grad_norm": 0.330078125, "learning_rate": 0.0001994277799940568, "loss": 0.9347, "step": 1330 }, { "epoch": 0.2623820754716981, "grad_norm": 0.39453125, "learning_rate": 0.00019940931087269423, "loss": 0.933, "step": 1335 }, { "epoch": 0.26336477987421386, "grad_norm": 0.357421875, "learning_rate": 0.00019939054929203422, "loss": 0.9491, "step": 1340 }, { "epoch": 0.26434748427672955, "grad_norm": 0.310546875, "learning_rate": 0.00019937149530727282, "loss": 0.9458, "step": 1345 }, { "epoch": 0.2653301886792453, "grad_norm": 0.34375, "learning_rate": 0.00019935214897446622, "loss": 0.9469, "step": 1350 }, { "epoch": 0.266312893081761, "grad_norm": 0.287109375, "learning_rate": 0.00019933251035053083, "loss": 0.9427, "step": 1355 }, { "epoch": 0.2672955974842767, "grad_norm": 0.2734375, "learning_rate": 0.00019931257949324288, "loss": 0.9477, "step": 1360 }, { "epoch": 0.26827830188679247, "grad_norm": 0.291015625, "learning_rate": 0.00019929235646123843, "loss": 0.9441, "step": 1365 }, { "epoch": 0.26926100628930816, "grad_norm": 0.291015625, "learning_rate": 0.00019927184131401297, "loss": 0.9471, "step": 1370 }, { "epoch": 0.2702437106918239, "grad_norm": 0.291015625, "learning_rate": 0.0001992510341119215, "loss": 0.9538, "step": 1375 }, { "epoch": 0.27122641509433965, "grad_norm": 0.453125, "learning_rate": 0.0001992299349161782, "loss": 0.9398, "step": 1380 }, { "epoch": 0.27220911949685533, "grad_norm": 0.43359375, "learning_rate": 0.00019920854378885632, "loss": 0.9313, "step": 1385 }, { "epoch": 0.2731918238993711, "grad_norm": 0.296875, "learning_rate": 0.00019918686079288788, "loss": 0.9312, "step": 1390 }, { "epoch": 0.27417452830188677, "grad_norm": 0.322265625, "learning_rate": 0.00019916488599206367, "loss": 0.9434, "step": 1395 }, { "epoch": 0.2751572327044025, "grad_norm": 0.31640625, "learning_rate": 0.0001991426194510329, "loss": 0.9418, "step": 1400 }, { "epoch": 0.27613993710691825, "grad_norm": 0.29296875, "learning_rate": 0.00019912006123530305, "loss": 0.9366, "step": 1405 }, { "epoch": 0.27712264150943394, "grad_norm": 0.390625, "learning_rate": 0.00019909721141123975, "loss": 0.9275, "step": 1410 }, { "epoch": 0.2781053459119497, "grad_norm": 0.29296875, "learning_rate": 0.00019907407004606656, "loss": 0.9284, "step": 1415 }, { "epoch": 0.27908805031446543, "grad_norm": 0.361328125, "learning_rate": 0.0001990506372078647, "loss": 0.9398, "step": 1420 }, { "epoch": 0.2800707547169811, "grad_norm": 0.31640625, "learning_rate": 0.00019902691296557284, "loss": 0.9489, "step": 1425 }, { "epoch": 0.28105345911949686, "grad_norm": 0.388671875, "learning_rate": 0.00019900289738898703, "loss": 0.9422, "step": 1430 }, { "epoch": 0.2820361635220126, "grad_norm": 0.404296875, "learning_rate": 0.0001989785905487604, "loss": 0.9273, "step": 1435 }, { "epoch": 0.2830188679245283, "grad_norm": 0.50390625, "learning_rate": 0.000198953992516403, "loss": 0.9282, "step": 1440 }, { "epoch": 0.28400157232704404, "grad_norm": 0.318359375, "learning_rate": 0.0001989291033642815, "loss": 0.9269, "step": 1445 }, { "epoch": 0.2849842767295597, "grad_norm": 0.447265625, "learning_rate": 0.00019890392316561904, "loss": 0.9296, "step": 1450 }, { "epoch": 0.28596698113207547, "grad_norm": 0.3046875, "learning_rate": 0.00019887845199449504, "loss": 0.9268, "step": 1455 }, { "epoch": 0.2869496855345912, "grad_norm": 0.419921875, "learning_rate": 0.00019885268992584496, "loss": 0.9222, "step": 1460 }, { "epoch": 0.2879323899371069, "grad_norm": 0.318359375, "learning_rate": 0.00019882663703546004, "loss": 0.9517, "step": 1465 }, { "epoch": 0.28891509433962265, "grad_norm": 0.287109375, "learning_rate": 0.00019880029339998715, "loss": 0.9219, "step": 1470 }, { "epoch": 0.2898977987421384, "grad_norm": 0.28125, "learning_rate": 0.0001987736590969285, "loss": 0.898, "step": 1475 }, { "epoch": 0.2908805031446541, "grad_norm": 0.3125, "learning_rate": 0.0001987467342046414, "loss": 0.922, "step": 1480 }, { "epoch": 0.2918632075471698, "grad_norm": 0.29296875, "learning_rate": 0.0001987195188023381, "loss": 0.9049, "step": 1485 }, { "epoch": 0.2928459119496855, "grad_norm": 0.33984375, "learning_rate": 0.00019869201297008552, "loss": 0.9283, "step": 1490 }, { "epoch": 0.29382861635220126, "grad_norm": 0.37109375, "learning_rate": 0.00019866421678880507, "loss": 0.9222, "step": 1495 }, { "epoch": 0.294811320754717, "grad_norm": 0.353515625, "learning_rate": 0.00019863613034027224, "loss": 0.9279, "step": 1500 }, { "epoch": 0.2957940251572327, "grad_norm": 0.6484375, "learning_rate": 0.0001986077537071166, "loss": 0.9152, "step": 1505 }, { "epoch": 0.29677672955974843, "grad_norm": 0.396484375, "learning_rate": 0.00019857908697282133, "loss": 0.9122, "step": 1510 }, { "epoch": 0.2977594339622642, "grad_norm": 0.44140625, "learning_rate": 0.00019855013022172316, "loss": 0.9351, "step": 1515 }, { "epoch": 0.29874213836477986, "grad_norm": 0.357421875, "learning_rate": 0.000198520883539012, "loss": 0.9285, "step": 1520 }, { "epoch": 0.2997248427672956, "grad_norm": 0.333984375, "learning_rate": 0.00019849134701073072, "loss": 0.9257, "step": 1525 }, { "epoch": 0.3007075471698113, "grad_norm": 0.291015625, "learning_rate": 0.00019846152072377495, "loss": 0.9141, "step": 1530 }, { "epoch": 0.30169025157232704, "grad_norm": 0.462890625, "learning_rate": 0.00019843140476589276, "loss": 0.9108, "step": 1535 }, { "epoch": 0.3026729559748428, "grad_norm": 0.33203125, "learning_rate": 0.00019840099922568437, "loss": 0.9039, "step": 1540 }, { "epoch": 0.30365566037735847, "grad_norm": 0.421875, "learning_rate": 0.00019837030419260208, "loss": 0.9158, "step": 1545 }, { "epoch": 0.3046383647798742, "grad_norm": 0.302734375, "learning_rate": 0.0001983393197569497, "loss": 0.9373, "step": 1550 }, { "epoch": 0.30562106918238996, "grad_norm": 0.380859375, "learning_rate": 0.0001983080460098826, "loss": 0.9133, "step": 1555 }, { "epoch": 0.30660377358490565, "grad_norm": 0.30078125, "learning_rate": 0.0001982764830434072, "loss": 0.9152, "step": 1560 }, { "epoch": 0.3075864779874214, "grad_norm": 0.37890625, "learning_rate": 0.00019824463095038082, "loss": 0.9218, "step": 1565 }, { "epoch": 0.3085691823899371, "grad_norm": 0.41015625, "learning_rate": 0.00019821248982451143, "loss": 0.9041, "step": 1570 }, { "epoch": 0.3095518867924528, "grad_norm": 0.408203125, "learning_rate": 0.00019818005976035723, "loss": 0.9134, "step": 1575 }, { "epoch": 0.31053459119496857, "grad_norm": 0.29296875, "learning_rate": 0.00019814734085332657, "loss": 0.9043, "step": 1580 }, { "epoch": 0.31151729559748426, "grad_norm": 0.28125, "learning_rate": 0.00019811433319967753, "loss": 0.9292, "step": 1585 }, { "epoch": 0.3125, "grad_norm": 0.287109375, "learning_rate": 0.00019808103689651762, "loss": 0.9138, "step": 1590 }, { "epoch": 0.31348270440251574, "grad_norm": 0.28515625, "learning_rate": 0.00019804745204180364, "loss": 0.9029, "step": 1595 }, { "epoch": 0.31446540880503143, "grad_norm": 0.294921875, "learning_rate": 0.00019801357873434121, "loss": 0.9003, "step": 1600 }, { "epoch": 0.3154481132075472, "grad_norm": 0.30078125, "learning_rate": 0.00019797941707378462, "loss": 0.9269, "step": 1605 }, { "epoch": 0.3164308176100629, "grad_norm": 0.6796875, "learning_rate": 0.00019794496716063652, "loss": 0.932, "step": 1610 }, { "epoch": 0.3174135220125786, "grad_norm": 0.3671875, "learning_rate": 0.00019791022909624751, "loss": 0.9096, "step": 1615 }, { "epoch": 0.31839622641509435, "grad_norm": 0.294921875, "learning_rate": 0.00019787520298281602, "loss": 0.8985, "step": 1620 }, { "epoch": 0.31937893081761004, "grad_norm": 0.291015625, "learning_rate": 0.0001978398889233878, "loss": 0.9244, "step": 1625 }, { "epoch": 0.3203616352201258, "grad_norm": 0.2890625, "learning_rate": 0.0001978042870218558, "loss": 0.9118, "step": 1630 }, { "epoch": 0.32134433962264153, "grad_norm": 0.51171875, "learning_rate": 0.00019776839738295978, "loss": 0.8763, "step": 1635 }, { "epoch": 0.3223270440251572, "grad_norm": 0.392578125, "learning_rate": 0.00019773222011228598, "loss": 0.887, "step": 1640 }, { "epoch": 0.32330974842767296, "grad_norm": 0.5234375, "learning_rate": 0.00019769575531626695, "loss": 0.906, "step": 1645 }, { "epoch": 0.3242924528301887, "grad_norm": 0.296875, "learning_rate": 0.000197659003102181, "loss": 0.9168, "step": 1650 }, { "epoch": 0.3252751572327044, "grad_norm": 0.3125, "learning_rate": 0.00019762196357815207, "loss": 0.8751, "step": 1655 }, { "epoch": 0.32625786163522014, "grad_norm": 0.369140625, "learning_rate": 0.0001975846368531494, "loss": 0.9358, "step": 1660 }, { "epoch": 0.3272405660377358, "grad_norm": 0.265625, "learning_rate": 0.00019754702303698712, "loss": 0.9009, "step": 1665 }, { "epoch": 0.32822327044025157, "grad_norm": 0.435546875, "learning_rate": 0.00019750912224032397, "loss": 0.9076, "step": 1670 }, { "epoch": 0.3292059748427673, "grad_norm": 0.283203125, "learning_rate": 0.00019747093457466296, "loss": 0.9076, "step": 1675 }, { "epoch": 0.330188679245283, "grad_norm": 0.5078125, "learning_rate": 0.00019743246015235116, "loss": 0.8957, "step": 1680 }, { "epoch": 0.33117138364779874, "grad_norm": 0.29296875, "learning_rate": 0.00019739369908657915, "loss": 0.9259, "step": 1685 }, { "epoch": 0.3321540880503145, "grad_norm": 0.349609375, "learning_rate": 0.00019735465149138084, "loss": 0.907, "step": 1690 }, { "epoch": 0.3331367924528302, "grad_norm": 0.275390625, "learning_rate": 0.00019731531748163318, "loss": 0.8991, "step": 1695 }, { "epoch": 0.3341194968553459, "grad_norm": 0.283203125, "learning_rate": 0.0001972756971730556, "loss": 0.9232, "step": 1700 }, { "epoch": 0.3351022012578616, "grad_norm": 0.35546875, "learning_rate": 0.00019723579068220998, "loss": 0.9083, "step": 1705 }, { "epoch": 0.33608490566037735, "grad_norm": 0.287109375, "learning_rate": 0.0001971955981265, "loss": 0.9031, "step": 1710 }, { "epoch": 0.3370676100628931, "grad_norm": 0.291015625, "learning_rate": 0.000197155119624171, "loss": 0.8836, "step": 1715 }, { "epoch": 0.3380503144654088, "grad_norm": 0.28515625, "learning_rate": 0.00019711435529430954, "loss": 0.8933, "step": 1720 }, { "epoch": 0.33903301886792453, "grad_norm": 0.35546875, "learning_rate": 0.0001970733052568431, "loss": 0.9048, "step": 1725 }, { "epoch": 0.3400157232704403, "grad_norm": 0.29296875, "learning_rate": 0.00019703196963253972, "loss": 0.9183, "step": 1730 }, { "epoch": 0.34099842767295596, "grad_norm": 0.349609375, "learning_rate": 0.00019699034854300763, "loss": 0.8875, "step": 1735 }, { "epoch": 0.3419811320754717, "grad_norm": 0.2890625, "learning_rate": 0.00019694844211069477, "loss": 0.9054, "step": 1740 }, { "epoch": 0.3429638364779874, "grad_norm": 0.279296875, "learning_rate": 0.0001969062504588887, "loss": 0.8919, "step": 1745 }, { "epoch": 0.34394654088050314, "grad_norm": 0.2890625, "learning_rate": 0.00019686377371171604, "loss": 0.8919, "step": 1750 }, { "epoch": 0.3449292452830189, "grad_norm": 0.2890625, "learning_rate": 0.0001968210119941421, "loss": 0.9133, "step": 1755 }, { "epoch": 0.34591194968553457, "grad_norm": 0.306640625, "learning_rate": 0.00019677796543197067, "loss": 0.8901, "step": 1760 }, { "epoch": 0.3468946540880503, "grad_norm": 0.330078125, "learning_rate": 0.0001967346341518434, "loss": 0.9216, "step": 1765 }, { "epoch": 0.34787735849056606, "grad_norm": 0.439453125, "learning_rate": 0.00019669101828123975, "loss": 0.9063, "step": 1770 }, { "epoch": 0.34886006289308175, "grad_norm": 0.341796875, "learning_rate": 0.00019664711794847625, "loss": 0.9262, "step": 1775 }, { "epoch": 0.3498427672955975, "grad_norm": 0.3203125, "learning_rate": 0.00019660293328270647, "loss": 0.9045, "step": 1780 }, { "epoch": 0.35082547169811323, "grad_norm": 0.30859375, "learning_rate": 0.00019655846441392035, "loss": 0.895, "step": 1785 }, { "epoch": 0.3518081761006289, "grad_norm": 0.515625, "learning_rate": 0.00019651371147294406, "loss": 0.8893, "step": 1790 }, { "epoch": 0.35279088050314467, "grad_norm": 0.373046875, "learning_rate": 0.00019646867459143942, "loss": 0.8772, "step": 1795 }, { "epoch": 0.35377358490566035, "grad_norm": 0.443359375, "learning_rate": 0.00019642335390190367, "loss": 0.9043, "step": 1800 }, { "epoch": 0.3547562893081761, "grad_norm": 0.37890625, "learning_rate": 0.0001963777495376689, "loss": 0.9123, "step": 1805 }, { "epoch": 0.35573899371069184, "grad_norm": 0.36328125, "learning_rate": 0.00019633186163290183, "loss": 0.9055, "step": 1810 }, { "epoch": 0.35672169811320753, "grad_norm": 0.3359375, "learning_rate": 0.00019628569032260334, "loss": 0.9055, "step": 1815 }, { "epoch": 0.3577044025157233, "grad_norm": 0.298828125, "learning_rate": 0.0001962392357426081, "loss": 0.8827, "step": 1820 }, { "epoch": 0.358687106918239, "grad_norm": 0.291015625, "learning_rate": 0.00019619249802958413, "loss": 0.9038, "step": 1825 }, { "epoch": 0.3596698113207547, "grad_norm": 0.31640625, "learning_rate": 0.00019614547732103242, "loss": 0.9204, "step": 1830 }, { "epoch": 0.36065251572327045, "grad_norm": 0.322265625, "learning_rate": 0.0001960981737552865, "loss": 0.9204, "step": 1835 }, { "epoch": 0.36163522012578614, "grad_norm": 0.294921875, "learning_rate": 0.00019605058747151208, "loss": 0.909, "step": 1840 }, { "epoch": 0.3626179245283019, "grad_norm": 0.34375, "learning_rate": 0.0001960027186097067, "loss": 0.8926, "step": 1845 }, { "epoch": 0.3636006289308176, "grad_norm": 0.318359375, "learning_rate": 0.00019595456731069904, "loss": 0.8953, "step": 1850 }, { "epoch": 0.3645833333333333, "grad_norm": 0.328125, "learning_rate": 0.00019590613371614892, "loss": 0.9001, "step": 1855 }, { "epoch": 0.36556603773584906, "grad_norm": 0.359375, "learning_rate": 0.00019585741796854654, "loss": 0.9132, "step": 1860 }, { "epoch": 0.3665487421383648, "grad_norm": 0.349609375, "learning_rate": 0.00019580842021121213, "loss": 0.8892, "step": 1865 }, { "epoch": 0.3675314465408805, "grad_norm": 0.51171875, "learning_rate": 0.00019575914058829577, "loss": 0.9132, "step": 1870 }, { "epoch": 0.36851415094339623, "grad_norm": 0.2890625, "learning_rate": 0.00019570957924477665, "loss": 0.8973, "step": 1875 }, { "epoch": 0.3694968553459119, "grad_norm": 0.35546875, "learning_rate": 0.00019565973632646277, "loss": 0.9052, "step": 1880 }, { "epoch": 0.37047955974842767, "grad_norm": 0.287109375, "learning_rate": 0.00019560961197999052, "loss": 0.8967, "step": 1885 }, { "epoch": 0.3714622641509434, "grad_norm": 0.390625, "learning_rate": 0.00019555920635282433, "loss": 0.8919, "step": 1890 }, { "epoch": 0.3724449685534591, "grad_norm": 0.36328125, "learning_rate": 0.0001955085195932561, "loss": 0.904, "step": 1895 }, { "epoch": 0.37342767295597484, "grad_norm": 0.361328125, "learning_rate": 0.00019545755185040474, "loss": 0.8911, "step": 1900 }, { "epoch": 0.3744103773584906, "grad_norm": 0.357421875, "learning_rate": 0.00019540630327421587, "loss": 0.8749, "step": 1905 }, { "epoch": 0.3753930817610063, "grad_norm": 0.296875, "learning_rate": 0.00019535477401546133, "loss": 0.9019, "step": 1910 }, { "epoch": 0.376375786163522, "grad_norm": 0.318359375, "learning_rate": 0.00019530296422573873, "loss": 0.9033, "step": 1915 }, { "epoch": 0.37735849056603776, "grad_norm": 0.314453125, "learning_rate": 0.00019525087405747088, "loss": 0.8746, "step": 1920 }, { "epoch": 0.37834119496855345, "grad_norm": 0.33203125, "learning_rate": 0.0001951985036639056, "loss": 0.908, "step": 1925 }, { "epoch": 0.3793238993710692, "grad_norm": 0.30078125, "learning_rate": 0.0001951458531991151, "loss": 0.8843, "step": 1930 }, { "epoch": 0.3803066037735849, "grad_norm": 0.423828125, "learning_rate": 0.0001950929228179954, "loss": 0.907, "step": 1935 }, { "epoch": 0.3812893081761006, "grad_norm": 0.333984375, "learning_rate": 0.00019503971267626621, "loss": 0.8921, "step": 1940 }, { "epoch": 0.38227201257861637, "grad_norm": 0.296875, "learning_rate": 0.00019498622293047025, "loss": 0.8808, "step": 1945 }, { "epoch": 0.38325471698113206, "grad_norm": 0.400390625, "learning_rate": 0.00019493245373797271, "loss": 0.9007, "step": 1950 }, { "epoch": 0.3842374213836478, "grad_norm": 0.39453125, "learning_rate": 0.00019487840525696105, "loss": 0.8806, "step": 1955 }, { "epoch": 0.38522012578616355, "grad_norm": 0.341796875, "learning_rate": 0.0001948240776464443, "loss": 0.9046, "step": 1960 }, { "epoch": 0.38620283018867924, "grad_norm": 0.287109375, "learning_rate": 0.00019476947106625273, "loss": 0.8808, "step": 1965 }, { "epoch": 0.387185534591195, "grad_norm": 0.30859375, "learning_rate": 0.0001947145856770373, "loss": 0.8979, "step": 1970 }, { "epoch": 0.38816823899371067, "grad_norm": 0.328125, "learning_rate": 0.0001946594216402692, "loss": 0.8732, "step": 1975 }, { "epoch": 0.3891509433962264, "grad_norm": 0.431640625, "learning_rate": 0.00019460397911823945, "loss": 0.8917, "step": 1980 }, { "epoch": 0.39013364779874216, "grad_norm": 0.41796875, "learning_rate": 0.00019454825827405834, "loss": 0.882, "step": 1985 }, { "epoch": 0.39111635220125784, "grad_norm": 0.388671875, "learning_rate": 0.00019449225927165492, "loss": 0.8909, "step": 1990 }, { "epoch": 0.3920990566037736, "grad_norm": 0.373046875, "learning_rate": 0.00019443598227577674, "loss": 0.8806, "step": 1995 }, { "epoch": 0.39308176100628933, "grad_norm": 0.439453125, "learning_rate": 0.00019437942745198893, "loss": 0.9044, "step": 2000 }, { "epoch": 0.394064465408805, "grad_norm": 0.349609375, "learning_rate": 0.00019432259496667424, "loss": 0.9004, "step": 2005 }, { "epoch": 0.39504716981132076, "grad_norm": 0.296875, "learning_rate": 0.00019426548498703217, "loss": 0.9082, "step": 2010 }, { "epoch": 0.39602987421383645, "grad_norm": 0.29296875, "learning_rate": 0.0001942080976810786, "loss": 0.8808, "step": 2015 }, { "epoch": 0.3970125786163522, "grad_norm": 0.36328125, "learning_rate": 0.00019415043321764527, "loss": 0.8773, "step": 2020 }, { "epoch": 0.39799528301886794, "grad_norm": 0.32421875, "learning_rate": 0.00019409249176637945, "loss": 0.8632, "step": 2025 }, { "epoch": 0.39897798742138363, "grad_norm": 0.37890625, "learning_rate": 0.00019403427349774314, "loss": 0.9005, "step": 2030 }, { "epoch": 0.3999606918238994, "grad_norm": 0.36328125, "learning_rate": 0.0001939757785830128, "loss": 0.8913, "step": 2035 }, { "epoch": 0.4009433962264151, "grad_norm": 0.294921875, "learning_rate": 0.00019391700719427872, "loss": 0.8829, "step": 2040 }, { "epoch": 0.4019261006289308, "grad_norm": 0.28515625, "learning_rate": 0.00019385795950444473, "loss": 0.8666, "step": 2045 }, { "epoch": 0.40290880503144655, "grad_norm": 0.33984375, "learning_rate": 0.00019379863568722732, "loss": 0.9056, "step": 2050 }, { "epoch": 0.40389150943396224, "grad_norm": 0.361328125, "learning_rate": 0.00019373903591715544, "loss": 0.887, "step": 2055 }, { "epoch": 0.404874213836478, "grad_norm": 0.494140625, "learning_rate": 0.0001936791603695699, "loss": 0.8828, "step": 2060 }, { "epoch": 0.4058569182389937, "grad_norm": 0.296875, "learning_rate": 0.00019361900922062282, "loss": 0.9003, "step": 2065 }, { "epoch": 0.4068396226415094, "grad_norm": 0.326171875, "learning_rate": 0.00019355858264727714, "loss": 0.8674, "step": 2070 }, { "epoch": 0.40782232704402516, "grad_norm": 0.2890625, "learning_rate": 0.00019349788082730603, "loss": 0.877, "step": 2075 }, { "epoch": 0.4088050314465409, "grad_norm": 0.279296875, "learning_rate": 0.00019343690393929251, "loss": 0.8493, "step": 2080 }, { "epoch": 0.4097877358490566, "grad_norm": 0.3828125, "learning_rate": 0.00019337565216262878, "loss": 0.8754, "step": 2085 }, { "epoch": 0.41077044025157233, "grad_norm": 0.306640625, "learning_rate": 0.00019331412567751585, "loss": 0.8928, "step": 2090 }, { "epoch": 0.4117531446540881, "grad_norm": 0.4609375, "learning_rate": 0.0001932523246649628, "loss": 0.8881, "step": 2095 }, { "epoch": 0.41273584905660377, "grad_norm": 0.330078125, "learning_rate": 0.0001931902493067864, "loss": 0.8883, "step": 2100 }, { "epoch": 0.4137185534591195, "grad_norm": 0.29296875, "learning_rate": 0.00019312789978561057, "loss": 0.8558, "step": 2105 }, { "epoch": 0.4147012578616352, "grad_norm": 0.3515625, "learning_rate": 0.00019306527628486578, "loss": 0.8752, "step": 2110 }, { "epoch": 0.41568396226415094, "grad_norm": 0.298828125, "learning_rate": 0.00019300237898878852, "loss": 0.9016, "step": 2115 }, { "epoch": 0.4166666666666667, "grad_norm": 0.34765625, "learning_rate": 0.00019293920808242083, "loss": 0.8701, "step": 2120 }, { "epoch": 0.4176493710691824, "grad_norm": 0.375, "learning_rate": 0.00019287576375160968, "loss": 0.8905, "step": 2125 }, { "epoch": 0.4186320754716981, "grad_norm": 0.51953125, "learning_rate": 0.00019281204618300644, "loss": 0.8675, "step": 2130 }, { "epoch": 0.41961477987421386, "grad_norm": 0.314453125, "learning_rate": 0.00019274805556406633, "loss": 0.8766, "step": 2135 }, { "epoch": 0.42059748427672955, "grad_norm": 0.376953125, "learning_rate": 0.00019268379208304789, "loss": 0.8839, "step": 2140 }, { "epoch": 0.4215801886792453, "grad_norm": 0.326171875, "learning_rate": 0.0001926192559290124, "loss": 0.9031, "step": 2145 }, { "epoch": 0.422562893081761, "grad_norm": 0.373046875, "learning_rate": 0.00019255444729182337, "loss": 0.8841, "step": 2150 }, { "epoch": 0.4235455974842767, "grad_norm": 0.318359375, "learning_rate": 0.00019248936636214592, "loss": 0.8584, "step": 2155 }, { "epoch": 0.42452830188679247, "grad_norm": 0.291015625, "learning_rate": 0.00019242401333144623, "loss": 0.8829, "step": 2160 }, { "epoch": 0.42551100628930816, "grad_norm": 0.310546875, "learning_rate": 0.00019235838839199102, "loss": 0.8676, "step": 2165 }, { "epoch": 0.4264937106918239, "grad_norm": 0.34765625, "learning_rate": 0.00019229249173684693, "loss": 0.8983, "step": 2170 }, { "epoch": 0.42747641509433965, "grad_norm": 0.314453125, "learning_rate": 0.00019222632355988007, "loss": 0.8851, "step": 2175 }, { "epoch": 0.42845911949685533, "grad_norm": 0.33984375, "learning_rate": 0.00019215988405575524, "loss": 0.8752, "step": 2180 }, { "epoch": 0.4294418238993711, "grad_norm": 0.369140625, "learning_rate": 0.0001920931734199355, "loss": 0.87, "step": 2185 }, { "epoch": 0.43042452830188677, "grad_norm": 0.42578125, "learning_rate": 0.00019202619184868167, "loss": 0.8567, "step": 2190 }, { "epoch": 0.4314072327044025, "grad_norm": 0.322265625, "learning_rate": 0.00019195893953905153, "loss": 0.8719, "step": 2195 }, { "epoch": 0.43238993710691825, "grad_norm": 0.35546875, "learning_rate": 0.00019189141668889942, "loss": 0.8854, "step": 2200 }, { "epoch": 0.43337264150943394, "grad_norm": 0.31640625, "learning_rate": 0.00019182362349687559, "loss": 0.8816, "step": 2205 }, { "epoch": 0.4343553459119497, "grad_norm": 0.36328125, "learning_rate": 0.00019175556016242566, "loss": 0.8858, "step": 2210 }, { "epoch": 0.43533805031446543, "grad_norm": 0.3515625, "learning_rate": 0.00019168722688578998, "loss": 0.8927, "step": 2215 }, { "epoch": 0.4363207547169811, "grad_norm": 0.7734375, "learning_rate": 0.00019161862386800303, "loss": 0.8676, "step": 2220 }, { "epoch": 0.43730345911949686, "grad_norm": 0.35546875, "learning_rate": 0.00019154975131089293, "loss": 0.861, "step": 2225 }, { "epoch": 0.4382861635220126, "grad_norm": 0.3828125, "learning_rate": 0.0001914806094170807, "loss": 0.894, "step": 2230 }, { "epoch": 0.4392688679245283, "grad_norm": 0.318359375, "learning_rate": 0.00019141119838997982, "loss": 0.8662, "step": 2235 }, { "epoch": 0.44025157232704404, "grad_norm": 0.373046875, "learning_rate": 0.00019134151843379544, "loss": 0.8717, "step": 2240 }, { "epoch": 0.4412342767295597, "grad_norm": 0.322265625, "learning_rate": 0.00019127156975352406, "loss": 0.8654, "step": 2245 }, { "epoch": 0.44221698113207547, "grad_norm": 0.310546875, "learning_rate": 0.00019120135255495257, "loss": 0.8714, "step": 2250 }, { "epoch": 0.4431996855345912, "grad_norm": 0.3046875, "learning_rate": 0.00019113086704465796, "loss": 0.8621, "step": 2255 }, { "epoch": 0.4441823899371069, "grad_norm": 0.314453125, "learning_rate": 0.00019106011343000655, "loss": 0.8671, "step": 2260 }, { "epoch": 0.44516509433962265, "grad_norm": 0.296875, "learning_rate": 0.00019098909191915344, "loss": 0.8493, "step": 2265 }, { "epoch": 0.4461477987421384, "grad_norm": 0.34765625, "learning_rate": 0.00019091780272104182, "loss": 0.8721, "step": 2270 }, { "epoch": 0.4471305031446541, "grad_norm": 0.5234375, "learning_rate": 0.0001908462460454024, "loss": 0.9037, "step": 2275 }, { "epoch": 0.4481132075471698, "grad_norm": 0.40234375, "learning_rate": 0.0001907744221027529, "loss": 0.8682, "step": 2280 }, { "epoch": 0.4490959119496855, "grad_norm": 0.3203125, "learning_rate": 0.00019070233110439721, "loss": 0.8732, "step": 2285 }, { "epoch": 0.45007861635220126, "grad_norm": 0.310546875, "learning_rate": 0.000190629973262425, "loss": 0.8801, "step": 2290 }, { "epoch": 0.451061320754717, "grad_norm": 0.314453125, "learning_rate": 0.0001905573487897109, "loss": 0.8906, "step": 2295 }, { "epoch": 0.4520440251572327, "grad_norm": 0.361328125, "learning_rate": 0.000190484457899914, "loss": 0.8755, "step": 2300 }, { "epoch": 0.45302672955974843, "grad_norm": 0.6484375, "learning_rate": 0.00019041130080747718, "loss": 0.8798, "step": 2305 }, { "epoch": 0.4540094339622642, "grad_norm": 0.34375, "learning_rate": 0.00019033787772762645, "loss": 0.8659, "step": 2310 }, { "epoch": 0.45499213836477986, "grad_norm": 0.3203125, "learning_rate": 0.0001902641888763704, "loss": 0.8803, "step": 2315 }, { "epoch": 0.4559748427672956, "grad_norm": 0.3046875, "learning_rate": 0.00019019023447049951, "loss": 0.8882, "step": 2320 }, { "epoch": 0.4569575471698113, "grad_norm": 0.29296875, "learning_rate": 0.0001901160147275854, "loss": 0.8623, "step": 2325 }, { "epoch": 0.45794025157232704, "grad_norm": 0.35546875, "learning_rate": 0.00019004152986598052, "loss": 0.8802, "step": 2330 }, { "epoch": 0.4589229559748428, "grad_norm": 0.283203125, "learning_rate": 0.00018996678010481705, "loss": 0.8626, "step": 2335 }, { "epoch": 0.45990566037735847, "grad_norm": 0.3046875, "learning_rate": 0.00018989176566400667, "loss": 0.8727, "step": 2340 }, { "epoch": 0.4608883647798742, "grad_norm": 0.306640625, "learning_rate": 0.00018981648676423966, "loss": 0.8822, "step": 2345 }, { "epoch": 0.46187106918238996, "grad_norm": 0.380859375, "learning_rate": 0.00018974094362698437, "loss": 0.849, "step": 2350 }, { "epoch": 0.46285377358490565, "grad_norm": 0.44140625, "learning_rate": 0.0001896651364744865, "loss": 0.8585, "step": 2355 }, { "epoch": 0.4638364779874214, "grad_norm": 0.28515625, "learning_rate": 0.00018958906552976842, "loss": 0.8691, "step": 2360 }, { "epoch": 0.4648191823899371, "grad_norm": 0.302734375, "learning_rate": 0.00018951273101662874, "loss": 0.8903, "step": 2365 }, { "epoch": 0.4658018867924528, "grad_norm": 0.42578125, "learning_rate": 0.00018943613315964132, "loss": 0.8721, "step": 2370 }, { "epoch": 0.46678459119496857, "grad_norm": 0.484375, "learning_rate": 0.00018935927218415483, "loss": 0.8842, "step": 2375 }, { "epoch": 0.46776729559748426, "grad_norm": 0.453125, "learning_rate": 0.00018928214831629204, "loss": 0.8656, "step": 2380 }, { "epoch": 0.46875, "grad_norm": 0.365234375, "learning_rate": 0.00018920476178294909, "loss": 0.8631, "step": 2385 }, { "epoch": 0.46973270440251574, "grad_norm": 0.314453125, "learning_rate": 0.0001891271128117949, "loss": 0.8863, "step": 2390 }, { "epoch": 0.47071540880503143, "grad_norm": 0.296875, "learning_rate": 0.00018904920163127054, "loss": 0.863, "step": 2395 }, { "epoch": 0.4716981132075472, "grad_norm": 0.357421875, "learning_rate": 0.00018897102847058837, "loss": 0.8759, "step": 2400 }, { "epoch": 0.4726808176100629, "grad_norm": 0.33203125, "learning_rate": 0.00018889259355973163, "loss": 0.8641, "step": 2405 }, { "epoch": 0.4736635220125786, "grad_norm": 0.373046875, "learning_rate": 0.00018881389712945349, "loss": 0.8534, "step": 2410 }, { "epoch": 0.47464622641509435, "grad_norm": 0.55078125, "learning_rate": 0.00018873493941127652, "loss": 0.864, "step": 2415 }, { "epoch": 0.47562893081761004, "grad_norm": 0.408203125, "learning_rate": 0.0001886557206374921, "loss": 0.8702, "step": 2420 }, { "epoch": 0.4766116352201258, "grad_norm": 0.291015625, "learning_rate": 0.0001885762410411595, "loss": 0.8608, "step": 2425 }, { "epoch": 0.47759433962264153, "grad_norm": 0.30859375, "learning_rate": 0.0001884965008561054, "loss": 0.8864, "step": 2430 }, { "epoch": 0.4785770440251572, "grad_norm": 0.294921875, "learning_rate": 0.00018841650031692312, "loss": 0.875, "step": 2435 }, { "epoch": 0.47955974842767296, "grad_norm": 0.3046875, "learning_rate": 0.0001883362396589719, "loss": 0.8455, "step": 2440 }, { "epoch": 0.4805424528301887, "grad_norm": 0.34375, "learning_rate": 0.00018825571911837625, "loss": 0.8611, "step": 2445 }, { "epoch": 0.4815251572327044, "grad_norm": 0.40625, "learning_rate": 0.00018817493893202527, "loss": 0.8669, "step": 2450 }, { "epoch": 0.48250786163522014, "grad_norm": 0.37890625, "learning_rate": 0.0001880938993375719, "loss": 0.8904, "step": 2455 }, { "epoch": 0.4834905660377358, "grad_norm": 0.287109375, "learning_rate": 0.0001880126005734323, "loss": 0.8513, "step": 2460 }, { "epoch": 0.48447327044025157, "grad_norm": 0.314453125, "learning_rate": 0.00018793104287878504, "loss": 0.8556, "step": 2465 }, { "epoch": 0.4854559748427673, "grad_norm": 0.2890625, "learning_rate": 0.00018784922649357045, "loss": 0.8426, "step": 2470 }, { "epoch": 0.486438679245283, "grad_norm": 0.38671875, "learning_rate": 0.00018776715165849003, "loss": 0.8405, "step": 2475 }, { "epoch": 0.48742138364779874, "grad_norm": 0.369140625, "learning_rate": 0.00018768481861500548, "loss": 0.8591, "step": 2480 }, { "epoch": 0.4884040880503145, "grad_norm": 0.3359375, "learning_rate": 0.00018760222760533826, "loss": 0.8661, "step": 2485 }, { "epoch": 0.4893867924528302, "grad_norm": 0.3046875, "learning_rate": 0.0001875193788724687, "loss": 0.8563, "step": 2490 }, { "epoch": 0.4903694968553459, "grad_norm": 0.306640625, "learning_rate": 0.00018743627266013535, "loss": 0.8365, "step": 2495 }, { "epoch": 0.4913522012578616, "grad_norm": 0.38671875, "learning_rate": 0.0001873529092128343, "loss": 0.8474, "step": 2500 }, { "epoch": 0.49233490566037735, "grad_norm": 0.365234375, "learning_rate": 0.0001872692887758184, "loss": 0.8671, "step": 2505 }, { "epoch": 0.4933176100628931, "grad_norm": 0.306640625, "learning_rate": 0.00018718541159509644, "loss": 0.8693, "step": 2510 }, { "epoch": 0.4943003144654088, "grad_norm": 0.36328125, "learning_rate": 0.0001871012779174327, "loss": 0.8819, "step": 2515 }, { "epoch": 0.49528301886792453, "grad_norm": 0.4453125, "learning_rate": 0.00018701688799034605, "loss": 0.8527, "step": 2520 }, { "epoch": 0.4962657232704403, "grad_norm": 0.490234375, "learning_rate": 0.00018693224206210919, "loss": 0.8662, "step": 2525 }, { "epoch": 0.49724842767295596, "grad_norm": 0.28125, "learning_rate": 0.0001868473403817479, "loss": 0.8502, "step": 2530 }, { "epoch": 0.4982311320754717, "grad_norm": 0.345703125, "learning_rate": 0.00018676218319904048, "loss": 0.8828, "step": 2535 }, { "epoch": 0.4992138364779874, "grad_norm": 0.30859375, "learning_rate": 0.00018667677076451695, "loss": 0.8674, "step": 2540 }, { "epoch": 0.5001965408805031, "grad_norm": 0.376953125, "learning_rate": 0.00018659110332945814, "loss": 0.8574, "step": 2545 }, { "epoch": 0.5011792452830188, "grad_norm": 0.37890625, "learning_rate": 0.00018650518114589516, "loss": 0.8442, "step": 2550 }, { "epoch": 0.5021619496855346, "grad_norm": 0.38671875, "learning_rate": 0.0001864190044666086, "loss": 0.8537, "step": 2555 }, { "epoch": 0.5031446540880503, "grad_norm": 0.31640625, "learning_rate": 0.0001863325735451277, "loss": 0.8644, "step": 2560 }, { "epoch": 0.504127358490566, "grad_norm": 0.30859375, "learning_rate": 0.00018624588863572973, "loss": 0.8486, "step": 2565 }, { "epoch": 0.5051100628930818, "grad_norm": 0.287109375, "learning_rate": 0.00018615894999343918, "loss": 0.8799, "step": 2570 }, { "epoch": 0.5060927672955975, "grad_norm": 0.39453125, "learning_rate": 0.00018607175787402696, "loss": 0.8628, "step": 2575 }, { "epoch": 0.5070754716981132, "grad_norm": 0.36328125, "learning_rate": 0.00018598431253400986, "loss": 0.8723, "step": 2580 }, { "epoch": 0.508058176100629, "grad_norm": 0.29296875, "learning_rate": 0.00018589661423064937, "loss": 0.8562, "step": 2585 }, { "epoch": 0.5090408805031447, "grad_norm": 0.296875, "learning_rate": 0.00018580866322195143, "loss": 0.8732, "step": 2590 }, { "epoch": 0.5100235849056604, "grad_norm": 0.33984375, "learning_rate": 0.00018572045976666534, "loss": 0.8642, "step": 2595 }, { "epoch": 0.5110062893081762, "grad_norm": 0.33984375, "learning_rate": 0.0001856320041242831, "loss": 0.859, "step": 2600 }, { "epoch": 0.5119889937106918, "grad_norm": 0.298828125, "learning_rate": 0.00018554329655503865, "loss": 0.8681, "step": 2605 }, { "epoch": 0.5129716981132075, "grad_norm": 0.392578125, "learning_rate": 0.000185454337319907, "loss": 0.8687, "step": 2610 }, { "epoch": 0.5139544025157232, "grad_norm": 0.4140625, "learning_rate": 0.0001853651266806037, "loss": 0.8473, "step": 2615 }, { "epoch": 0.514937106918239, "grad_norm": 0.408203125, "learning_rate": 0.00018527566489958384, "loss": 0.8722, "step": 2620 }, { "epoch": 0.5159198113207547, "grad_norm": 0.3203125, "learning_rate": 0.00018518595224004136, "loss": 0.8521, "step": 2625 }, { "epoch": 0.5169025157232704, "grad_norm": 0.29296875, "learning_rate": 0.0001850959889659083, "loss": 0.8802, "step": 2630 }, { "epoch": 0.5178852201257862, "grad_norm": 0.291015625, "learning_rate": 0.00018500577534185397, "loss": 0.8393, "step": 2635 }, { "epoch": 0.5188679245283019, "grad_norm": 0.30078125, "learning_rate": 0.0001849153116332843, "loss": 0.8637, "step": 2640 }, { "epoch": 0.5198506289308176, "grad_norm": 0.302734375, "learning_rate": 0.00018482459810634076, "loss": 0.8627, "step": 2645 }, { "epoch": 0.5208333333333334, "grad_norm": 0.33984375, "learning_rate": 0.0001847336350279, "loss": 0.8557, "step": 2650 }, { "epoch": 0.5218160377358491, "grad_norm": 0.33203125, "learning_rate": 0.00018464242266557273, "loss": 0.863, "step": 2655 }, { "epoch": 0.5227987421383647, "grad_norm": 0.30859375, "learning_rate": 0.00018455096128770307, "loss": 0.8662, "step": 2660 }, { "epoch": 0.5237814465408805, "grad_norm": 0.310546875, "learning_rate": 0.00018445925116336768, "loss": 0.8416, "step": 2665 }, { "epoch": 0.5247641509433962, "grad_norm": 0.318359375, "learning_rate": 0.00018436729256237516, "loss": 0.8424, "step": 2670 }, { "epoch": 0.5257468553459119, "grad_norm": 0.298828125, "learning_rate": 0.00018427508575526494, "loss": 0.862, "step": 2675 }, { "epoch": 0.5267295597484277, "grad_norm": 0.328125, "learning_rate": 0.00018418263101330684, "loss": 0.8768, "step": 2680 }, { "epoch": 0.5277122641509434, "grad_norm": 0.498046875, "learning_rate": 0.00018408992860849996, "loss": 0.8545, "step": 2685 }, { "epoch": 0.5286949685534591, "grad_norm": 0.3125, "learning_rate": 0.00018399697881357212, "loss": 0.8423, "step": 2690 }, { "epoch": 0.5296776729559748, "grad_norm": 0.41015625, "learning_rate": 0.0001839037819019789, "loss": 0.8504, "step": 2695 }, { "epoch": 0.5306603773584906, "grad_norm": 0.4375, "learning_rate": 0.00018381033814790287, "loss": 0.896, "step": 2700 }, { "epoch": 0.5316430817610063, "grad_norm": 0.390625, "learning_rate": 0.00018371664782625287, "loss": 0.8659, "step": 2705 }, { "epoch": 0.532625786163522, "grad_norm": 0.3828125, "learning_rate": 0.00018362271121266307, "loss": 0.8628, "step": 2710 }, { "epoch": 0.5336084905660378, "grad_norm": 0.2890625, "learning_rate": 0.00018352852858349227, "loss": 0.8289, "step": 2715 }, { "epoch": 0.5345911949685535, "grad_norm": 0.3046875, "learning_rate": 0.000183434100215823, "loss": 0.8347, "step": 2720 }, { "epoch": 0.5355738993710691, "grad_norm": 0.298828125, "learning_rate": 0.00018333942638746082, "loss": 0.8593, "step": 2725 }, { "epoch": 0.5365566037735849, "grad_norm": 0.30078125, "learning_rate": 0.0001832445073769333, "loss": 0.8467, "step": 2730 }, { "epoch": 0.5375393081761006, "grad_norm": 0.291015625, "learning_rate": 0.00018314934346348947, "loss": 0.8591, "step": 2735 }, { "epoch": 0.5385220125786163, "grad_norm": 0.361328125, "learning_rate": 0.00018305393492709874, "loss": 0.8566, "step": 2740 }, { "epoch": 0.5395047169811321, "grad_norm": 0.408203125, "learning_rate": 0.0001829582820484503, "loss": 0.8534, "step": 2745 }, { "epoch": 0.5404874213836478, "grad_norm": 0.33203125, "learning_rate": 0.00018286238510895208, "loss": 0.8593, "step": 2750 }, { "epoch": 0.5414701257861635, "grad_norm": 0.298828125, "learning_rate": 0.00018276624439073012, "loss": 0.8619, "step": 2755 }, { "epoch": 0.5424528301886793, "grad_norm": 0.5234375, "learning_rate": 0.0001826698601766276, "loss": 0.8458, "step": 2760 }, { "epoch": 0.543435534591195, "grad_norm": 0.328125, "learning_rate": 0.00018257323275020407, "loss": 0.8443, "step": 2765 }, { "epoch": 0.5444182389937107, "grad_norm": 0.294921875, "learning_rate": 0.0001824763623957346, "loss": 0.8522, "step": 2770 }, { "epoch": 0.5454009433962265, "grad_norm": 0.3125, "learning_rate": 0.00018237924939820896, "loss": 0.8463, "step": 2775 }, { "epoch": 0.5463836477987422, "grad_norm": 0.365234375, "learning_rate": 0.00018228189404333075, "loss": 0.8557, "step": 2780 }, { "epoch": 0.5473663522012578, "grad_norm": 0.443359375, "learning_rate": 0.0001821842966175166, "loss": 0.8483, "step": 2785 }, { "epoch": 0.5483490566037735, "grad_norm": 0.291015625, "learning_rate": 0.00018208645740789528, "loss": 0.8223, "step": 2790 }, { "epoch": 0.5493317610062893, "grad_norm": 0.31640625, "learning_rate": 0.00018198837670230694, "loss": 0.8504, "step": 2795 }, { "epoch": 0.550314465408805, "grad_norm": 0.298828125, "learning_rate": 0.0001818900547893021, "loss": 0.8426, "step": 2800 }, { "epoch": 0.5512971698113207, "grad_norm": 0.29296875, "learning_rate": 0.00018179149195814097, "loss": 0.8764, "step": 2805 }, { "epoch": 0.5522798742138365, "grad_norm": 0.33984375, "learning_rate": 0.0001816926884987926, "loss": 0.8288, "step": 2810 }, { "epoch": 0.5532625786163522, "grad_norm": 0.33984375, "learning_rate": 0.00018159364470193381, "loss": 0.841, "step": 2815 }, { "epoch": 0.5542452830188679, "grad_norm": 0.435546875, "learning_rate": 0.0001814943608589486, "loss": 0.8647, "step": 2820 }, { "epoch": 0.5552279874213837, "grad_norm": 0.287109375, "learning_rate": 0.00018139483726192714, "loss": 0.8722, "step": 2825 }, { "epoch": 0.5562106918238994, "grad_norm": 0.30078125, "learning_rate": 0.00018129507420366493, "loss": 0.8462, "step": 2830 }, { "epoch": 0.5571933962264151, "grad_norm": 0.400390625, "learning_rate": 0.00018119507197766202, "loss": 0.8596, "step": 2835 }, { "epoch": 0.5581761006289309, "grad_norm": 0.326171875, "learning_rate": 0.00018109483087812205, "loss": 0.8241, "step": 2840 }, { "epoch": 0.5591588050314465, "grad_norm": 0.375, "learning_rate": 0.00018099435119995136, "loss": 0.8511, "step": 2845 }, { "epoch": 0.5601415094339622, "grad_norm": 0.318359375, "learning_rate": 0.0001808936332387583, "loss": 0.8544, "step": 2850 }, { "epoch": 0.561124213836478, "grad_norm": 0.291015625, "learning_rate": 0.00018079267729085213, "loss": 0.8468, "step": 2855 }, { "epoch": 0.5621069182389937, "grad_norm": 0.3515625, "learning_rate": 0.00018069148365324237, "loss": 0.8427, "step": 2860 }, { "epoch": 0.5630896226415094, "grad_norm": 0.34375, "learning_rate": 0.0001805900526236377, "loss": 0.8559, "step": 2865 }, { "epoch": 0.5640723270440252, "grad_norm": 0.42578125, "learning_rate": 0.00018048838450044526, "loss": 0.88, "step": 2870 }, { "epoch": 0.5650550314465409, "grad_norm": 0.453125, "learning_rate": 0.0001803864795827697, "loss": 0.8573, "step": 2875 }, { "epoch": 0.5660377358490566, "grad_norm": 0.400390625, "learning_rate": 0.00018028433817041236, "loss": 0.8477, "step": 2880 }, { "epoch": 0.5670204402515723, "grad_norm": 0.361328125, "learning_rate": 0.0001801819605638703, "loss": 0.8425, "step": 2885 }, { "epoch": 0.5680031446540881, "grad_norm": 0.322265625, "learning_rate": 0.00018007934706433542, "loss": 0.8495, "step": 2890 }, { "epoch": 0.5689858490566038, "grad_norm": 0.294921875, "learning_rate": 0.00017997649797369365, "loss": 0.8543, "step": 2895 }, { "epoch": 0.5699685534591195, "grad_norm": 0.31640625, "learning_rate": 0.00017987341359452404, "loss": 0.8473, "step": 2900 }, { "epoch": 0.5709512578616353, "grad_norm": 0.318359375, "learning_rate": 0.0001797700942300978, "loss": 0.8441, "step": 2905 }, { "epoch": 0.5719339622641509, "grad_norm": 0.29296875, "learning_rate": 0.00017966654018437757, "loss": 0.8342, "step": 2910 }, { "epoch": 0.5729166666666666, "grad_norm": 0.41796875, "learning_rate": 0.00017956275176201624, "loss": 0.8525, "step": 2915 }, { "epoch": 0.5738993710691824, "grad_norm": 0.33984375, "learning_rate": 0.00017945872926835636, "loss": 0.8348, "step": 2920 }, { "epoch": 0.5748820754716981, "grad_norm": 0.31640625, "learning_rate": 0.0001793544730094291, "loss": 0.8544, "step": 2925 }, { "epoch": 0.5758647798742138, "grad_norm": 0.353515625, "learning_rate": 0.00017924998329195332, "loss": 0.8167, "step": 2930 }, { "epoch": 0.5768474842767296, "grad_norm": 0.296875, "learning_rate": 0.00017914526042333475, "loss": 0.8233, "step": 2935 }, { "epoch": 0.5778301886792453, "grad_norm": 0.455078125, "learning_rate": 0.00017904030471166496, "loss": 0.8263, "step": 2940 }, { "epoch": 0.578812893081761, "grad_norm": 0.498046875, "learning_rate": 0.00017893511646572066, "loss": 0.8455, "step": 2945 }, { "epoch": 0.5797955974842768, "grad_norm": 0.3671875, "learning_rate": 0.00017882969599496254, "loss": 0.8454, "step": 2950 }, { "epoch": 0.5807783018867925, "grad_norm": 0.337890625, "learning_rate": 0.00017872404360953466, "loss": 0.8488, "step": 2955 }, { "epoch": 0.5817610062893082, "grad_norm": 0.337890625, "learning_rate": 0.00017861815962026315, "loss": 0.8565, "step": 2960 }, { "epoch": 0.5827437106918238, "grad_norm": 0.31640625, "learning_rate": 0.00017851204433865566, "loss": 0.8341, "step": 2965 }, { "epoch": 0.5837264150943396, "grad_norm": 0.337890625, "learning_rate": 0.00017840569807690032, "loss": 0.8458, "step": 2970 }, { "epoch": 0.5847091194968553, "grad_norm": 0.490234375, "learning_rate": 0.00017829912114786462, "loss": 0.8504, "step": 2975 }, { "epoch": 0.585691823899371, "grad_norm": 0.328125, "learning_rate": 0.00017819231386509486, "loss": 0.854, "step": 2980 }, { "epoch": 0.5866745283018868, "grad_norm": 0.353515625, "learning_rate": 0.00017808527654281496, "loss": 0.8593, "step": 2985 }, { "epoch": 0.5876572327044025, "grad_norm": 0.291015625, "learning_rate": 0.00017797800949592558, "loss": 0.8598, "step": 2990 }, { "epoch": 0.5886399371069182, "grad_norm": 0.32421875, "learning_rate": 0.00017787051304000322, "loss": 0.8317, "step": 2995 }, { "epoch": 0.589622641509434, "grad_norm": 0.314453125, "learning_rate": 0.00017776278749129937, "loss": 0.8591, "step": 3000 }, { "epoch": 0.5906053459119497, "grad_norm": 0.455078125, "learning_rate": 0.00017765483316673945, "loss": 0.8543, "step": 3005 }, { "epoch": 0.5915880503144654, "grad_norm": 0.294921875, "learning_rate": 0.0001775466503839219, "loss": 0.8477, "step": 3010 }, { "epoch": 0.5925707547169812, "grad_norm": 0.466796875, "learning_rate": 0.00017743823946111736, "loss": 0.8337, "step": 3015 }, { "epoch": 0.5935534591194969, "grad_norm": 0.298828125, "learning_rate": 0.00017732960071726762, "loss": 0.8434, "step": 3020 }, { "epoch": 0.5945361635220126, "grad_norm": 0.333984375, "learning_rate": 0.00017722073447198466, "loss": 0.8241, "step": 3025 }, { "epoch": 0.5955188679245284, "grad_norm": 0.35546875, "learning_rate": 0.00017711164104554982, "loss": 0.8291, "step": 3030 }, { "epoch": 0.596501572327044, "grad_norm": 0.296875, "learning_rate": 0.00017700232075891278, "loss": 0.831, "step": 3035 }, { "epoch": 0.5974842767295597, "grad_norm": 0.29296875, "learning_rate": 0.00017689277393369063, "loss": 0.8397, "step": 3040 }, { "epoch": 0.5984669811320755, "grad_norm": 0.306640625, "learning_rate": 0.00017678300089216692, "loss": 0.8274, "step": 3045 }, { "epoch": 0.5994496855345912, "grad_norm": 0.31640625, "learning_rate": 0.00017667300195729082, "loss": 0.8553, "step": 3050 }, { "epoch": 0.6004323899371069, "grad_norm": 0.30859375, "learning_rate": 0.00017656277745267592, "loss": 0.8322, "step": 3055 }, { "epoch": 0.6014150943396226, "grad_norm": 0.3515625, "learning_rate": 0.00017645232770259952, "loss": 0.8481, "step": 3060 }, { "epoch": 0.6023977987421384, "grad_norm": 0.306640625, "learning_rate": 0.00017634165303200157, "loss": 0.8373, "step": 3065 }, { "epoch": 0.6033805031446541, "grad_norm": 0.31640625, "learning_rate": 0.00017623075376648376, "loss": 0.8587, "step": 3070 }, { "epoch": 0.6043632075471698, "grad_norm": 0.298828125, "learning_rate": 0.00017611963023230845, "loss": 0.8483, "step": 3075 }, { "epoch": 0.6053459119496856, "grad_norm": 0.33203125, "learning_rate": 0.0001760082827563979, "loss": 0.8226, "step": 3080 }, { "epoch": 0.6063286163522013, "grad_norm": 0.310546875, "learning_rate": 0.00017589671166633303, "loss": 0.8524, "step": 3085 }, { "epoch": 0.6073113207547169, "grad_norm": 0.5078125, "learning_rate": 0.00017578491729035287, "loss": 0.8654, "step": 3090 }, { "epoch": 0.6082940251572327, "grad_norm": 0.294921875, "learning_rate": 0.00017567289995735314, "loss": 0.8372, "step": 3095 }, { "epoch": 0.6092767295597484, "grad_norm": 0.314453125, "learning_rate": 0.00017556065999688557, "loss": 0.8416, "step": 3100 }, { "epoch": 0.6102594339622641, "grad_norm": 0.416015625, "learning_rate": 0.0001754481977391569, "loss": 0.8206, "step": 3105 }, { "epoch": 0.6112421383647799, "grad_norm": 0.3359375, "learning_rate": 0.00017533551351502782, "loss": 0.8267, "step": 3110 }, { "epoch": 0.6122248427672956, "grad_norm": 0.306640625, "learning_rate": 0.00017522260765601196, "loss": 0.8503, "step": 3115 }, { "epoch": 0.6132075471698113, "grad_norm": 0.29296875, "learning_rate": 0.00017510948049427513, "loss": 0.8227, "step": 3120 }, { "epoch": 0.6141902515723271, "grad_norm": 0.3046875, "learning_rate": 0.00017499613236263413, "loss": 0.8517, "step": 3125 }, { "epoch": 0.6151729559748428, "grad_norm": 0.318359375, "learning_rate": 0.00017488256359455586, "loss": 0.8442, "step": 3130 }, { "epoch": 0.6161556603773585, "grad_norm": 0.34765625, "learning_rate": 0.00017476877452415638, "loss": 0.8443, "step": 3135 }, { "epoch": 0.6171383647798742, "grad_norm": 0.33984375, "learning_rate": 0.00017465476548619974, "loss": 0.8332, "step": 3140 }, { "epoch": 0.61812106918239, "grad_norm": 0.298828125, "learning_rate": 0.0001745405368160972, "loss": 0.8067, "step": 3145 }, { "epoch": 0.6191037735849056, "grad_norm": 0.30078125, "learning_rate": 0.0001744260888499063, "loss": 0.8161, "step": 3150 }, { "epoch": 0.6200864779874213, "grad_norm": 0.298828125, "learning_rate": 0.0001743114219243295, "loss": 0.8256, "step": 3155 }, { "epoch": 0.6210691823899371, "grad_norm": 0.314453125, "learning_rate": 0.0001741965363767136, "loss": 0.8252, "step": 3160 }, { "epoch": 0.6220518867924528, "grad_norm": 0.357421875, "learning_rate": 0.00017408143254504856, "loss": 0.8465, "step": 3165 }, { "epoch": 0.6230345911949685, "grad_norm": 0.376953125, "learning_rate": 0.00017396611076796645, "loss": 0.8581, "step": 3170 }, { "epoch": 0.6240172955974843, "grad_norm": 0.30859375, "learning_rate": 0.00017385057138474063, "loss": 0.8625, "step": 3175 }, { "epoch": 0.625, "grad_norm": 0.361328125, "learning_rate": 0.0001737348147352846, "loss": 0.8294, "step": 3180 }, { "epoch": 0.6259827044025157, "grad_norm": 0.36328125, "learning_rate": 0.000173618841160151, "loss": 0.851, "step": 3185 }, { "epoch": 0.6269654088050315, "grad_norm": 0.306640625, "learning_rate": 0.00017350265100053074, "loss": 0.8578, "step": 3190 }, { "epoch": 0.6279481132075472, "grad_norm": 0.30859375, "learning_rate": 0.00017338624459825187, "loss": 0.8397, "step": 3195 }, { "epoch": 0.6289308176100629, "grad_norm": 0.369140625, "learning_rate": 0.00017326962229577867, "loss": 0.8468, "step": 3200 }, { "epoch": 0.6299135220125787, "grad_norm": 0.33203125, "learning_rate": 0.00017315278443621055, "loss": 0.8292, "step": 3205 }, { "epoch": 0.6308962264150944, "grad_norm": 0.298828125, "learning_rate": 0.0001730357313632811, "loss": 0.8393, "step": 3210 }, { "epoch": 0.63187893081761, "grad_norm": 0.326171875, "learning_rate": 0.00017291846342135697, "loss": 0.8371, "step": 3215 }, { "epoch": 0.6328616352201258, "grad_norm": 0.47265625, "learning_rate": 0.00017280098095543716, "loss": 0.8396, "step": 3220 }, { "epoch": 0.6338443396226415, "grad_norm": 0.365234375, "learning_rate": 0.00017268328431115155, "loss": 0.8234, "step": 3225 }, { "epoch": 0.6348270440251572, "grad_norm": 0.33984375, "learning_rate": 0.0001725653738347603, "loss": 0.8238, "step": 3230 }, { "epoch": 0.6358097484276729, "grad_norm": 0.5234375, "learning_rate": 0.00017244724987315255, "loss": 0.8451, "step": 3235 }, { "epoch": 0.6367924528301887, "grad_norm": 0.3125, "learning_rate": 0.00017232891277384562, "loss": 0.833, "step": 3240 }, { "epoch": 0.6377751572327044, "grad_norm": 0.376953125, "learning_rate": 0.0001722103628849838, "loss": 0.8267, "step": 3245 }, { "epoch": 0.6387578616352201, "grad_norm": 0.37890625, "learning_rate": 0.00017209160055533734, "loss": 0.8342, "step": 3250 }, { "epoch": 0.6397405660377359, "grad_norm": 0.333984375, "learning_rate": 0.00017197262613430158, "loss": 0.8224, "step": 3255 }, { "epoch": 0.6407232704402516, "grad_norm": 0.359375, "learning_rate": 0.00017185343997189588, "loss": 0.831, "step": 3260 }, { "epoch": 0.6417059748427673, "grad_norm": 0.5859375, "learning_rate": 0.00017173404241876237, "loss": 0.8337, "step": 3265 }, { "epoch": 0.6426886792452831, "grad_norm": 0.3046875, "learning_rate": 0.0001716144338261652, "loss": 0.8413, "step": 3270 }, { "epoch": 0.6436713836477987, "grad_norm": 0.3671875, "learning_rate": 0.0001714946145459894, "loss": 0.8254, "step": 3275 }, { "epoch": 0.6446540880503144, "grad_norm": 0.326171875, "learning_rate": 0.00017137458493073977, "loss": 0.8219, "step": 3280 }, { "epoch": 0.6456367924528302, "grad_norm": 0.4609375, "learning_rate": 0.00017125434533353992, "loss": 0.8461, "step": 3285 }, { "epoch": 0.6466194968553459, "grad_norm": 0.431640625, "learning_rate": 0.00017113389610813132, "loss": 0.8407, "step": 3290 }, { "epoch": 0.6476022012578616, "grad_norm": 0.3046875, "learning_rate": 0.000171013237608872, "loss": 0.8413, "step": 3295 }, { "epoch": 0.6485849056603774, "grad_norm": 0.3046875, "learning_rate": 0.00017089237019073578, "loss": 0.8479, "step": 3300 }, { "epoch": 0.6495676100628931, "grad_norm": 0.296875, "learning_rate": 0.0001707712942093111, "loss": 0.8321, "step": 3305 }, { "epoch": 0.6505503144654088, "grad_norm": 0.296875, "learning_rate": 0.00017065001002079995, "loss": 0.8345, "step": 3310 }, { "epoch": 0.6515330188679245, "grad_norm": 0.306640625, "learning_rate": 0.0001705285179820169, "loss": 0.8397, "step": 3315 }, { "epoch": 0.6525157232704403, "grad_norm": 0.34765625, "learning_rate": 0.00017040681845038798, "loss": 0.8148, "step": 3320 }, { "epoch": 0.653498427672956, "grad_norm": 0.298828125, "learning_rate": 0.00017028491178394965, "loss": 0.8239, "step": 3325 }, { "epoch": 0.6544811320754716, "grad_norm": 0.30859375, "learning_rate": 0.0001701627983413478, "loss": 0.8126, "step": 3330 }, { "epoch": 0.6554638364779874, "grad_norm": 0.3203125, "learning_rate": 0.0001700404784818366, "loss": 0.8538, "step": 3335 }, { "epoch": 0.6564465408805031, "grad_norm": 0.3125, "learning_rate": 0.00016991795256527756, "loss": 0.865, "step": 3340 }, { "epoch": 0.6574292452830188, "grad_norm": 0.55859375, "learning_rate": 0.00016979522095213832, "loss": 0.8402, "step": 3345 }, { "epoch": 0.6584119496855346, "grad_norm": 0.48046875, "learning_rate": 0.00016967228400349167, "loss": 0.8473, "step": 3350 }, { "epoch": 0.6593946540880503, "grad_norm": 0.3671875, "learning_rate": 0.0001695491420810146, "loss": 0.8289, "step": 3355 }, { "epoch": 0.660377358490566, "grad_norm": 0.31640625, "learning_rate": 0.00016942579554698708, "loss": 0.8312, "step": 3360 }, { "epoch": 0.6613600628930818, "grad_norm": 0.458984375, "learning_rate": 0.00016930224476429092, "loss": 0.8244, "step": 3365 }, { "epoch": 0.6623427672955975, "grad_norm": 0.384765625, "learning_rate": 0.00016917849009640904, "loss": 0.8364, "step": 3370 }, { "epoch": 0.6633254716981132, "grad_norm": 0.333984375, "learning_rate": 0.00016905453190742397, "loss": 0.8244, "step": 3375 }, { "epoch": 0.664308176100629, "grad_norm": 0.330078125, "learning_rate": 0.00016893037056201713, "loss": 0.8595, "step": 3380 }, { "epoch": 0.6652908805031447, "grad_norm": 0.349609375, "learning_rate": 0.00016880600642546763, "loss": 0.8357, "step": 3385 }, { "epoch": 0.6662735849056604, "grad_norm": 0.37109375, "learning_rate": 0.00016868143986365107, "loss": 0.8388, "step": 3390 }, { "epoch": 0.6672562893081762, "grad_norm": 0.302734375, "learning_rate": 0.00016855667124303865, "loss": 0.8117, "step": 3395 }, { "epoch": 0.6682389937106918, "grad_norm": 0.3828125, "learning_rate": 0.00016843170093069605, "loss": 0.8343, "step": 3400 }, { "epoch": 0.6692216981132075, "grad_norm": 0.318359375, "learning_rate": 0.00016830652929428224, "loss": 0.848, "step": 3405 }, { "epoch": 0.6702044025157232, "grad_norm": 0.3359375, "learning_rate": 0.00016818115670204863, "loss": 0.8264, "step": 3410 }, { "epoch": 0.671187106918239, "grad_norm": 0.498046875, "learning_rate": 0.00016805558352283768, "loss": 0.8364, "step": 3415 }, { "epoch": 0.6721698113207547, "grad_norm": 0.435546875, "learning_rate": 0.00016792981012608198, "loss": 0.8046, "step": 3420 }, { "epoch": 0.6731525157232704, "grad_norm": 0.375, "learning_rate": 0.00016780383688180323, "loss": 0.8455, "step": 3425 }, { "epoch": 0.6741352201257862, "grad_norm": 0.322265625, "learning_rate": 0.00016767766416061108, "loss": 0.8307, "step": 3430 }, { "epoch": 0.6751179245283019, "grad_norm": 0.306640625, "learning_rate": 0.00016755129233370197, "loss": 0.8364, "step": 3435 }, { "epoch": 0.6761006289308176, "grad_norm": 0.36328125, "learning_rate": 0.00016742472177285812, "loss": 0.834, "step": 3440 }, { "epoch": 0.6770833333333334, "grad_norm": 0.30859375, "learning_rate": 0.0001672979528504465, "loss": 0.8306, "step": 3445 }, { "epoch": 0.6780660377358491, "grad_norm": 0.388671875, "learning_rate": 0.00016717098593941752, "loss": 0.837, "step": 3450 }, { "epoch": 0.6790487421383647, "grad_norm": 0.41015625, "learning_rate": 0.00016704382141330415, "loss": 0.8207, "step": 3455 }, { "epoch": 0.6800314465408805, "grad_norm": 0.318359375, "learning_rate": 0.00016691645964622074, "loss": 0.8276, "step": 3460 }, { "epoch": 0.6810141509433962, "grad_norm": 0.3828125, "learning_rate": 0.00016678890101286186, "loss": 0.8383, "step": 3465 }, { "epoch": 0.6819968553459119, "grad_norm": 0.30859375, "learning_rate": 0.00016666114588850133, "loss": 0.8393, "step": 3470 }, { "epoch": 0.6829795597484277, "grad_norm": 0.33984375, "learning_rate": 0.00016653319464899103, "loss": 0.824, "step": 3475 }, { "epoch": 0.6839622641509434, "grad_norm": 0.37109375, "learning_rate": 0.0001664050476707597, "loss": 0.8223, "step": 3480 }, { "epoch": 0.6849449685534591, "grad_norm": 0.30859375, "learning_rate": 0.00016627670533081213, "loss": 0.8293, "step": 3485 }, { "epoch": 0.6859276729559748, "grad_norm": 0.31640625, "learning_rate": 0.00016614816800672764, "loss": 0.8357, "step": 3490 }, { "epoch": 0.6869103773584906, "grad_norm": 0.337890625, "learning_rate": 0.00016601943607665932, "loss": 0.8422, "step": 3495 }, { "epoch": 0.6878930817610063, "grad_norm": 0.326171875, "learning_rate": 0.0001658905099193328, "loss": 0.8289, "step": 3500 }, { "epoch": 0.688875786163522, "grad_norm": 0.447265625, "learning_rate": 0.00016576138991404506, "loss": 0.8014, "step": 3505 }, { "epoch": 0.6898584905660378, "grad_norm": 0.396484375, "learning_rate": 0.00016563207644066337, "loss": 0.8128, "step": 3510 }, { "epoch": 0.6908411949685535, "grad_norm": 0.39453125, "learning_rate": 0.00016550256987962425, "loss": 0.8319, "step": 3515 }, { "epoch": 0.6918238993710691, "grad_norm": 0.32421875, "learning_rate": 0.00016537287061193218, "loss": 0.8284, "step": 3520 }, { "epoch": 0.6928066037735849, "grad_norm": 0.326171875, "learning_rate": 0.00016524297901915867, "loss": 0.8536, "step": 3525 }, { "epoch": 0.6937893081761006, "grad_norm": 0.384765625, "learning_rate": 0.00016511289548344098, "loss": 0.8315, "step": 3530 }, { "epoch": 0.6947720125786163, "grad_norm": 0.298828125, "learning_rate": 0.0001649826203874811, "loss": 0.8381, "step": 3535 }, { "epoch": 0.6957547169811321, "grad_norm": 0.31640625, "learning_rate": 0.00016485215411454453, "loss": 0.8288, "step": 3540 }, { "epoch": 0.6967374213836478, "grad_norm": 0.30078125, "learning_rate": 0.00016472149704845927, "loss": 0.8327, "step": 3545 }, { "epoch": 0.6977201257861635, "grad_norm": 0.2890625, "learning_rate": 0.00016459064957361465, "loss": 0.8182, "step": 3550 }, { "epoch": 0.6987028301886793, "grad_norm": 0.310546875, "learning_rate": 0.00016445961207496004, "loss": 0.8309, "step": 3555 }, { "epoch": 0.699685534591195, "grad_norm": 0.310546875, "learning_rate": 0.00016432838493800401, "loss": 0.8236, "step": 3560 }, { "epoch": 0.7006682389937107, "grad_norm": 0.3125, "learning_rate": 0.00016419696854881298, "loss": 0.824, "step": 3565 }, { "epoch": 0.7016509433962265, "grad_norm": 0.419921875, "learning_rate": 0.00016406536329401008, "loss": 0.8053, "step": 3570 }, { "epoch": 0.7026336477987422, "grad_norm": 0.384765625, "learning_rate": 0.00016393356956077417, "loss": 0.8234, "step": 3575 }, { "epoch": 0.7036163522012578, "grad_norm": 0.43359375, "learning_rate": 0.00016380158773683862, "loss": 0.8286, "step": 3580 }, { "epoch": 0.7045990566037735, "grad_norm": 0.33203125, "learning_rate": 0.00016366941821049005, "loss": 0.8309, "step": 3585 }, { "epoch": 0.7055817610062893, "grad_norm": 0.33203125, "learning_rate": 0.00016353706137056735, "loss": 0.8514, "step": 3590 }, { "epoch": 0.706564465408805, "grad_norm": 0.3671875, "learning_rate": 0.00016340451760646054, "loss": 0.8247, "step": 3595 }, { "epoch": 0.7075471698113207, "grad_norm": 0.42578125, "learning_rate": 0.00016327178730810948, "loss": 0.8462, "step": 3600 }, { "epoch": 0.7085298742138365, "grad_norm": 0.4296875, "learning_rate": 0.00016313887086600286, "loss": 0.8386, "step": 3605 }, { "epoch": 0.7095125786163522, "grad_norm": 0.330078125, "learning_rate": 0.00016300576867117698, "loss": 0.8303, "step": 3610 }, { "epoch": 0.7104952830188679, "grad_norm": 0.31640625, "learning_rate": 0.0001628724811152146, "loss": 0.8145, "step": 3615 }, { "epoch": 0.7114779874213837, "grad_norm": 0.302734375, "learning_rate": 0.00016273900859024382, "loss": 0.8209, "step": 3620 }, { "epoch": 0.7124606918238994, "grad_norm": 0.330078125, "learning_rate": 0.00016260535148893702, "loss": 0.8262, "step": 3625 }, { "epoch": 0.7134433962264151, "grad_norm": 0.306640625, "learning_rate": 0.00016247151020450933, "loss": 0.8207, "step": 3630 }, { "epoch": 0.7144261006289309, "grad_norm": 0.349609375, "learning_rate": 0.00016233748513071804, "loss": 0.8523, "step": 3635 }, { "epoch": 0.7154088050314465, "grad_norm": 0.3046875, "learning_rate": 0.000162203276661861, "loss": 0.8252, "step": 3640 }, { "epoch": 0.7163915094339622, "grad_norm": 0.34375, "learning_rate": 0.0001620688851927756, "loss": 0.8101, "step": 3645 }, { "epoch": 0.717374213836478, "grad_norm": 0.412109375, "learning_rate": 0.00016193431111883756, "loss": 0.8187, "step": 3650 }, { "epoch": 0.7183569182389937, "grad_norm": 0.310546875, "learning_rate": 0.00016179955483596, "loss": 0.8209, "step": 3655 }, { "epoch": 0.7193396226415094, "grad_norm": 0.34765625, "learning_rate": 0.00016166461674059192, "loss": 0.8181, "step": 3660 }, { "epoch": 0.7203223270440252, "grad_norm": 0.314453125, "learning_rate": 0.00016152949722971727, "loss": 0.824, "step": 3665 }, { "epoch": 0.7213050314465409, "grad_norm": 0.478515625, "learning_rate": 0.00016139419670085372, "loss": 0.8294, "step": 3670 }, { "epoch": 0.7222877358490566, "grad_norm": 0.578125, "learning_rate": 0.00016125871555205148, "loss": 0.8181, "step": 3675 }, { "epoch": 0.7232704402515723, "grad_norm": 0.43359375, "learning_rate": 0.00016112305418189218, "loss": 0.8378, "step": 3680 }, { "epoch": 0.7242531446540881, "grad_norm": 0.294921875, "learning_rate": 0.00016098721298948756, "loss": 0.8201, "step": 3685 }, { "epoch": 0.7252358490566038, "grad_norm": 0.296875, "learning_rate": 0.00016085119237447848, "loss": 0.8445, "step": 3690 }, { "epoch": 0.7262185534591195, "grad_norm": 0.361328125, "learning_rate": 0.00016071499273703364, "loss": 0.8048, "step": 3695 }, { "epoch": 0.7272012578616353, "grad_norm": 0.32421875, "learning_rate": 0.0001605786144778484, "loss": 0.7989, "step": 3700 }, { "epoch": 0.7281839622641509, "grad_norm": 0.357421875, "learning_rate": 0.00016044205799814362, "loss": 0.8117, "step": 3705 }, { "epoch": 0.7291666666666666, "grad_norm": 0.310546875, "learning_rate": 0.00016030532369966448, "loss": 0.8252, "step": 3710 }, { "epoch": 0.7301493710691824, "grad_norm": 0.373046875, "learning_rate": 0.00016016841198467937, "loss": 0.8218, "step": 3715 }, { "epoch": 0.7311320754716981, "grad_norm": 0.435546875, "learning_rate": 0.00016003132325597842, "loss": 0.8254, "step": 3720 }, { "epoch": 0.7321147798742138, "grad_norm": 0.365234375, "learning_rate": 0.00015989405791687285, "loss": 0.827, "step": 3725 }, { "epoch": 0.7330974842767296, "grad_norm": 0.30859375, "learning_rate": 0.00015975661637119317, "loss": 0.8323, "step": 3730 }, { "epoch": 0.7340801886792453, "grad_norm": 0.3984375, "learning_rate": 0.00015961899902328845, "loss": 0.8223, "step": 3735 }, { "epoch": 0.735062893081761, "grad_norm": 0.3203125, "learning_rate": 0.0001594812062780249, "loss": 0.8, "step": 3740 }, { "epoch": 0.7360455974842768, "grad_norm": 0.32421875, "learning_rate": 0.0001593432385407848, "loss": 0.8112, "step": 3745 }, { "epoch": 0.7370283018867925, "grad_norm": 0.330078125, "learning_rate": 0.00015920509621746517, "loss": 0.8146, "step": 3750 }, { "epoch": 0.7380110062893082, "grad_norm": 0.380859375, "learning_rate": 0.00015906677971447674, "loss": 0.8404, "step": 3755 }, { "epoch": 0.7389937106918238, "grad_norm": 0.314453125, "learning_rate": 0.00015892828943874263, "loss": 0.8136, "step": 3760 }, { "epoch": 0.7399764150943396, "grad_norm": 0.30859375, "learning_rate": 0.00015878962579769716, "loss": 0.8105, "step": 3765 }, { "epoch": 0.7409591194968553, "grad_norm": 0.3203125, "learning_rate": 0.0001586507891992848, "loss": 0.8034, "step": 3770 }, { "epoch": 0.741941823899371, "grad_norm": 0.390625, "learning_rate": 0.00015851178005195867, "loss": 0.8042, "step": 3775 }, { "epoch": 0.7429245283018868, "grad_norm": 0.35546875, "learning_rate": 0.0001583725987646797, "loss": 0.83, "step": 3780 }, { "epoch": 0.7439072327044025, "grad_norm": 0.341796875, "learning_rate": 0.00015823324574691517, "loss": 0.8129, "step": 3785 }, { "epoch": 0.7448899371069182, "grad_norm": 0.34765625, "learning_rate": 0.00015809372140863763, "loss": 0.7918, "step": 3790 }, { "epoch": 0.745872641509434, "grad_norm": 0.35546875, "learning_rate": 0.00015795402616032358, "loss": 0.8354, "step": 3795 }, { "epoch": 0.7468553459119497, "grad_norm": 0.31640625, "learning_rate": 0.0001578141604129524, "loss": 0.8187, "step": 3800 }, { "epoch": 0.7478380503144654, "grad_norm": 0.337890625, "learning_rate": 0.00015767412457800504, "loss": 0.8346, "step": 3805 }, { "epoch": 0.7488207547169812, "grad_norm": 0.29296875, "learning_rate": 0.00015753391906746282, "loss": 0.8073, "step": 3810 }, { "epoch": 0.7498034591194969, "grad_norm": 0.3359375, "learning_rate": 0.0001573935442938063, "loss": 0.8113, "step": 3815 }, { "epoch": 0.7507861635220126, "grad_norm": 0.34765625, "learning_rate": 0.00015725300067001395, "loss": 0.8181, "step": 3820 }, { "epoch": 0.7517688679245284, "grad_norm": 0.34765625, "learning_rate": 0.00015711228860956102, "loss": 0.8136, "step": 3825 }, { "epoch": 0.752751572327044, "grad_norm": 0.310546875, "learning_rate": 0.00015697140852641834, "loss": 0.8214, "step": 3830 }, { "epoch": 0.7537342767295597, "grad_norm": 0.53515625, "learning_rate": 0.000156830360835051, "loss": 0.8166, "step": 3835 }, { "epoch": 0.7547169811320755, "grad_norm": 0.41796875, "learning_rate": 0.00015668914595041712, "loss": 0.8477, "step": 3840 }, { "epoch": 0.7556996855345912, "grad_norm": 0.40234375, "learning_rate": 0.00015654776428796686, "loss": 0.8105, "step": 3845 }, { "epoch": 0.7566823899371069, "grad_norm": 0.298828125, "learning_rate": 0.00015640621626364094, "loss": 0.8393, "step": 3850 }, { "epoch": 0.7576650943396226, "grad_norm": 0.427734375, "learning_rate": 0.00015626450229386948, "loss": 0.8025, "step": 3855 }, { "epoch": 0.7586477987421384, "grad_norm": 0.3046875, "learning_rate": 0.00015612262279557094, "loss": 0.833, "step": 3860 }, { "epoch": 0.7596305031446541, "grad_norm": 0.376953125, "learning_rate": 0.0001559805781861506, "loss": 0.8165, "step": 3865 }, { "epoch": 0.7606132075471698, "grad_norm": 0.376953125, "learning_rate": 0.0001558383688834996, "loss": 0.8155, "step": 3870 }, { "epoch": 0.7615959119496856, "grad_norm": 0.326171875, "learning_rate": 0.0001556959953059935, "loss": 0.8105, "step": 3875 }, { "epoch": 0.7625786163522013, "grad_norm": 0.5859375, "learning_rate": 0.00015555345787249128, "loss": 0.8203, "step": 3880 }, { "epoch": 0.7635613207547169, "grad_norm": 0.38671875, "learning_rate": 0.00015541075700233395, "loss": 0.8226, "step": 3885 }, { "epoch": 0.7645440251572327, "grad_norm": 0.49609375, "learning_rate": 0.0001552678931153432, "loss": 0.8373, "step": 3890 }, { "epoch": 0.7655267295597484, "grad_norm": 0.328125, "learning_rate": 0.0001551248666318206, "loss": 0.8278, "step": 3895 }, { "epoch": 0.7665094339622641, "grad_norm": 0.29296875, "learning_rate": 0.0001549816779725457, "loss": 0.8169, "step": 3900 }, { "epoch": 0.7674921383647799, "grad_norm": 0.5, "learning_rate": 0.0001548383275587755, "loss": 0.8137, "step": 3905 }, { "epoch": 0.7684748427672956, "grad_norm": 0.451171875, "learning_rate": 0.00015469481581224272, "loss": 0.8154, "step": 3910 }, { "epoch": 0.7694575471698113, "grad_norm": 0.361328125, "learning_rate": 0.0001545511431551547, "loss": 0.8084, "step": 3915 }, { "epoch": 0.7704402515723271, "grad_norm": 0.35546875, "learning_rate": 0.0001544073100101922, "loss": 0.8502, "step": 3920 }, { "epoch": 0.7714229559748428, "grad_norm": 0.4375, "learning_rate": 0.00015426331680050824, "loss": 0.8345, "step": 3925 }, { "epoch": 0.7724056603773585, "grad_norm": 0.388671875, "learning_rate": 0.00015411916394972655, "loss": 0.7933, "step": 3930 }, { "epoch": 0.7733883647798742, "grad_norm": 0.35546875, "learning_rate": 0.00015397485188194064, "loss": 0.8072, "step": 3935 }, { "epoch": 0.77437106918239, "grad_norm": 0.34765625, "learning_rate": 0.00015383038102171248, "loss": 0.7954, "step": 3940 }, { "epoch": 0.7753537735849056, "grad_norm": 0.31640625, "learning_rate": 0.00015368575179407104, "loss": 0.8003, "step": 3945 }, { "epoch": 0.7763364779874213, "grad_norm": 0.333984375, "learning_rate": 0.00015354096462451134, "loss": 0.8056, "step": 3950 }, { "epoch": 0.7773191823899371, "grad_norm": 0.3046875, "learning_rate": 0.00015339601993899304, "loss": 0.8217, "step": 3955 }, { "epoch": 0.7783018867924528, "grad_norm": 0.310546875, "learning_rate": 0.00015325091816393912, "loss": 0.8441, "step": 3960 }, { "epoch": 0.7792845911949685, "grad_norm": 0.3203125, "learning_rate": 0.00015310565972623483, "loss": 0.8003, "step": 3965 }, { "epoch": 0.7802672955974843, "grad_norm": 0.328125, "learning_rate": 0.00015296024505322625, "loss": 0.8123, "step": 3970 }, { "epoch": 0.78125, "grad_norm": 0.353515625, "learning_rate": 0.00015281467457271909, "loss": 0.8415, "step": 3975 }, { "epoch": 0.7822327044025157, "grad_norm": 0.310546875, "learning_rate": 0.0001526689487129775, "loss": 0.813, "step": 3980 }, { "epoch": 0.7832154088050315, "grad_norm": 0.318359375, "learning_rate": 0.00015252306790272267, "loss": 0.8125, "step": 3985 }, { "epoch": 0.7841981132075472, "grad_norm": 0.376953125, "learning_rate": 0.00015237703257113173, "loss": 0.8013, "step": 3990 }, { "epoch": 0.7851808176100629, "grad_norm": 0.423828125, "learning_rate": 0.0001522308431478364, "loss": 0.812, "step": 3995 }, { "epoch": 0.7861635220125787, "grad_norm": 0.43359375, "learning_rate": 0.00015208450006292163, "loss": 0.8184, "step": 4000 }, { "epoch": 0.7871462264150944, "grad_norm": 0.34765625, "learning_rate": 0.00015193800374692457, "loss": 0.8145, "step": 4005 }, { "epoch": 0.78812893081761, "grad_norm": 0.349609375, "learning_rate": 0.0001517913546308331, "loss": 0.8209, "step": 4010 }, { "epoch": 0.7891116352201258, "grad_norm": 0.3359375, "learning_rate": 0.00015164455314608467, "loss": 0.8417, "step": 4015 }, { "epoch": 0.7900943396226415, "grad_norm": 0.345703125, "learning_rate": 0.0001514975997245649, "loss": 0.8193, "step": 4020 }, { "epoch": 0.7910770440251572, "grad_norm": 0.349609375, "learning_rate": 0.00015135049479860657, "loss": 0.8271, "step": 4025 }, { "epoch": 0.7920597484276729, "grad_norm": 0.33984375, "learning_rate": 0.00015120323880098803, "loss": 0.8001, "step": 4030 }, { "epoch": 0.7930424528301887, "grad_norm": 0.330078125, "learning_rate": 0.00015105583216493216, "loss": 0.793, "step": 4035 }, { "epoch": 0.7940251572327044, "grad_norm": 0.314453125, "learning_rate": 0.00015090827532410499, "loss": 0.7943, "step": 4040 }, { "epoch": 0.7950078616352201, "grad_norm": 0.33984375, "learning_rate": 0.00015076056871261444, "loss": 0.8088, "step": 4045 }, { "epoch": 0.7959905660377359, "grad_norm": 0.380859375, "learning_rate": 0.00015061271276500904, "loss": 0.8059, "step": 4050 }, { "epoch": 0.7969732704402516, "grad_norm": 0.314453125, "learning_rate": 0.00015046470791627668, "loss": 0.8052, "step": 4055 }, { "epoch": 0.7979559748427673, "grad_norm": 0.318359375, "learning_rate": 0.00015031655460184337, "loss": 0.7997, "step": 4060 }, { "epoch": 0.7989386792452831, "grad_norm": 0.369140625, "learning_rate": 0.00015016825325757182, "loss": 0.8233, "step": 4065 }, { "epoch": 0.7999213836477987, "grad_norm": 0.3671875, "learning_rate": 0.00015001980431976022, "loss": 0.8133, "step": 4070 }, { "epoch": 0.8009040880503144, "grad_norm": 0.2890625, "learning_rate": 0.0001498712082251411, "loss": 0.8027, "step": 4075 }, { "epoch": 0.8018867924528302, "grad_norm": 0.36328125, "learning_rate": 0.00014972246541087978, "loss": 0.8174, "step": 4080 }, { "epoch": 0.8028694968553459, "grad_norm": 0.515625, "learning_rate": 0.00014957357631457333, "loss": 0.8162, "step": 4085 }, { "epoch": 0.8038522012578616, "grad_norm": 0.310546875, "learning_rate": 0.00014942454137424914, "loss": 0.8031, "step": 4090 }, { "epoch": 0.8048349056603774, "grad_norm": 0.451171875, "learning_rate": 0.00014927536102836357, "loss": 0.8032, "step": 4095 }, { "epoch": 0.8058176100628931, "grad_norm": 0.48828125, "learning_rate": 0.00014912603571580097, "loss": 0.826, "step": 4100 }, { "epoch": 0.8068003144654088, "grad_norm": 0.4453125, "learning_rate": 0.00014897656587587198, "loss": 0.791, "step": 4105 }, { "epoch": 0.8077830188679245, "grad_norm": 0.337890625, "learning_rate": 0.00014882695194831256, "loss": 0.8137, "step": 4110 }, { "epoch": 0.8087657232704403, "grad_norm": 0.353515625, "learning_rate": 0.00014867719437328252, "loss": 0.8015, "step": 4115 }, { "epoch": 0.809748427672956, "grad_norm": 0.361328125, "learning_rate": 0.00014852729359136432, "loss": 0.8189, "step": 4120 }, { "epoch": 0.8107311320754716, "grad_norm": 0.314453125, "learning_rate": 0.0001483772500435616, "loss": 0.8096, "step": 4125 }, { "epoch": 0.8117138364779874, "grad_norm": 0.32421875, "learning_rate": 0.0001482270641712982, "loss": 0.8014, "step": 4130 }, { "epoch": 0.8126965408805031, "grad_norm": 0.322265625, "learning_rate": 0.00014807673641641653, "loss": 0.8062, "step": 4135 }, { "epoch": 0.8136792452830188, "grad_norm": 0.3125, "learning_rate": 0.0001479262672211765, "loss": 0.8215, "step": 4140 }, { "epoch": 0.8146619496855346, "grad_norm": 0.384765625, "learning_rate": 0.00014777565702825407, "loss": 0.8257, "step": 4145 }, { "epoch": 0.8156446540880503, "grad_norm": 0.48828125, "learning_rate": 0.00014762490628074005, "loss": 0.815, "step": 4150 }, { "epoch": 0.816627358490566, "grad_norm": 0.41796875, "learning_rate": 0.00014747401542213875, "loss": 0.8213, "step": 4155 }, { "epoch": 0.8176100628930818, "grad_norm": 0.310546875, "learning_rate": 0.0001473229848963667, "loss": 0.8136, "step": 4160 }, { "epoch": 0.8185927672955975, "grad_norm": 0.34375, "learning_rate": 0.00014717181514775128, "loss": 0.796, "step": 4165 }, { "epoch": 0.8195754716981132, "grad_norm": 0.314453125, "learning_rate": 0.00014702050662102948, "loss": 0.8168, "step": 4170 }, { "epoch": 0.820558176100629, "grad_norm": 0.3359375, "learning_rate": 0.00014686905976134663, "loss": 0.8266, "step": 4175 }, { "epoch": 0.8215408805031447, "grad_norm": 0.30859375, "learning_rate": 0.00014671747501425497, "loss": 0.7863, "step": 4180 }, { "epoch": 0.8225235849056604, "grad_norm": 0.349609375, "learning_rate": 0.00014656575282571234, "loss": 0.8031, "step": 4185 }, { "epoch": 0.8235062893081762, "grad_norm": 0.3359375, "learning_rate": 0.00014641389364208107, "loss": 0.7824, "step": 4190 }, { "epoch": 0.8244889937106918, "grad_norm": 0.330078125, "learning_rate": 0.00014626189791012647, "loss": 0.8233, "step": 4195 }, { "epoch": 0.8254716981132075, "grad_norm": 0.3203125, "learning_rate": 0.0001461097660770155, "loss": 0.8177, "step": 4200 }, { "epoch": 0.8264544025157232, "grad_norm": 0.3671875, "learning_rate": 0.00014595749859031557, "loss": 0.8038, "step": 4205 }, { "epoch": 0.827437106918239, "grad_norm": 0.3828125, "learning_rate": 0.00014580509589799329, "loss": 0.8032, "step": 4210 }, { "epoch": 0.8284198113207547, "grad_norm": 0.32421875, "learning_rate": 0.00014565255844841286, "loss": 0.8246, "step": 4215 }, { "epoch": 0.8294025157232704, "grad_norm": 0.32421875, "learning_rate": 0.000145499886690335, "loss": 0.7918, "step": 4220 }, { "epoch": 0.8303852201257862, "grad_norm": 0.48828125, "learning_rate": 0.00014534708107291565, "loss": 0.8305, "step": 4225 }, { "epoch": 0.8313679245283019, "grad_norm": 0.294921875, "learning_rate": 0.00014519414204570446, "loss": 0.8086, "step": 4230 }, { "epoch": 0.8323506289308176, "grad_norm": 0.3984375, "learning_rate": 0.00014504107005864353, "loss": 0.7889, "step": 4235 }, { "epoch": 0.8333333333333334, "grad_norm": 0.3203125, "learning_rate": 0.00014488786556206626, "loss": 0.8268, "step": 4240 }, { "epoch": 0.8343160377358491, "grad_norm": 0.345703125, "learning_rate": 0.0001447345290066958, "loss": 0.8001, "step": 4245 }, { "epoch": 0.8352987421383647, "grad_norm": 0.3046875, "learning_rate": 0.00014458106084364383, "loss": 0.8205, "step": 4250 }, { "epoch": 0.8362814465408805, "grad_norm": 0.390625, "learning_rate": 0.0001444274615244092, "loss": 0.8076, "step": 4255 }, { "epoch": 0.8372641509433962, "grad_norm": 0.51171875, "learning_rate": 0.00014427373150087663, "loss": 0.8025, "step": 4260 }, { "epoch": 0.8382468553459119, "grad_norm": 0.318359375, "learning_rate": 0.00014411987122531542, "loss": 0.7983, "step": 4265 }, { "epoch": 0.8392295597484277, "grad_norm": 0.388671875, "learning_rate": 0.000143965881150378, "loss": 0.8006, "step": 4270 }, { "epoch": 0.8402122641509434, "grad_norm": 0.66015625, "learning_rate": 0.00014381176172909862, "loss": 0.8044, "step": 4275 }, { "epoch": 0.8411949685534591, "grad_norm": 0.49609375, "learning_rate": 0.00014365751341489222, "loss": 0.8143, "step": 4280 }, { "epoch": 0.8421776729559748, "grad_norm": 0.435546875, "learning_rate": 0.0001435031366615528, "loss": 0.7867, "step": 4285 }, { "epoch": 0.8431603773584906, "grad_norm": 0.30859375, "learning_rate": 0.00014334863192325226, "loss": 0.8115, "step": 4290 }, { "epoch": 0.8441430817610063, "grad_norm": 0.30859375, "learning_rate": 0.00014319399965453911, "loss": 0.8234, "step": 4295 }, { "epoch": 0.845125786163522, "grad_norm": 0.40234375, "learning_rate": 0.00014303924031033692, "loss": 0.8317, "step": 4300 }, { "epoch": 0.8461084905660378, "grad_norm": 0.359375, "learning_rate": 0.00014288435434594315, "loss": 0.8321, "step": 4305 }, { "epoch": 0.8470911949685535, "grad_norm": 0.322265625, "learning_rate": 0.00014272934221702788, "loss": 0.8127, "step": 4310 }, { "epoch": 0.8480738993710691, "grad_norm": 0.310546875, "learning_rate": 0.00014257420437963222, "loss": 0.801, "step": 4315 }, { "epoch": 0.8490566037735849, "grad_norm": 0.349609375, "learning_rate": 0.00014241894129016718, "loss": 0.8139, "step": 4320 }, { "epoch": 0.8500393081761006, "grad_norm": 0.33203125, "learning_rate": 0.00014226355340541224, "loss": 0.7854, "step": 4325 }, { "epoch": 0.8510220125786163, "grad_norm": 0.3125, "learning_rate": 0.00014210804118251405, "loss": 0.8163, "step": 4330 }, { "epoch": 0.8520047169811321, "grad_norm": 0.37890625, "learning_rate": 0.00014195240507898504, "loss": 0.829, "step": 4335 }, { "epoch": 0.8529874213836478, "grad_norm": 0.396484375, "learning_rate": 0.00014179664555270206, "loss": 0.7911, "step": 4340 }, { "epoch": 0.8539701257861635, "grad_norm": 0.322265625, "learning_rate": 0.00014164076306190517, "loss": 0.8278, "step": 4345 }, { "epoch": 0.8549528301886793, "grad_norm": 0.314453125, "learning_rate": 0.00014148475806519603, "loss": 0.8158, "step": 4350 }, { "epoch": 0.855935534591195, "grad_norm": 0.390625, "learning_rate": 0.00014132863102153683, "loss": 0.8281, "step": 4355 }, { "epoch": 0.8569182389937107, "grad_norm": 0.427734375, "learning_rate": 0.00014117238239024887, "loss": 0.8236, "step": 4360 }, { "epoch": 0.8579009433962265, "grad_norm": 0.306640625, "learning_rate": 0.00014101601263101095, "loss": 0.7968, "step": 4365 }, { "epoch": 0.8588836477987422, "grad_norm": 0.33203125, "learning_rate": 0.00014085952220385838, "loss": 0.8183, "step": 4370 }, { "epoch": 0.8598663522012578, "grad_norm": 0.3125, "learning_rate": 0.0001407029115691815, "loss": 0.7929, "step": 4375 }, { "epoch": 0.8608490566037735, "grad_norm": 0.34765625, "learning_rate": 0.00014054618118772416, "loss": 0.8169, "step": 4380 }, { "epoch": 0.8618317610062893, "grad_norm": 0.359375, "learning_rate": 0.00014038933152058262, "loss": 0.7911, "step": 4385 }, { "epoch": 0.862814465408805, "grad_norm": 0.349609375, "learning_rate": 0.000140232363029204, "loss": 0.8171, "step": 4390 }, { "epoch": 0.8637971698113207, "grad_norm": 0.3359375, "learning_rate": 0.00014007527617538508, "loss": 0.8039, "step": 4395 }, { "epoch": 0.8647798742138365, "grad_norm": 0.314453125, "learning_rate": 0.0001399180714212708, "loss": 0.82, "step": 4400 }, { "epoch": 0.8657625786163522, "grad_norm": 0.330078125, "learning_rate": 0.0001397607492293529, "loss": 0.8094, "step": 4405 }, { "epoch": 0.8667452830188679, "grad_norm": 0.345703125, "learning_rate": 0.00013960331006246878, "loss": 0.8232, "step": 4410 }, { "epoch": 0.8677279874213837, "grad_norm": 0.34765625, "learning_rate": 0.00013944575438379984, "loss": 0.8301, "step": 4415 }, { "epoch": 0.8687106918238994, "grad_norm": 0.31640625, "learning_rate": 0.00013928808265687028, "loss": 0.8081, "step": 4420 }, { "epoch": 0.8696933962264151, "grad_norm": 0.326171875, "learning_rate": 0.00013913029534554574, "loss": 0.7712, "step": 4425 }, { "epoch": 0.8706761006289309, "grad_norm": 0.35546875, "learning_rate": 0.00013897239291403191, "loss": 0.8289, "step": 4430 }, { "epoch": 0.8716588050314465, "grad_norm": 0.375, "learning_rate": 0.00013881437582687314, "loss": 0.7992, "step": 4435 }, { "epoch": 0.8726415094339622, "grad_norm": 0.41015625, "learning_rate": 0.0001386562445489511, "loss": 0.8034, "step": 4440 }, { "epoch": 0.873624213836478, "grad_norm": 0.4765625, "learning_rate": 0.0001384979995454834, "loss": 0.7996, "step": 4445 }, { "epoch": 0.8746069182389937, "grad_norm": 0.474609375, "learning_rate": 0.00013833964128202224, "loss": 0.8168, "step": 4450 }, { "epoch": 0.8755896226415094, "grad_norm": 0.349609375, "learning_rate": 0.00013818117022445297, "loss": 0.7952, "step": 4455 }, { "epoch": 0.8765723270440252, "grad_norm": 0.353515625, "learning_rate": 0.0001380225868389929, "loss": 0.8389, "step": 4460 }, { "epoch": 0.8775550314465409, "grad_norm": 0.322265625, "learning_rate": 0.0001378638915921897, "loss": 0.7979, "step": 4465 }, { "epoch": 0.8785377358490566, "grad_norm": 0.37109375, "learning_rate": 0.00013770508495092014, "loss": 0.8099, "step": 4470 }, { "epoch": 0.8795204402515723, "grad_norm": 0.314453125, "learning_rate": 0.00013754616738238877, "loss": 0.7925, "step": 4475 }, { "epoch": 0.8805031446540881, "grad_norm": 0.427734375, "learning_rate": 0.00013738713935412643, "loss": 0.8164, "step": 4480 }, { "epoch": 0.8814858490566038, "grad_norm": 0.4296875, "learning_rate": 0.00013722800133398897, "loss": 0.8104, "step": 4485 }, { "epoch": 0.8824685534591195, "grad_norm": 0.3125, "learning_rate": 0.00013706875379015577, "loss": 0.8087, "step": 4490 }, { "epoch": 0.8834512578616353, "grad_norm": 0.515625, "learning_rate": 0.0001369093971911285, "loss": 0.8037, "step": 4495 }, { "epoch": 0.8844339622641509, "grad_norm": 0.443359375, "learning_rate": 0.00013674993200572962, "loss": 0.8024, "step": 4500 }, { "epoch": 0.8854166666666666, "grad_norm": 0.43359375, "learning_rate": 7.893462771773996e-06, "loss": 0.782, "step": 4505 }, { "epoch": 0.8863993710691824, "grad_norm": 0.337890625, "learning_rate": 7.760421092313152e-06, "loss": 0.7891, "step": 4510 }, { "epoch": 0.8873820754716981, "grad_norm": 0.326171875, "learning_rate": 7.628464876673202e-06, "loss": 0.8201, "step": 4515 }, { "epoch": 0.8883647798742138, "grad_norm": 0.3203125, "learning_rate": 7.497595677698388e-06, "loss": 0.8031, "step": 4520 }, { "epoch": 0.8893474842767296, "grad_norm": 0.32421875, "learning_rate": 7.3678150354410615e-06, "loss": 0.8013, "step": 4525 }, { "epoch": 0.8903301886792453, "grad_norm": 0.326171875, "learning_rate": 7.239124477143578e-06, "loss": 0.8075, "step": 4530 }, { "epoch": 0.891312893081761, "grad_norm": 0.310546875, "learning_rate": 7.111525517220308e-06, "loss": 0.7919, "step": 4535 }, { "epoch": 0.8922955974842768, "grad_norm": 0.298828125, "learning_rate": 6.985019657239867e-06, "loss": 0.8074, "step": 4540 }, { "epoch": 0.8932783018867925, "grad_norm": 0.314453125, "learning_rate": 6.859608385907379e-06, "loss": 0.8009, "step": 4545 }, { "epoch": 0.8942610062893082, "grad_norm": 0.310546875, "learning_rate": 6.735293179046975e-06, "loss": 0.8081, "step": 4550 }, { "epoch": 0.8952437106918238, "grad_norm": 0.3046875, "learning_rate": 6.612075499584458e-06, "loss": 0.8067, "step": 4555 }, { "epoch": 0.8962264150943396, "grad_norm": 0.3125, "learning_rate": 6.489956797530084e-06, "loss": 0.811, "step": 4560 }, { "epoch": 0.8972091194968553, "grad_norm": 0.30078125, "learning_rate": 6.368938509961398e-06, "loss": 0.7966, "step": 4565 }, { "epoch": 0.898191823899371, "grad_norm": 0.328125, "learning_rate": 6.2490220610065155e-06, "loss": 0.8123, "step": 4570 }, { "epoch": 0.8991745283018868, "grad_norm": 0.30859375, "learning_rate": 6.130208861827202e-06, "loss": 0.8045, "step": 4575 }, { "epoch": 0.9001572327044025, "grad_norm": 0.302734375, "learning_rate": 6.012500310602254e-06, "loss": 0.7923, "step": 4580 }, { "epoch": 0.9011399371069182, "grad_norm": 0.30859375, "learning_rate": 5.8958977925112405e-06, "loss": 0.7986, "step": 4585 }, { "epoch": 0.902122641509434, "grad_norm": 0.322265625, "learning_rate": 5.780402679717989e-06, "loss": 0.8166, "step": 4590 }, { "epoch": 0.9031053459119497, "grad_norm": 0.298828125, "learning_rate": 5.666016331354485e-06, "loss": 0.7845, "step": 4595 }, { "epoch": 0.9040880503144654, "grad_norm": 0.330078125, "learning_rate": 5.552740093505015e-06, "loss": 0.7865, "step": 4600 }, { "epoch": 0.9050707547169812, "grad_norm": 0.30859375, "learning_rate": 5.440575299190165e-06, "loss": 0.8243, "step": 4605 }, { "epoch": 0.9060534591194969, "grad_norm": 0.31640625, "learning_rate": 5.329523268351155e-06, "loss": 0.8041, "step": 4610 }, { "epoch": 0.9070361635220126, "grad_norm": 0.310546875, "learning_rate": 5.219585307834407e-06, "loss": 0.8057, "step": 4615 }, { "epoch": 0.9080188679245284, "grad_norm": 0.294921875, "learning_rate": 5.110762711376116e-06, "loss": 0.7987, "step": 4620 }, { "epoch": 0.909001572327044, "grad_norm": 0.326171875, "learning_rate": 5.003056759586944e-06, "loss": 0.7983, "step": 4625 }, { "epoch": 0.9099842767295597, "grad_norm": 0.3203125, "learning_rate": 4.89646871993703e-06, "loss": 0.7872, "step": 4630 }, { "epoch": 0.9109669811320755, "grad_norm": 0.31640625, "learning_rate": 4.79099984674114e-06, "loss": 0.8203, "step": 4635 }, { "epoch": 0.9119496855345912, "grad_norm": 0.318359375, "learning_rate": 4.6866513811437475e-06, "loss": 0.7816, "step": 4640 }, { "epoch": 0.9129323899371069, "grad_norm": 0.30078125, "learning_rate": 4.58342455110452e-06, "loss": 0.8151, "step": 4645 }, { "epoch": 0.9139150943396226, "grad_norm": 0.29296875, "learning_rate": 4.481320571383907e-06, "loss": 0.8052, "step": 4650 }, { "epoch": 0.9148977987421384, "grad_norm": 0.31640625, "learning_rate": 4.380340643528735e-06, "loss": 0.8069, "step": 4655 }, { "epoch": 0.9158805031446541, "grad_norm": 0.328125, "learning_rate": 4.280485955858171e-06, "loss": 0.7986, "step": 4660 }, { "epoch": 0.9168632075471698, "grad_norm": 0.310546875, "learning_rate": 4.181757683449694e-06, "loss": 0.8219, "step": 4665 }, { "epoch": 0.9178459119496856, "grad_norm": 0.30859375, "learning_rate": 4.084156988125231e-06, "loss": 0.8162, "step": 4670 }, { "epoch": 0.9188286163522013, "grad_norm": 0.314453125, "learning_rate": 3.987685018437581e-06, "loss": 0.7972, "step": 4675 }, { "epoch": 0.9198113207547169, "grad_norm": 0.30859375, "learning_rate": 3.892342909656776e-06, "loss": 0.8163, "step": 4680 }, { "epoch": 0.9207940251572327, "grad_norm": 0.310546875, "learning_rate": 3.798131783756853e-06, "loss": 0.8151, "step": 4685 }, { "epoch": 0.9217767295597484, "grad_norm": 0.310546875, "learning_rate": 3.7050527494025265e-06, "loss": 0.8023, "step": 4690 }, { "epoch": 0.9227594339622641, "grad_norm": 0.322265625, "learning_rate": 3.6131069019362362e-06, "loss": 0.8229, "step": 4695 }, { "epoch": 0.9237421383647799, "grad_norm": 0.302734375, "learning_rate": 3.52229532336521e-06, "loss": 0.7951, "step": 4700 }, { "epoch": 0.9247248427672956, "grad_norm": 0.314453125, "learning_rate": 3.4326190823487315e-06, "loss": 0.8034, "step": 4705 }, { "epoch": 0.9257075471698113, "grad_norm": 0.30859375, "learning_rate": 3.344079234185604e-06, "loss": 0.807, "step": 4710 }, { "epoch": 0.9266902515723271, "grad_norm": 0.306640625, "learning_rate": 3.2566768208016297e-06, "loss": 0.8122, "step": 4715 }, { "epoch": 0.9276729559748428, "grad_norm": 0.30859375, "learning_rate": 3.170412870737516e-06, "loss": 0.8023, "step": 4720 }, { "epoch": 0.9286556603773585, "grad_norm": 0.3046875, "learning_rate": 3.0852883991366322e-06, "loss": 0.7757, "step": 4725 }, { "epoch": 0.9296383647798742, "grad_norm": 0.306640625, "learning_rate": 3.0013044077330744e-06, "loss": 0.7709, "step": 4730 }, { "epoch": 0.93062106918239, "grad_norm": 0.322265625, "learning_rate": 2.9184618848399627e-06, "loss": 0.8331, "step": 4735 }, { "epoch": 0.9316037735849056, "grad_norm": 0.3125, "learning_rate": 2.836761805337762e-06, "loss": 0.7819, "step": 4740 }, { "epoch": 0.9325864779874213, "grad_norm": 0.33984375, "learning_rate": 2.756205130662737e-06, "loss": 0.7949, "step": 4745 }, { "epoch": 0.9335691823899371, "grad_norm": 0.31640625, "learning_rate": 2.6767928087957693e-06, "loss": 0.8147, "step": 4750 }, { "epoch": 0.9345518867924528, "grad_norm": 0.30078125, "learning_rate": 2.598525774251159e-06, "loss": 0.7786, "step": 4755 }, { "epoch": 0.9355345911949685, "grad_norm": 0.302734375, "learning_rate": 2.52140494806552e-06, "loss": 0.7954, "step": 4760 }, { "epoch": 0.9365172955974843, "grad_norm": 0.30859375, "learning_rate": 2.44543123778711e-06, "loss": 0.7851, "step": 4765 }, { "epoch": 0.9375, "grad_norm": 0.3046875, "learning_rate": 2.370605537465065e-06, "loss": 0.81, "step": 4770 }, { "epoch": 0.9384827044025157, "grad_norm": 0.302734375, "learning_rate": 2.296928727638814e-06, "loss": 0.8305, "step": 4775 }, { "epoch": 0.9394654088050315, "grad_norm": 0.3046875, "learning_rate": 2.2244016753278586e-06, "loss": 0.7896, "step": 4780 }, { "epoch": 0.9404481132075472, "grad_norm": 0.3046875, "learning_rate": 2.1530252340214996e-06, "loss": 0.8101, "step": 4785 }, { "epoch": 0.9414308176100629, "grad_norm": 0.31640625, "learning_rate": 2.0828002436687257e-06, "loss": 0.805, "step": 4790 }, { "epoch": 0.9424135220125787, "grad_norm": 0.310546875, "learning_rate": 2.013727530668452e-06, "loss": 0.804, "step": 4795 }, { "epoch": 0.9433962264150944, "grad_norm": 0.314453125, "learning_rate": 1.9458079078597203e-06, "loss": 0.825, "step": 4800 }, { "epoch": 0.94437893081761, "grad_norm": 0.3046875, "learning_rate": 1.8790421745121356e-06, "loss": 0.821, "step": 4805 }, { "epoch": 0.9453616352201258, "grad_norm": 0.310546875, "learning_rate": 1.813431116316522e-06, "loss": 0.8101, "step": 4810 }, { "epoch": 0.9463443396226415, "grad_norm": 0.30859375, "learning_rate": 1.748975505375583e-06, "loss": 0.8016, "step": 4815 }, { "epoch": 0.9473270440251572, "grad_norm": 0.296875, "learning_rate": 1.6856761001948772e-06, "loss": 0.7847, "step": 4820 }, { "epoch": 0.9483097484276729, "grad_norm": 0.3203125, "learning_rate": 1.6235336456739026e-06, "loss": 0.8007, "step": 4825 }, { "epoch": 0.9492924528301887, "grad_norm": 0.310546875, "learning_rate": 1.5625488730972693e-06, "loss": 0.7891, "step": 4830 }, { "epoch": 0.9502751572327044, "grad_norm": 0.30859375, "learning_rate": 1.5027225001261525e-06, "loss": 0.8244, "step": 4835 }, { "epoch": 0.9512578616352201, "grad_norm": 0.298828125, "learning_rate": 1.4440552307898202e-06, "loss": 0.7962, "step": 4840 }, { "epoch": 0.9522405660377359, "grad_norm": 0.306640625, "learning_rate": 1.386547755477363e-06, "loss": 0.7982, "step": 4845 }, { "epoch": 0.9532232704402516, "grad_norm": 0.318359375, "learning_rate": 1.3302007509295445e-06, "loss": 0.7896, "step": 4850 }, { "epoch": 0.9542059748427673, "grad_norm": 0.310546875, "learning_rate": 1.2750148802308737e-06, "loss": 0.8158, "step": 4855 }, { "epoch": 0.9551886792452831, "grad_norm": 0.3125, "learning_rate": 1.2209907928017795e-06, "loss": 0.8012, "step": 4860 }, { "epoch": 0.9561713836477987, "grad_norm": 0.310546875, "learning_rate": 1.1681291243909153e-06, "loss": 0.8146, "step": 4865 }, { "epoch": 0.9571540880503144, "grad_norm": 0.330078125, "learning_rate": 1.116430497067833e-06, "loss": 0.8175, "step": 4870 }, { "epoch": 0.9581367924528302, "grad_norm": 0.3125, "learning_rate": 1.0658955192154763e-06, "loss": 0.7937, "step": 4875 }, { "epoch": 0.9591194968553459, "grad_norm": 0.3125, "learning_rate": 1.0165247855231542e-06, "loss": 0.8, "step": 4880 }, { "epoch": 0.9601022012578616, "grad_norm": 0.314453125, "learning_rate": 9.683188769794792e-07, "loss": 0.8042, "step": 4885 }, { "epoch": 0.9610849056603774, "grad_norm": 0.298828125, "learning_rate": 9.212783608655518e-07, "loss": 0.8078, "step": 4890 }, { "epoch": 0.9620676100628931, "grad_norm": 0.31640625, "learning_rate": 8.754037907482748e-07, "loss": 0.7992, "step": 4895 }, { "epoch": 0.9630503144654088, "grad_norm": 0.306640625, "learning_rate": 8.306957064738385e-07, "loss": 0.806, "step": 4900 }, { "epoch": 0.9640330188679245, "grad_norm": 0.31640625, "learning_rate": 7.871546341614023e-07, "loss": 0.7803, "step": 4905 }, { "epoch": 0.9650157232704403, "grad_norm": 0.3046875, "learning_rate": 7.447810861968552e-07, "loss": 0.7864, "step": 4910 }, { "epoch": 0.965998427672956, "grad_norm": 0.30859375, "learning_rate": 7.03575561226788e-07, "loss": 0.7837, "step": 4915 }, { "epoch": 0.9669811320754716, "grad_norm": 0.302734375, "learning_rate": 6.635385441526754e-07, "loss": 0.7935, "step": 4920 }, { "epoch": 0.9679638364779874, "grad_norm": 0.314453125, "learning_rate": 6.246705061251245e-07, "loss": 0.8074, "step": 4925 }, { "epoch": 0.9689465408805031, "grad_norm": 0.298828125, "learning_rate": 5.86971904538347e-07, "loss": 0.8082, "step": 4930 }, { "epoch": 0.9699292452830188, "grad_norm": 0.3125, "learning_rate": 5.504431830247514e-07, "loss": 0.7889, "step": 4935 }, { "epoch": 0.9709119496855346, "grad_norm": 0.306640625, "learning_rate": 5.150847714497697e-07, "loss": 0.7924, "step": 4940 }, { "epoch": 0.9718946540880503, "grad_norm": 0.296875, "learning_rate": 4.80897085906773e-07, "loss": 0.81, "step": 4945 }, { "epoch": 0.972877358490566, "grad_norm": 0.294921875, "learning_rate": 4.4788052871215234e-07, "loss": 0.805, "step": 4950 }, { "epoch": 0.9738600628930818, "grad_norm": 0.30078125, "learning_rate": 4.1603548840062345e-07, "loss": 0.8101, "step": 4955 }, { "epoch": 0.9748427672955975, "grad_norm": 0.3046875, "learning_rate": 3.853623397206407e-07, "loss": 0.7909, "step": 4960 }, { "epoch": 0.9758254716981132, "grad_norm": 0.302734375, "learning_rate": 3.5586144362997896e-07, "loss": 0.7972, "step": 4965 }, { "epoch": 0.976808176100629, "grad_norm": 0.314453125, "learning_rate": 3.275331472914922e-07, "loss": 0.8101, "step": 4970 }, { "epoch": 0.9777908805031447, "grad_norm": 0.3125, "learning_rate": 3.0037778406902805e-07, "loss": 0.8184, "step": 4975 }, { "epoch": 0.9787735849056604, "grad_norm": 0.3125, "learning_rate": 2.743956735234865e-07, "loss": 0.782, "step": 4980 }, { "epoch": 0.9797562893081762, "grad_norm": 0.322265625, "learning_rate": 2.4958712140911166e-07, "loss": 0.7905, "step": 4985 }, { "epoch": 0.9807389937106918, "grad_norm": 0.310546875, "learning_rate": 2.2595241966982817e-07, "loss": 0.8163, "step": 4990 }, { "epoch": 0.9817216981132075, "grad_norm": 0.3125, "learning_rate": 2.0349184643586595e-07, "loss": 0.8266, "step": 4995 }, { "epoch": 0.9827044025157232, "grad_norm": 0.30859375, "learning_rate": 1.8220566602040745e-07, "loss": 0.8174, "step": 5000 }, { "epoch": 0.983687106918239, "grad_norm": 0.302734375, "learning_rate": 1.6209412891659003e-07, "loss": 0.8052, "step": 5005 }, { "epoch": 0.9846698113207547, "grad_norm": 0.302734375, "learning_rate": 1.4315747179446392e-07, "loss": 0.7871, "step": 5010 }, { "epoch": 0.9856525157232704, "grad_norm": 0.318359375, "learning_rate": 1.2539591749821666e-07, "loss": 0.7974, "step": 5015 }, { "epoch": 0.9866352201257862, "grad_norm": 0.33203125, "learning_rate": 1.088096750436085e-07, "loss": 0.7972, "step": 5020 }, { "epoch": 0.9876179245283019, "grad_norm": 0.31640625, "learning_rate": 9.339893961548551e-08, "loss": 0.8153, "step": 5025 }, { "epoch": 0.9886006289308176, "grad_norm": 0.310546875, "learning_rate": 7.916389256541479e-08, "loss": 0.8146, "step": 5030 }, { "epoch": 0.9895833333333334, "grad_norm": 0.302734375, "learning_rate": 6.610470140967495e-08, "loss": 0.81, "step": 5035 }, { "epoch": 0.9905660377358491, "grad_norm": 0.310546875, "learning_rate": 5.422151982719115e-08, "loss": 0.8167, "step": 5040 }, { "epoch": 0.9915487421383647, "grad_norm": 0.330078125, "learning_rate": 4.351448765775867e-08, "loss": 0.8175, "step": 5045 }, { "epoch": 0.9925314465408805, "grad_norm": 0.310546875, "learning_rate": 3.3983730900377655e-08, "loss": 0.8008, "step": 5050 }, { "epoch": 0.9935141509433962, "grad_norm": 0.302734375, "learning_rate": 2.5629361711809742e-08, "loss": 0.8024, "step": 5055 }, { "epoch": 0.9944968553459119, "grad_norm": 0.30078125, "learning_rate": 1.8451478405223653e-08, "loss": 0.7952, "step": 5060 }, { "epoch": 0.9954795597484277, "grad_norm": 0.314453125, "learning_rate": 1.2450165449062744e-08, "loss": 0.7893, "step": 5065 }, { "epoch": 0.9964622641509434, "grad_norm": 0.306640625, "learning_rate": 7.62549346601249e-09, "loss": 0.8112, "step": 5070 }, { "epoch": 0.9974449685534591, "grad_norm": 0.302734375, "learning_rate": 3.977519232223337e-09, "loss": 0.8175, "step": 5075 }, { "epoch": 0.9984276729559748, "grad_norm": 0.302734375, "learning_rate": 1.5062856765779565e-09, "loss": 0.8088, "step": 5080 }, { "epoch": 0.9994103773584906, "grad_norm": 0.310546875, "learning_rate": 2.118218802582561e-10, "loss": 0.8288, "step": 5085 }, { "epoch": 1.0, "eval_loss": 0.7983007431030273, "eval_runtime": 9224.1097, "eval_samples_per_second": 8.823, "eval_steps_per_second": 0.138, "step": 5088 }, { "epoch": 1.0, "step": 5088, "total_flos": 1.5751056572484157e+19, "train_loss": 0.013955340304839536, "train_runtime": 11108.4839, "train_samples_per_second": 29.312, "train_steps_per_second": 0.458 } ], "logging_steps": 5, "max_steps": 5088, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.5751056572484157e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }