{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2010, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004975124378109452, "grad_norm": 11.875, "learning_rate": 9.950248756218907e-07, "loss": 4.8907, "step": 1 }, { "epoch": 0.0024875621890547263, "grad_norm": 13.125, "learning_rate": 4.975124378109453e-06, "loss": 4.8447, "step": 5 }, { "epoch": 0.004975124378109453, "grad_norm": 12.5, "learning_rate": 9.950248756218906e-06, "loss": 4.9268, "step": 10 }, { "epoch": 0.007462686567164179, "grad_norm": 12.375, "learning_rate": 1.4925373134328357e-05, "loss": 4.9396, "step": 15 }, { "epoch": 0.009950248756218905, "grad_norm": 14.6875, "learning_rate": 1.990049751243781e-05, "loss": 5.0521, "step": 20 }, { "epoch": 0.012437810945273632, "grad_norm": 12.625, "learning_rate": 2.4875621890547266e-05, "loss": 4.7898, "step": 25 }, { "epoch": 0.014925373134328358, "grad_norm": 12.875, "learning_rate": 2.9850746268656714e-05, "loss": 4.8798, "step": 30 }, { "epoch": 0.017412935323383085, "grad_norm": 11.9375, "learning_rate": 3.4825870646766175e-05, "loss": 4.8204, "step": 35 }, { "epoch": 0.01990049751243781, "grad_norm": 11.6875, "learning_rate": 3.980099502487562e-05, "loss": 4.684, "step": 40 }, { "epoch": 0.022388059701492536, "grad_norm": 13.1875, "learning_rate": 4.477611940298508e-05, "loss": 4.6143, "step": 45 }, { "epoch": 0.024875621890547265, "grad_norm": 10.6875, "learning_rate": 4.975124378109453e-05, "loss": 4.5667, "step": 50 }, { "epoch": 0.02736318407960199, "grad_norm": 12.5625, "learning_rate": 5.472636815920398e-05, "loss": 4.6985, "step": 55 }, { "epoch": 0.029850746268656716, "grad_norm": 13.375, "learning_rate": 5.970149253731343e-05, "loss": 4.5517, "step": 60 }, { "epoch": 0.03233830845771144, "grad_norm": 11.5625, "learning_rate": 6.46766169154229e-05, "loss": 4.5351, "step": 65 }, { "epoch": 0.03482587064676617, "grad_norm": 11.5625, "learning_rate": 6.965174129353235e-05, "loss": 4.454, "step": 70 }, { "epoch": 0.03731343283582089, "grad_norm": 11.125, "learning_rate": 7.46268656716418e-05, "loss": 4.4039, "step": 75 }, { "epoch": 0.03980099502487562, "grad_norm": 10.875, "learning_rate": 7.960199004975125e-05, "loss": 4.3716, "step": 80 }, { "epoch": 0.04228855721393035, "grad_norm": 10.8125, "learning_rate": 8.45771144278607e-05, "loss": 4.3879, "step": 85 }, { "epoch": 0.04477611940298507, "grad_norm": 10.25, "learning_rate": 8.955223880597016e-05, "loss": 4.1858, "step": 90 }, { "epoch": 0.0472636815920398, "grad_norm": 10.5, "learning_rate": 9.452736318407961e-05, "loss": 4.421, "step": 95 }, { "epoch": 0.04975124378109453, "grad_norm": 10.375, "learning_rate": 9.950248756218906e-05, "loss": 4.3506, "step": 100 }, { "epoch": 0.05223880597014925, "grad_norm": 11.3125, "learning_rate": 0.0001044776119402985, "loss": 4.2671, "step": 105 }, { "epoch": 0.05472636815920398, "grad_norm": 8.625, "learning_rate": 0.00010945273631840796, "loss": 4.296, "step": 110 }, { "epoch": 0.05721393034825871, "grad_norm": 11.75, "learning_rate": 0.00011442786069651741, "loss": 4.1273, "step": 115 }, { "epoch": 0.05970149253731343, "grad_norm": 8.25, "learning_rate": 0.00011940298507462686, "loss": 4.3068, "step": 120 }, { "epoch": 0.06218905472636816, "grad_norm": 9.1875, "learning_rate": 0.0001243781094527363, "loss": 4.2837, "step": 125 }, { "epoch": 0.06467661691542288, "grad_norm": 11.75, "learning_rate": 0.0001293532338308458, "loss": 4.3174, "step": 130 }, { "epoch": 0.06716417910447761, "grad_norm": 10.0, "learning_rate": 0.00013432835820895525, "loss": 4.086, "step": 135 }, { "epoch": 0.06965174129353234, "grad_norm": 9.3125, "learning_rate": 0.0001393034825870647, "loss": 4.4375, "step": 140 }, { "epoch": 0.07213930348258707, "grad_norm": 8.8125, "learning_rate": 0.00014427860696517416, "loss": 4.2058, "step": 145 }, { "epoch": 0.07462686567164178, "grad_norm": 8.6875, "learning_rate": 0.0001492537313432836, "loss": 4.1712, "step": 150 }, { "epoch": 0.07711442786069651, "grad_norm": 9.1875, "learning_rate": 0.00015422885572139304, "loss": 4.0637, "step": 155 }, { "epoch": 0.07960199004975124, "grad_norm": 9.0, "learning_rate": 0.0001592039800995025, "loss": 4.0861, "step": 160 }, { "epoch": 0.08208955223880597, "grad_norm": 10.0625, "learning_rate": 0.00016417910447761195, "loss": 4.1879, "step": 165 }, { "epoch": 0.0845771144278607, "grad_norm": 8.5625, "learning_rate": 0.0001691542288557214, "loss": 4.2878, "step": 170 }, { "epoch": 0.08706467661691543, "grad_norm": 9.6875, "learning_rate": 0.00017412935323383086, "loss": 4.0535, "step": 175 }, { "epoch": 0.08955223880597014, "grad_norm": 8.8125, "learning_rate": 0.0001791044776119403, "loss": 4.1332, "step": 180 }, { "epoch": 0.09203980099502487, "grad_norm": 9.8125, "learning_rate": 0.00018407960199004977, "loss": 4.114, "step": 185 }, { "epoch": 0.0945273631840796, "grad_norm": 9.25, "learning_rate": 0.00018905472636815922, "loss": 4.0242, "step": 190 }, { "epoch": 0.09701492537313433, "grad_norm": 8.875, "learning_rate": 0.00019402985074626867, "loss": 4.1829, "step": 195 }, { "epoch": 0.09950248756218906, "grad_norm": 11.375, "learning_rate": 0.00019900497512437813, "loss": 4.2259, "step": 200 }, { "epoch": 0.10199004975124377, "grad_norm": 8.4375, "learning_rate": 0.000199997587258178, "loss": 4.302, "step": 205 }, { "epoch": 0.1044776119402985, "grad_norm": 10.4375, "learning_rate": 0.0001999877856940653, "loss": 4.0564, "step": 210 }, { "epoch": 0.10696517412935323, "grad_norm": 10.8125, "learning_rate": 0.00019997044524974799, "loss": 4.0902, "step": 215 }, { "epoch": 0.10945273631840796, "grad_norm": 8.125, "learning_rate": 0.00019994556723266103, "loss": 4.0716, "step": 220 }, { "epoch": 0.11194029850746269, "grad_norm": 8.0, "learning_rate": 0.00019991315351855748, "loss": 4.1908, "step": 225 }, { "epoch": 0.11442786069651742, "grad_norm": 9.0, "learning_rate": 0.00019987320655136693, "loss": 4.2773, "step": 230 }, { "epoch": 0.11691542288557213, "grad_norm": 9.6875, "learning_rate": 0.00019982572934301122, "loss": 4.1853, "step": 235 }, { "epoch": 0.11940298507462686, "grad_norm": 9.0625, "learning_rate": 0.0001997707254731775, "loss": 4.1019, "step": 240 }, { "epoch": 0.12189054726368159, "grad_norm": 9.3125, "learning_rate": 0.00019970819908904814, "loss": 4.198, "step": 245 }, { "epoch": 0.12437810945273632, "grad_norm": 9.125, "learning_rate": 0.00019963815490498817, "loss": 4.1697, "step": 250 }, { "epoch": 0.12686567164179105, "grad_norm": 10.625, "learning_rate": 0.00019956059820218982, "loss": 4.0784, "step": 255 }, { "epoch": 0.12935323383084577, "grad_norm": 9.0, "learning_rate": 0.00019947553482827418, "loss": 4.1693, "step": 260 }, { "epoch": 0.1318407960199005, "grad_norm": 8.0625, "learning_rate": 0.00019938297119685054, "loss": 4.0987, "step": 265 }, { "epoch": 0.13432835820895522, "grad_norm": 7.9375, "learning_rate": 0.00019928291428703262, "loss": 4.1734, "step": 270 }, { "epoch": 0.13681592039800994, "grad_norm": 7.5625, "learning_rate": 0.00019917537164291244, "loss": 4.0456, "step": 275 }, { "epoch": 0.13930348258706468, "grad_norm": 8.25, "learning_rate": 0.0001990603513729915, "loss": 4.1025, "step": 280 }, { "epoch": 0.1417910447761194, "grad_norm": 8.625, "learning_rate": 0.00019893786214956945, "loss": 4.1861, "step": 285 }, { "epoch": 0.14427860696517414, "grad_norm": 8.5, "learning_rate": 0.0001988079132080901, "loss": 4.1516, "step": 290 }, { "epoch": 0.14676616915422885, "grad_norm": 8.0625, "learning_rate": 0.0001986705143464453, "loss": 4.0148, "step": 295 }, { "epoch": 0.14925373134328357, "grad_norm": 7.96875, "learning_rate": 0.0001985256759242359, "loss": 3.9918, "step": 300 }, { "epoch": 0.1517412935323383, "grad_norm": 7.875, "learning_rate": 0.00019837340886199096, "loss": 4.0434, "step": 305 }, { "epoch": 0.15422885572139303, "grad_norm": 7.65625, "learning_rate": 0.00019821372464034416, "loss": 4.1499, "step": 310 }, { "epoch": 0.15671641791044777, "grad_norm": 8.75, "learning_rate": 0.00019804663529916826, "loss": 4.0495, "step": 315 }, { "epoch": 0.15920398009950248, "grad_norm": 9.6875, "learning_rate": 0.00019787215343666732, "loss": 4.0981, "step": 320 }, { "epoch": 0.16169154228855723, "grad_norm": 8.3125, "learning_rate": 0.00019769029220842677, "loss": 4.0678, "step": 325 }, { "epoch": 0.16417910447761194, "grad_norm": 8.8125, "learning_rate": 0.0001975010653264216, "loss": 4.1043, "step": 330 }, { "epoch": 0.16666666666666666, "grad_norm": 8.5, "learning_rate": 0.00019730448705798239, "loss": 4.1361, "step": 335 }, { "epoch": 0.1691542288557214, "grad_norm": 8.1875, "learning_rate": 0.00019710057222471967, "loss": 3.9753, "step": 340 }, { "epoch": 0.17164179104477612, "grad_norm": 7.875, "learning_rate": 0.00019688933620140637, "loss": 4.1972, "step": 345 }, { "epoch": 0.17412935323383086, "grad_norm": 7.4375, "learning_rate": 0.0001966707949148186, "loss": 4.0355, "step": 350 }, { "epoch": 0.17661691542288557, "grad_norm": 8.1875, "learning_rate": 0.00019644496484253474, "loss": 4.0079, "step": 355 }, { "epoch": 0.1791044776119403, "grad_norm": 8.4375, "learning_rate": 0.00019621186301169315, "loss": 3.9721, "step": 360 }, { "epoch": 0.18159203980099503, "grad_norm": 8.25, "learning_rate": 0.00019597150699770835, "loss": 4.1628, "step": 365 }, { "epoch": 0.18407960199004975, "grad_norm": 8.0625, "learning_rate": 0.0001957239149229458, "loss": 3.9472, "step": 370 }, { "epoch": 0.1865671641791045, "grad_norm": 8.125, "learning_rate": 0.00019546910545535558, "loss": 4.2425, "step": 375 }, { "epoch": 0.1890547263681592, "grad_norm": 9.0625, "learning_rate": 0.00019520709780706486, "loss": 4.1314, "step": 380 }, { "epoch": 0.19154228855721392, "grad_norm": 8.5, "learning_rate": 0.00019493791173292923, "loss": 4.0324, "step": 385 }, { "epoch": 0.19402985074626866, "grad_norm": 8.0, "learning_rate": 0.00019466156752904343, "loss": 4.0719, "step": 390 }, { "epoch": 0.19651741293532338, "grad_norm": 8.0625, "learning_rate": 0.00019437808603121087, "loss": 3.8774, "step": 395 }, { "epoch": 0.19900497512437812, "grad_norm": 9.3125, "learning_rate": 0.00019408748861337273, "loss": 4.1163, "step": 400 }, { "epoch": 0.20149253731343283, "grad_norm": 8.1875, "learning_rate": 0.00019378979718599645, "loss": 4.1658, "step": 405 }, { "epoch": 0.20398009950248755, "grad_norm": 7.625, "learning_rate": 0.0001934850341944237, "loss": 4.1059, "step": 410 }, { "epoch": 0.2064676616915423, "grad_norm": 7.9375, "learning_rate": 0.00019317322261717794, "loss": 4.0303, "step": 415 }, { "epoch": 0.208955223880597, "grad_norm": 8.25, "learning_rate": 0.00019285438596423204, "loss": 3.9217, "step": 420 }, { "epoch": 0.21144278606965175, "grad_norm": 7.375, "learning_rate": 0.00019252854827523557, "loss": 4.0601, "step": 425 }, { "epoch": 0.21393034825870647, "grad_norm": 9.6875, "learning_rate": 0.00019219573411770235, "loss": 4.025, "step": 430 }, { "epoch": 0.21641791044776118, "grad_norm": 7.71875, "learning_rate": 0.000191855968585158, "loss": 4.0082, "step": 435 }, { "epoch": 0.21890547263681592, "grad_norm": 6.46875, "learning_rate": 0.000191509277295248, "loss": 3.9052, "step": 440 }, { "epoch": 0.22139303482587064, "grad_norm": 9.3125, "learning_rate": 0.00019115568638780622, "loss": 3.8947, "step": 445 }, { "epoch": 0.22388059701492538, "grad_norm": 7.40625, "learning_rate": 0.00019079522252288386, "loss": 3.8908, "step": 450 }, { "epoch": 0.2263681592039801, "grad_norm": 8.5, "learning_rate": 0.00019042791287873957, "loss": 4.198, "step": 455 }, { "epoch": 0.22885572139303484, "grad_norm": 8.1875, "learning_rate": 0.00019005378514979008, "loss": 4.06, "step": 460 }, { "epoch": 0.23134328358208955, "grad_norm": 7.625, "learning_rate": 0.00018967286754452214, "loss": 4.1332, "step": 465 }, { "epoch": 0.23383084577114427, "grad_norm": 7.9375, "learning_rate": 0.0001892851887833657, "loss": 4.0782, "step": 470 }, { "epoch": 0.236318407960199, "grad_norm": 7.28125, "learning_rate": 0.0001888907780965284, "loss": 4.0219, "step": 475 }, { "epoch": 0.23880597014925373, "grad_norm": 7.59375, "learning_rate": 0.00018848966522179168, "loss": 4.0916, "step": 480 }, { "epoch": 0.24129353233830847, "grad_norm": 7.53125, "learning_rate": 0.00018808188040226868, "loss": 4.1352, "step": 485 }, { "epoch": 0.24378109452736318, "grad_norm": 8.25, "learning_rate": 0.00018766745438412384, "loss": 4.0799, "step": 490 }, { "epoch": 0.2462686567164179, "grad_norm": 8.375, "learning_rate": 0.00018724641841425478, "loss": 4.0501, "step": 495 }, { "epoch": 0.24875621890547264, "grad_norm": 7.90625, "learning_rate": 0.00018681880423793642, "loss": 4.0131, "step": 500 }, { "epoch": 0.2512437810945274, "grad_norm": 8.25, "learning_rate": 0.00018638464409642723, "loss": 4.2064, "step": 505 }, { "epoch": 0.2537313432835821, "grad_norm": 7.0, "learning_rate": 0.00018594397072453856, "loss": 4.1475, "step": 510 }, { "epoch": 0.2562189054726368, "grad_norm": 7.28125, "learning_rate": 0.00018549681734816623, "loss": 3.9829, "step": 515 }, { "epoch": 0.25870646766169153, "grad_norm": 7.5625, "learning_rate": 0.0001850432176817857, "loss": 4.0752, "step": 520 }, { "epoch": 0.26119402985074625, "grad_norm": 7.96875, "learning_rate": 0.00018458320592590975, "loss": 3.8724, "step": 525 }, { "epoch": 0.263681592039801, "grad_norm": 7.3125, "learning_rate": 0.00018411681676450999, "loss": 4.0854, "step": 530 }, { "epoch": 0.26616915422885573, "grad_norm": 7.84375, "learning_rate": 0.0001836440853624017, "loss": 4.0408, "step": 535 }, { "epoch": 0.26865671641791045, "grad_norm": 8.6875, "learning_rate": 0.00018316504736259255, "loss": 4.0437, "step": 540 }, { "epoch": 0.27114427860696516, "grad_norm": 6.8125, "learning_rate": 0.00018267973888359509, "loss": 4.0593, "step": 545 }, { "epoch": 0.2736318407960199, "grad_norm": 7.625, "learning_rate": 0.00018218819651670356, "loss": 3.9724, "step": 550 }, { "epoch": 0.27611940298507465, "grad_norm": 7.09375, "learning_rate": 0.00018169045732323492, "loss": 4.1018, "step": 555 }, { "epoch": 0.27860696517412936, "grad_norm": 7.75, "learning_rate": 0.00018118655883173456, "loss": 4.1389, "step": 560 }, { "epoch": 0.2810945273631841, "grad_norm": 7.75, "learning_rate": 0.0001806765390351467, "loss": 4.0369, "step": 565 }, { "epoch": 0.2835820895522388, "grad_norm": 6.84375, "learning_rate": 0.00018016043638794974, "loss": 4.1131, "step": 570 }, { "epoch": 0.2860696517412935, "grad_norm": 6.6875, "learning_rate": 0.00017963828980325697, "loss": 3.8789, "step": 575 }, { "epoch": 0.2885572139303483, "grad_norm": 8.0625, "learning_rate": 0.00017911013864988252, "loss": 4.1892, "step": 580 }, { "epoch": 0.291044776119403, "grad_norm": 7.4375, "learning_rate": 0.00017857602274937308, "loss": 4.0332, "step": 585 }, { "epoch": 0.2935323383084577, "grad_norm": 7.6875, "learning_rate": 0.00017803598237300537, "loss": 4.0141, "step": 590 }, { "epoch": 0.2960199004975124, "grad_norm": 7.34375, "learning_rate": 0.00017749005823874988, "loss": 3.9258, "step": 595 }, { "epoch": 0.29850746268656714, "grad_norm": 6.9375, "learning_rate": 0.00017693829150820068, "loss": 4.072, "step": 600 }, { "epoch": 0.3009950248756219, "grad_norm": 7.3125, "learning_rate": 0.00017638072378347203, "loss": 3.8492, "step": 605 }, { "epoch": 0.3034825870646766, "grad_norm": 7.1875, "learning_rate": 0.0001758173971040616, "loss": 3.8323, "step": 610 }, { "epoch": 0.30597014925373134, "grad_norm": 8.25, "learning_rate": 0.00017524835394368065, "loss": 3.9926, "step": 615 }, { "epoch": 0.30845771144278605, "grad_norm": 7.53125, "learning_rate": 0.00017467363720705204, "loss": 4.0593, "step": 620 }, { "epoch": 0.31094527363184077, "grad_norm": 7.90625, "learning_rate": 0.0001740932902266747, "loss": 3.8775, "step": 625 }, { "epoch": 0.31343283582089554, "grad_norm": 7.8125, "learning_rate": 0.00017350735675955697, "loss": 4.1344, "step": 630 }, { "epoch": 0.31592039800995025, "grad_norm": 8.4375, "learning_rate": 0.000172915880983917, "loss": 3.948, "step": 635 }, { "epoch": 0.31840796019900497, "grad_norm": 6.96875, "learning_rate": 0.0001723189074958521, "loss": 3.9485, "step": 640 }, { "epoch": 0.3208955223880597, "grad_norm": 7.625, "learning_rate": 0.00017171648130597612, "loss": 3.9687, "step": 645 }, { "epoch": 0.32338308457711445, "grad_norm": 6.6875, "learning_rate": 0.0001711086478360257, "loss": 4.0554, "step": 650 }, { "epoch": 0.32587064676616917, "grad_norm": 6.9375, "learning_rate": 0.0001704954529154359, "loss": 4.0395, "step": 655 }, { "epoch": 0.3283582089552239, "grad_norm": 6.71875, "learning_rate": 0.00016987694277788417, "loss": 3.9427, "step": 660 }, { "epoch": 0.3308457711442786, "grad_norm": 7.4375, "learning_rate": 0.000169253164057805, "loss": 3.9438, "step": 665 }, { "epoch": 0.3333333333333333, "grad_norm": 7.6875, "learning_rate": 0.0001686241637868734, "loss": 4.1186, "step": 670 }, { "epoch": 0.3358208955223881, "grad_norm": 6.59375, "learning_rate": 0.00016798998939045895, "loss": 4.0849, "step": 675 }, { "epoch": 0.3383084577114428, "grad_norm": 7.375, "learning_rate": 0.00016735068868404998, "loss": 3.9868, "step": 680 }, { "epoch": 0.3407960199004975, "grad_norm": 6.9375, "learning_rate": 0.0001667063098696485, "loss": 3.9275, "step": 685 }, { "epoch": 0.34328358208955223, "grad_norm": 8.125, "learning_rate": 0.0001660569015321357, "loss": 4.0451, "step": 690 }, { "epoch": 0.34577114427860695, "grad_norm": 8.3125, "learning_rate": 0.00016540251263560878, "loss": 3.9818, "step": 695 }, { "epoch": 0.3482587064676617, "grad_norm": 8.875, "learning_rate": 0.00016474319251968923, "loss": 3.9491, "step": 700 }, { "epoch": 0.35074626865671643, "grad_norm": 6.65625, "learning_rate": 0.00016407899089580262, "loss": 3.9901, "step": 705 }, { "epoch": 0.35323383084577115, "grad_norm": 7.46875, "learning_rate": 0.0001634099578434306, "loss": 3.9471, "step": 710 }, { "epoch": 0.35572139303482586, "grad_norm": 7.375, "learning_rate": 0.00016273614380633484, "loss": 3.897, "step": 715 }, { "epoch": 0.3582089552238806, "grad_norm": 8.4375, "learning_rate": 0.0001620575995887538, "loss": 3.9658, "step": 720 }, { "epoch": 0.36069651741293535, "grad_norm": 6.78125, "learning_rate": 0.00016137437635157213, "loss": 3.9457, "step": 725 }, { "epoch": 0.36318407960199006, "grad_norm": 6.96875, "learning_rate": 0.00016068652560846327, "loss": 4.143, "step": 730 }, { "epoch": 0.3656716417910448, "grad_norm": 7.21875, "learning_rate": 0.0001599940992220053, "loss": 4.0813, "step": 735 }, { "epoch": 0.3681592039800995, "grad_norm": 7.40625, "learning_rate": 0.0001592971493997709, "loss": 4.019, "step": 740 }, { "epoch": 0.3706467661691542, "grad_norm": 6.75, "learning_rate": 0.00015859572869039064, "loss": 4.0779, "step": 745 }, { "epoch": 0.373134328358209, "grad_norm": 8.1875, "learning_rate": 0.00015788988997959114, "loss": 4.1056, "step": 750 }, { "epoch": 0.3756218905472637, "grad_norm": 6.53125, "learning_rate": 0.00015717968648620764, "loss": 4.0207, "step": 755 }, { "epoch": 0.3781094527363184, "grad_norm": 7.8125, "learning_rate": 0.00015646517175817114, "loss": 4.2123, "step": 760 }, { "epoch": 0.3805970149253731, "grad_norm": 6.84375, "learning_rate": 0.00015574639966847126, "loss": 4.0826, "step": 765 }, { "epoch": 0.38308457711442784, "grad_norm": 6.71875, "learning_rate": 0.00015502342441109422, "loss": 4.0236, "step": 770 }, { "epoch": 0.3855721393034826, "grad_norm": 7.625, "learning_rate": 0.00015429630049693674, "loss": 3.9291, "step": 775 }, { "epoch": 0.3880597014925373, "grad_norm": 7.875, "learning_rate": 0.00015356508274969594, "loss": 4.0301, "step": 780 }, { "epoch": 0.39054726368159204, "grad_norm": 7.03125, "learning_rate": 0.00015282982630173585, "loss": 3.9478, "step": 785 }, { "epoch": 0.39303482587064675, "grad_norm": 6.0625, "learning_rate": 0.00015209058658993056, "loss": 3.9102, "step": 790 }, { "epoch": 0.39552238805970147, "grad_norm": 6.46875, "learning_rate": 0.0001513474193514842, "loss": 4.0111, "step": 795 }, { "epoch": 0.39800995024875624, "grad_norm": 7.15625, "learning_rate": 0.00015060038061972874, "loss": 3.9447, "step": 800 }, { "epoch": 0.40049751243781095, "grad_norm": 5.5, "learning_rate": 0.000149849526719899, "loss": 3.7303, "step": 805 }, { "epoch": 0.40298507462686567, "grad_norm": 7.21875, "learning_rate": 0.00014909491426488578, "loss": 4.1654, "step": 810 }, { "epoch": 0.4054726368159204, "grad_norm": 7.46875, "learning_rate": 0.00014833660015096766, "loss": 3.8909, "step": 815 }, { "epoch": 0.4079601990049751, "grad_norm": 6.84375, "learning_rate": 0.00014757464155352082, "loss": 3.9657, "step": 820 }, { "epoch": 0.41044776119402987, "grad_norm": 7.125, "learning_rate": 0.0001468090959227082, "loss": 3.9625, "step": 825 }, { "epoch": 0.4129353233830846, "grad_norm": 7.34375, "learning_rate": 0.00014604002097914806, "loss": 3.8299, "step": 830 }, { "epoch": 0.4154228855721393, "grad_norm": 7.15625, "learning_rate": 0.00014526747470956176, "loss": 3.9513, "step": 835 }, { "epoch": 0.417910447761194, "grad_norm": 6.09375, "learning_rate": 0.00014449151536240166, "loss": 3.828, "step": 840 }, { "epoch": 0.42039800995024873, "grad_norm": 6.0625, "learning_rate": 0.00014371220144345954, "loss": 3.9232, "step": 845 }, { "epoch": 0.4228855721393035, "grad_norm": 5.96875, "learning_rate": 0.0001429295917114551, "loss": 3.8572, "step": 850 }, { "epoch": 0.4253731343283582, "grad_norm": 7.03125, "learning_rate": 0.00014214374517360575, "loss": 3.9477, "step": 855 }, { "epoch": 0.42786069651741293, "grad_norm": 7.90625, "learning_rate": 0.00014135472108117787, "loss": 4.2486, "step": 860 }, { "epoch": 0.43034825870646765, "grad_norm": 6.34375, "learning_rate": 0.00014056257892501885, "loss": 3.9868, "step": 865 }, { "epoch": 0.43283582089552236, "grad_norm": 6.5, "learning_rate": 0.00013976737843107202, "loss": 4.1234, "step": 870 }, { "epoch": 0.43532338308457713, "grad_norm": 7.0625, "learning_rate": 0.00013896917955587328, "loss": 4.006, "step": 875 }, { "epoch": 0.43781094527363185, "grad_norm": 7.375, "learning_rate": 0.00013816804248203052, "loss": 3.9775, "step": 880 }, { "epoch": 0.44029850746268656, "grad_norm": 6.84375, "learning_rate": 0.00013736402761368598, "loss": 3.9257, "step": 885 }, { "epoch": 0.4427860696517413, "grad_norm": 6.78125, "learning_rate": 0.00013655719557196185, "loss": 3.9621, "step": 890 }, { "epoch": 0.44527363184079605, "grad_norm": 7.25, "learning_rate": 0.0001357476071903896, "loss": 3.8718, "step": 895 }, { "epoch": 0.44776119402985076, "grad_norm": 7.4375, "learning_rate": 0.0001349353235103232, "loss": 3.9892, "step": 900 }, { "epoch": 0.4502487562189055, "grad_norm": 7.625, "learning_rate": 0.00013412040577633687, "loss": 4.2505, "step": 905 }, { "epoch": 0.4527363184079602, "grad_norm": 7.03125, "learning_rate": 0.0001333029154316072, "loss": 3.9349, "step": 910 }, { "epoch": 0.4552238805970149, "grad_norm": 6.9375, "learning_rate": 0.00013248291411328047, "loss": 3.9892, "step": 915 }, { "epoch": 0.4577114427860697, "grad_norm": 5.84375, "learning_rate": 0.00013166046364782545, "loss": 3.9654, "step": 920 }, { "epoch": 0.4601990049751244, "grad_norm": 7.03125, "learning_rate": 0.0001308356260463717, "loss": 4.0497, "step": 925 }, { "epoch": 0.4626865671641791, "grad_norm": 7.34375, "learning_rate": 0.0001300084635000341, "loss": 3.8808, "step": 930 }, { "epoch": 0.4651741293532338, "grad_norm": 6.71875, "learning_rate": 0.0001291790383752237, "loss": 3.957, "step": 935 }, { "epoch": 0.46766169154228854, "grad_norm": 7.59375, "learning_rate": 0.00012834741320894553, "loss": 3.936, "step": 940 }, { "epoch": 0.4701492537313433, "grad_norm": 6.34375, "learning_rate": 0.00012751365070408333, "loss": 4.0231, "step": 945 }, { "epoch": 0.472636815920398, "grad_norm": 5.96875, "learning_rate": 0.00012667781372467202, "loss": 4.0101, "step": 950 }, { "epoch": 0.47512437810945274, "grad_norm": 7.15625, "learning_rate": 0.00012583996529115762, "loss": 3.9361, "step": 955 }, { "epoch": 0.47761194029850745, "grad_norm": 6.5625, "learning_rate": 0.00012500016857564585, "loss": 4.0114, "step": 960 }, { "epoch": 0.48009950248756217, "grad_norm": 6.875, "learning_rate": 0.00012415848689713903, "loss": 3.9577, "step": 965 }, { "epoch": 0.48258706467661694, "grad_norm": 6.875, "learning_rate": 0.00012331498371676204, "loss": 3.8951, "step": 970 }, { "epoch": 0.48507462686567165, "grad_norm": 7.0, "learning_rate": 0.0001224697226329772, "loss": 3.9695, "step": 975 }, { "epoch": 0.48756218905472637, "grad_norm": 6.09375, "learning_rate": 0.00012162276737678933, "loss": 3.9444, "step": 980 }, { "epoch": 0.4900497512437811, "grad_norm": 6.75, "learning_rate": 0.0001207741818069405, "loss": 3.9877, "step": 985 }, { "epoch": 0.4925373134328358, "grad_norm": 6.875, "learning_rate": 0.00011992402990509515, "loss": 3.9706, "step": 990 }, { "epoch": 0.49502487562189057, "grad_norm": 6.9375, "learning_rate": 0.00011907237577101611, "loss": 3.8701, "step": 995 }, { "epoch": 0.4975124378109453, "grad_norm": 7.46875, "learning_rate": 0.00011821928361773147, "loss": 4.0109, "step": 1000 }, { "epoch": 0.5, "grad_norm": 6.53125, "learning_rate": 0.00011736481776669306, "loss": 3.9716, "step": 1005 }, { "epoch": 0.5024875621890548, "grad_norm": 6.8125, "learning_rate": 0.00011650904264292687, "loss": 3.9534, "step": 1010 }, { "epoch": 0.5049751243781094, "grad_norm": 8.5, "learning_rate": 0.00011565202277017551, "loss": 4.0376, "step": 1015 }, { "epoch": 0.5074626865671642, "grad_norm": 5.75, "learning_rate": 0.000114793822766033, "loss": 3.9223, "step": 1020 }, { "epoch": 0.5099502487562189, "grad_norm": 7.0, "learning_rate": 0.00011393450733707309, "loss": 4.11, "step": 1025 }, { "epoch": 0.5124378109452736, "grad_norm": 6.6875, "learning_rate": 0.00011307414127397027, "loss": 4.0138, "step": 1030 }, { "epoch": 0.5149253731343284, "grad_norm": 7.09375, "learning_rate": 0.00011221278944661473, "loss": 3.8801, "step": 1035 }, { "epoch": 0.5174129353233831, "grad_norm": 7.84375, "learning_rate": 0.00011135051679922141, "loss": 4.0368, "step": 1040 }, { "epoch": 0.5199004975124378, "grad_norm": 6.4375, "learning_rate": 0.00011048738834543319, "loss": 3.8343, "step": 1045 }, { "epoch": 0.5223880597014925, "grad_norm": 7.21875, "learning_rate": 0.00010962346916341903, "loss": 3.885, "step": 1050 }, { "epoch": 0.5248756218905473, "grad_norm": 6.125, "learning_rate": 0.00010875882439096729, "loss": 3.9348, "step": 1055 }, { "epoch": 0.527363184079602, "grad_norm": 6.375, "learning_rate": 0.00010789351922057435, "loss": 3.9439, "step": 1060 }, { "epoch": 0.5298507462686567, "grad_norm": 6.4375, "learning_rate": 0.0001070276188945293, "loss": 3.7975, "step": 1065 }, { "epoch": 0.5323383084577115, "grad_norm": 6.25, "learning_rate": 0.00010616118869999483, "loss": 3.8004, "step": 1070 }, { "epoch": 0.5348258706467661, "grad_norm": 5.875, "learning_rate": 0.00010529429396408452, "loss": 3.967, "step": 1075 }, { "epoch": 0.5373134328358209, "grad_norm": 7.15625, "learning_rate": 0.00010442700004893764, "loss": 3.8504, "step": 1080 }, { "epoch": 0.5398009950248757, "grad_norm": 6.46875, "learning_rate": 0.00010355937234679065, "loss": 3.7783, "step": 1085 }, { "epoch": 0.5422885572139303, "grad_norm": 7.0, "learning_rate": 0.00010269147627504692, "loss": 3.7741, "step": 1090 }, { "epoch": 0.5447761194029851, "grad_norm": 6.3125, "learning_rate": 0.0001018233772713443, "loss": 3.9042, "step": 1095 }, { "epoch": 0.5472636815920398, "grad_norm": 6.0, "learning_rate": 0.00010095514078862147, "loss": 4.0435, "step": 1100 }, { "epoch": 0.5497512437810945, "grad_norm": 6.6875, "learning_rate": 0.00010008683229018256, "loss": 4.0422, "step": 1105 }, { "epoch": 0.5522388059701493, "grad_norm": 7.0, "learning_rate": 9.92185172447616e-05, "loss": 3.9499, "step": 1110 }, { "epoch": 0.554726368159204, "grad_norm": 7.125, "learning_rate": 9.835026112158637e-05, "loss": 3.9851, "step": 1115 }, { "epoch": 0.5572139303482587, "grad_norm": 6.25, "learning_rate": 9.74821293854419e-05, "loss": 3.9625, "step": 1120 }, { "epoch": 0.5597014925373134, "grad_norm": 6.75, "learning_rate": 9.661418749173467e-05, "loss": 3.9269, "step": 1125 }, { "epoch": 0.5621890547263682, "grad_norm": 6.15625, "learning_rate": 9.574650088155752e-05, "loss": 4.0838, "step": 1130 }, { "epoch": 0.5646766169154229, "grad_norm": 7.40625, "learning_rate": 9.487913497675536e-05, "loss": 4.0415, "step": 1135 }, { "epoch": 0.5671641791044776, "grad_norm": 6.59375, "learning_rate": 9.40121551749925e-05, "loss": 4.0316, "step": 1140 }, { "epoch": 0.5696517412935324, "grad_norm": 6.0, "learning_rate": 9.314562684482202e-05, "loss": 4.0425, "step": 1145 }, { "epoch": 0.572139303482587, "grad_norm": 6.25, "learning_rate": 9.227961532075671e-05, "loss": 3.9342, "step": 1150 }, { "epoch": 0.5746268656716418, "grad_norm": 6.65625, "learning_rate": 9.141418589834339e-05, "loss": 4.0811, "step": 1155 }, { "epoch": 0.5771144278606966, "grad_norm": 7.5625, "learning_rate": 9.054940382923953e-05, "loss": 4.0697, "step": 1160 }, { "epoch": 0.5796019900497512, "grad_norm": 7.4375, "learning_rate": 8.96853343162934e-05, "loss": 4.0852, "step": 1165 }, { "epoch": 0.582089552238806, "grad_norm": 5.8125, "learning_rate": 8.882204250862796e-05, "loss": 4.0077, "step": 1170 }, { "epoch": 0.5845771144278606, "grad_norm": 6.65625, "learning_rate": 8.795959349672878e-05, "loss": 3.9226, "step": 1175 }, { "epoch": 0.5870646766169154, "grad_norm": 6.78125, "learning_rate": 8.709805230753627e-05, "loss": 4.0092, "step": 1180 }, { "epoch": 0.5895522388059702, "grad_norm": 6.4375, "learning_rate": 8.623748389954283e-05, "loss": 3.9131, "step": 1185 }, { "epoch": 0.5920398009950248, "grad_norm": 6.375, "learning_rate": 8.537795315789509e-05, "loss": 3.857, "step": 1190 }, { "epoch": 0.5945273631840796, "grad_norm": 5.8125, "learning_rate": 8.451952488950166e-05, "loss": 3.9707, "step": 1195 }, { "epoch": 0.5970149253731343, "grad_norm": 7.15625, "learning_rate": 8.366226381814697e-05, "loss": 3.9853, "step": 1200 }, { "epoch": 0.599502487562189, "grad_norm": 5.9375, "learning_rate": 8.280623457961102e-05, "loss": 4.107, "step": 1205 }, { "epoch": 0.6019900497512438, "grad_norm": 6.4375, "learning_rate": 8.195150171679608e-05, "loss": 3.7088, "step": 1210 }, { "epoch": 0.6044776119402985, "grad_norm": 6.15625, "learning_rate": 8.109812967486025e-05, "loss": 3.9205, "step": 1215 }, { "epoch": 0.6069651741293532, "grad_norm": 7.1875, "learning_rate": 8.02461827963585e-05, "loss": 3.9086, "step": 1220 }, { "epoch": 0.6094527363184079, "grad_norm": 6.3125, "learning_rate": 7.939572531639128e-05, "loss": 3.9716, "step": 1225 }, { "epoch": 0.6119402985074627, "grad_norm": 6.40625, "learning_rate": 7.854682135776131e-05, "loss": 3.9194, "step": 1230 }, { "epoch": 0.6144278606965174, "grad_norm": 6.875, "learning_rate": 7.769953492613899e-05, "loss": 3.8653, "step": 1235 }, { "epoch": 0.6169154228855721, "grad_norm": 6.625, "learning_rate": 7.685392990523626e-05, "loss": 4.043, "step": 1240 }, { "epoch": 0.6194029850746269, "grad_norm": 6.6875, "learning_rate": 7.601007005199021e-05, "loss": 3.829, "step": 1245 }, { "epoch": 0.6218905472636815, "grad_norm": 6.53125, "learning_rate": 7.516801899175565e-05, "loss": 3.9138, "step": 1250 }, { "epoch": 0.6243781094527363, "grad_norm": 6.5, "learning_rate": 7.432784021350796e-05, "loss": 3.9103, "step": 1255 }, { "epoch": 0.6268656716417911, "grad_norm": 6.15625, "learning_rate": 7.348959706505626e-05, "loss": 3.9792, "step": 1260 }, { "epoch": 0.6293532338308457, "grad_norm": 6.28125, "learning_rate": 7.265335274826704e-05, "loss": 4.0775, "step": 1265 }, { "epoch": 0.6318407960199005, "grad_norm": 6.59375, "learning_rate": 7.181917031429874e-05, "loss": 4.0234, "step": 1270 }, { "epoch": 0.6343283582089553, "grad_norm": 7.03125, "learning_rate": 7.09871126588481e-05, "loss": 3.9329, "step": 1275 }, { "epoch": 0.6368159203980099, "grad_norm": 6.0625, "learning_rate": 7.015724251740766e-05, "loss": 3.6704, "step": 1280 }, { "epoch": 0.6393034825870647, "grad_norm": 6.71875, "learning_rate": 6.932962246053577e-05, "loss": 3.8563, "step": 1285 }, { "epoch": 0.6417910447761194, "grad_norm": 5.84375, "learning_rate": 6.850431488913895e-05, "loss": 3.8506, "step": 1290 }, { "epoch": 0.6442786069651741, "grad_norm": 6.8125, "learning_rate": 6.76813820297669e-05, "loss": 4.008, "step": 1295 }, { "epoch": 0.6467661691542289, "grad_norm": 7.09375, "learning_rate": 6.686088592992067e-05, "loss": 4.0959, "step": 1300 }, { "epoch": 0.6492537313432836, "grad_norm": 6.375, "learning_rate": 6.604288845337453e-05, "loss": 4.0365, "step": 1305 }, { "epoch": 0.6517412935323383, "grad_norm": 6.96875, "learning_rate": 6.522745127551158e-05, "loss": 3.8927, "step": 1310 }, { "epoch": 0.654228855721393, "grad_norm": 6.96875, "learning_rate": 6.44146358786734e-05, "loss": 3.9165, "step": 1315 }, { "epoch": 0.6567164179104478, "grad_norm": 6.65625, "learning_rate": 6.360450354752458e-05, "loss": 4.1257, "step": 1320 }, { "epoch": 0.6592039800995025, "grad_norm": 7.125, "learning_rate": 6.279711536443185e-05, "loss": 3.9571, "step": 1325 }, { "epoch": 0.6616915422885572, "grad_norm": 6.125, "learning_rate": 6.199253220485856e-05, "loss": 3.7978, "step": 1330 }, { "epoch": 0.664179104477612, "grad_norm": 6.59375, "learning_rate": 6.119081473277501e-05, "loss": 3.859, "step": 1335 }, { "epoch": 0.6666666666666666, "grad_norm": 6.1875, "learning_rate": 6.039202339608432e-05, "loss": 4.015, "step": 1340 }, { "epoch": 0.6691542288557214, "grad_norm": 6.625, "learning_rate": 5.959621842206474e-05, "loss": 4.0804, "step": 1345 }, { "epoch": 0.6716417910447762, "grad_norm": 6.625, "learning_rate": 5.880345981282876e-05, "loss": 4.0607, "step": 1350 }, { "epoch": 0.6741293532338308, "grad_norm": 6.46875, "learning_rate": 5.801380734079907e-05, "loss": 3.8616, "step": 1355 }, { "epoch": 0.6766169154228856, "grad_norm": 6.375, "learning_rate": 5.722732054420172e-05, "loss": 3.8968, "step": 1360 }, { "epoch": 0.6791044776119403, "grad_norm": 6.71875, "learning_rate": 5.6444058722577165e-05, "loss": 4.0431, "step": 1365 }, { "epoch": 0.681592039800995, "grad_norm": 6.65625, "learning_rate": 5.566408093230911e-05, "loss": 3.9798, "step": 1370 }, { "epoch": 0.6840796019900498, "grad_norm": 5.375, "learning_rate": 5.4887445982171906e-05, "loss": 3.7958, "step": 1375 }, { "epoch": 0.6865671641791045, "grad_norm": 5.96875, "learning_rate": 5.4114212428896424e-05, "loss": 3.9962, "step": 1380 }, { "epoch": 0.6890547263681592, "grad_norm": 6.0625, "learning_rate": 5.334443857275487e-05, "loss": 4.009, "step": 1385 }, { "epoch": 0.6915422885572139, "grad_norm": 6.65625, "learning_rate": 5.257818245316522e-05, "loss": 3.9681, "step": 1390 }, { "epoch": 0.6940298507462687, "grad_norm": 7.21875, "learning_rate": 5.1815501844315105e-05, "loss": 4.0784, "step": 1395 }, { "epoch": 0.6965174129353234, "grad_norm": 7.375, "learning_rate": 5.105645425080572e-05, "loss": 4.0183, "step": 1400 }, { "epoch": 0.6990049751243781, "grad_norm": 6.0625, "learning_rate": 5.030109690331625e-05, "loss": 3.9356, "step": 1405 }, { "epoch": 0.7014925373134329, "grad_norm": 7.625, "learning_rate": 4.954948675428853e-05, "loss": 3.7845, "step": 1410 }, { "epoch": 0.7039800995024875, "grad_norm": 6.1875, "learning_rate": 4.880168047363312e-05, "loss": 3.7763, "step": 1415 }, { "epoch": 0.7064676616915423, "grad_norm": 6.46875, "learning_rate": 4.8057734444456536e-05, "loss": 4.0405, "step": 1420 }, { "epoch": 0.7089552238805971, "grad_norm": 6.09375, "learning_rate": 4.7317704758809946e-05, "loss": 3.9666, "step": 1425 }, { "epoch": 0.7114427860696517, "grad_norm": 7.0, "learning_rate": 4.658164721345998e-05, "loss": 3.9511, "step": 1430 }, { "epoch": 0.7139303482587065, "grad_norm": 7.03125, "learning_rate": 4.584961730568188e-05, "loss": 4.0864, "step": 1435 }, { "epoch": 0.7164179104477612, "grad_norm": 6.625, "learning_rate": 4.512167022907494e-05, "loss": 4.0077, "step": 1440 }, { "epoch": 0.7189054726368159, "grad_norm": 6.53125, "learning_rate": 4.439786086940115e-05, "loss": 3.8572, "step": 1445 }, { "epoch": 0.7213930348258707, "grad_norm": 6.5625, "learning_rate": 4.3678243800446835e-05, "loss": 3.812, "step": 1450 }, { "epoch": 0.7238805970149254, "grad_norm": 5.375, "learning_rate": 4.296287327990797e-05, "loss": 3.816, "step": 1455 }, { "epoch": 0.7263681592039801, "grad_norm": 6.03125, "learning_rate": 4.225180324529917e-05, "loss": 3.8844, "step": 1460 }, { "epoch": 0.7288557213930348, "grad_norm": 7.46875, "learning_rate": 4.1545087309887045e-05, "loss": 3.9426, "step": 1465 }, { "epoch": 0.7313432835820896, "grad_norm": 6.0, "learning_rate": 4.084277875864776e-05, "loss": 3.9788, "step": 1470 }, { "epoch": 0.7338308457711443, "grad_norm": 6.34375, "learning_rate": 4.014493054424944e-05, "loss": 4.0493, "step": 1475 }, { "epoch": 0.736318407960199, "grad_norm": 6.78125, "learning_rate": 3.945159528305971e-05, "loss": 4.0197, "step": 1480 }, { "epoch": 0.7388059701492538, "grad_norm": 6.8125, "learning_rate": 3.876282525117847e-05, "loss": 3.9014, "step": 1485 }, { "epoch": 0.7412935323383084, "grad_norm": 7.15625, "learning_rate": 3.807867238049642e-05, "loss": 3.987, "step": 1490 }, { "epoch": 0.7437810945273632, "grad_norm": 6.0, "learning_rate": 3.739918825477953e-05, "loss": 3.9318, "step": 1495 }, { "epoch": 0.746268656716418, "grad_norm": 5.5625, "learning_rate": 3.672442410577965e-05, "loss": 3.8518, "step": 1500 }, { "epoch": 0.7487562189054726, "grad_norm": 5.59375, "learning_rate": 3.605443080937172e-05, "loss": 3.7997, "step": 1505 }, { "epoch": 0.7512437810945274, "grad_norm": 6.09375, "learning_rate": 3.5389258881718e-05, "loss": 3.9, "step": 1510 }, { "epoch": 0.753731343283582, "grad_norm": 6.4375, "learning_rate": 3.472895847545905e-05, "loss": 4.005, "step": 1515 }, { "epoch": 0.7562189054726368, "grad_norm": 7.375, "learning_rate": 3.407357937593237e-05, "loss": 3.9962, "step": 1520 }, { "epoch": 0.7587064676616916, "grad_norm": 6.5, "learning_rate": 3.342317099741886e-05, "loss": 3.9809, "step": 1525 }, { "epoch": 0.7611940298507462, "grad_norm": 6.875, "learning_rate": 3.27777823794168e-05, "loss": 3.9891, "step": 1530 }, { "epoch": 0.763681592039801, "grad_norm": 6.84375, "learning_rate": 3.213746218294455e-05, "loss": 4.0958, "step": 1535 }, { "epoch": 0.7661691542288557, "grad_norm": 6.1875, "learning_rate": 3.150225868687161e-05, "loss": 3.838, "step": 1540 }, { "epoch": 0.7686567164179104, "grad_norm": 7.46875, "learning_rate": 3.0872219784278354e-05, "loss": 3.9027, "step": 1545 }, { "epoch": 0.7711442786069652, "grad_norm": 5.78125, "learning_rate": 3.02473929788452e-05, "loss": 3.9055, "step": 1550 }, { "epoch": 0.7736318407960199, "grad_norm": 6.0, "learning_rate": 2.96278253812707e-05, "loss": 3.9548, "step": 1555 }, { "epoch": 0.7761194029850746, "grad_norm": 6.71875, "learning_rate": 2.901356370571967e-05, "loss": 3.8413, "step": 1560 }, { "epoch": 0.7786069651741293, "grad_norm": 6.625, "learning_rate": 2.840465426630091e-05, "loss": 4.1502, "step": 1565 }, { "epoch": 0.7810945273631841, "grad_norm": 6.53125, "learning_rate": 2.7801142973575243e-05, "loss": 3.851, "step": 1570 }, { "epoch": 0.7835820895522388, "grad_norm": 6.125, "learning_rate": 2.7203075331094017e-05, "loss": 4.0059, "step": 1575 }, { "epoch": 0.7860696517412935, "grad_norm": 7.0, "learning_rate": 2.6610496431968125e-05, "loss": 3.8795, "step": 1580 }, { "epoch": 0.7885572139303483, "grad_norm": 6.53125, "learning_rate": 2.6023450955468176e-05, "loss": 3.8933, "step": 1585 }, { "epoch": 0.7910447761194029, "grad_norm": 5.96875, "learning_rate": 2.54419831636557e-05, "loss": 4.1032, "step": 1590 }, { "epoch": 0.7935323383084577, "grad_norm": 5.8125, "learning_rate": 2.4866136898045843e-05, "loss": 3.8866, "step": 1595 }, { "epoch": 0.7960199004975125, "grad_norm": 6.375, "learning_rate": 2.4295955576301965e-05, "loss": 4.0359, "step": 1600 }, { "epoch": 0.7985074626865671, "grad_norm": 6.75, "learning_rate": 2.3731482188961818e-05, "loss": 3.8639, "step": 1605 }, { "epoch": 0.8009950248756219, "grad_norm": 6.78125, "learning_rate": 2.317275929619627e-05, "loss": 4.0732, "step": 1610 }, { "epoch": 0.8034825870646766, "grad_norm": 7.625, "learning_rate": 2.261982902460039e-05, "loss": 3.9888, "step": 1615 }, { "epoch": 0.8059701492537313, "grad_norm": 6.0, "learning_rate": 2.2072733064017103e-05, "loss": 4.0829, "step": 1620 }, { "epoch": 0.8084577114427861, "grad_norm": 8.0625, "learning_rate": 2.1531512664393838e-05, "loss": 4.1679, "step": 1625 }, { "epoch": 0.8109452736318408, "grad_norm": 7.0, "learning_rate": 2.0996208632672475e-05, "loss": 4.0939, "step": 1630 }, { "epoch": 0.8134328358208955, "grad_norm": 6.8125, "learning_rate": 2.0466861329712473e-05, "loss": 3.8609, "step": 1635 }, { "epoch": 0.8159203980099502, "grad_norm": 6.71875, "learning_rate": 1.9943510667247813e-05, "loss": 4.083, "step": 1640 }, { "epoch": 0.818407960199005, "grad_norm": 6.1875, "learning_rate": 1.9426196104877735e-05, "loss": 3.9825, "step": 1645 }, { "epoch": 0.8208955223880597, "grad_norm": 6.96875, "learning_rate": 1.89149566470915e-05, "loss": 3.9559, "step": 1650 }, { "epoch": 0.8233830845771144, "grad_norm": 6.59375, "learning_rate": 1.8409830840327546e-05, "loss": 4.0314, "step": 1655 }, { "epoch": 0.8258706467661692, "grad_norm": 6.1875, "learning_rate": 1.791085677006722e-05, "loss": 3.8751, "step": 1660 }, { "epoch": 0.8283582089552238, "grad_norm": 7.25, "learning_rate": 1.741807205796314e-05, "loss": 4.0051, "step": 1665 }, { "epoch": 0.8308457711442786, "grad_norm": 5.59375, "learning_rate": 1.6931513859002635e-05, "loss": 3.9194, "step": 1670 }, { "epoch": 0.8333333333333334, "grad_norm": 6.34375, "learning_rate": 1.6451218858706374e-05, "loss": 3.8528, "step": 1675 }, { "epoch": 0.835820895522388, "grad_norm": 6.15625, "learning_rate": 1.5977223270362196e-05, "loss": 4.0617, "step": 1680 }, { "epoch": 0.8383084577114428, "grad_norm": 7.125, "learning_rate": 1.5509562832294944e-05, "loss": 3.7389, "step": 1685 }, { "epoch": 0.8407960199004975, "grad_norm": 6.25, "learning_rate": 1.5048272805171615e-05, "loss": 3.9292, "step": 1690 }, { "epoch": 0.8432835820895522, "grad_norm": 5.90625, "learning_rate": 1.459338796934293e-05, "loss": 4.011, "step": 1695 }, { "epoch": 0.845771144278607, "grad_norm": 6.53125, "learning_rate": 1.4144942622220902e-05, "loss": 3.8728, "step": 1700 }, { "epoch": 0.8482587064676617, "grad_norm": 6.71875, "learning_rate": 1.3702970575692975e-05, "loss": 4.0874, "step": 1705 }, { "epoch": 0.8507462686567164, "grad_norm": 6.9375, "learning_rate": 1.3267505153572501e-05, "loss": 4.0708, "step": 1710 }, { "epoch": 0.8532338308457711, "grad_norm": 5.03125, "learning_rate": 1.2838579189086353e-05, "loss": 3.8598, "step": 1715 }, { "epoch": 0.8557213930348259, "grad_norm": 7.46875, "learning_rate": 1.2416225022399286e-05, "loss": 4.0435, "step": 1720 }, { "epoch": 0.8582089552238806, "grad_norm": 5.96875, "learning_rate": 1.2000474498175552e-05, "loss": 4.054, "step": 1725 }, { "epoch": 0.8606965174129353, "grad_norm": 6.8125, "learning_rate": 1.1591358963177923e-05, "loss": 3.8522, "step": 1730 }, { "epoch": 0.8631840796019901, "grad_norm": 5.34375, "learning_rate": 1.118890926390419e-05, "loss": 3.9089, "step": 1735 }, { "epoch": 0.8656716417910447, "grad_norm": 6.8125, "learning_rate": 1.0793155744261351e-05, "loss": 4.0584, "step": 1740 }, { "epoch": 0.8681592039800995, "grad_norm": 6.25, "learning_rate": 1.0404128243277777e-05, "loss": 3.9094, "step": 1745 }, { "epoch": 0.8706467661691543, "grad_norm": 6.75, "learning_rate": 1.0021856092853432e-05, "loss": 3.9982, "step": 1750 }, { "epoch": 0.8731343283582089, "grad_norm": 6.25, "learning_rate": 9.646368115548232e-06, "loss": 3.946, "step": 1755 }, { "epoch": 0.8756218905472637, "grad_norm": 6.71875, "learning_rate": 9.277692622409018e-06, "loss": 3.8958, "step": 1760 }, { "epoch": 0.8781094527363185, "grad_norm": 6.375, "learning_rate": 8.915857410834794e-06, "loss": 3.7367, "step": 1765 }, { "epoch": 0.8805970149253731, "grad_norm": 6.28125, "learning_rate": 8.56088976248095e-06, "loss": 3.9724, "step": 1770 }, { "epoch": 0.8830845771144279, "grad_norm": 6.125, "learning_rate": 8.212816441202309e-06, "loss": 4.0212, "step": 1775 }, { "epoch": 0.8855721393034826, "grad_norm": 6.90625, "learning_rate": 7.871663691035103e-06, "loss": 3.6865, "step": 1780 }, { "epoch": 0.8880597014925373, "grad_norm": 6.78125, "learning_rate": 7.53745723421827e-06, "loss": 3.9914, "step": 1785 }, { "epoch": 0.8905472636815921, "grad_norm": 5.4375, "learning_rate": 7.2102222692540415e-06, "loss": 3.9573, "step": 1790 }, { "epoch": 0.8930348258706468, "grad_norm": 6.78125, "learning_rate": 6.889983469008055e-06, "loss": 4.1287, "step": 1795 }, { "epoch": 0.8955223880597015, "grad_norm": 5.84375, "learning_rate": 6.576764978849004e-06, "loss": 4.1117, "step": 1800 }, { "epoch": 0.8980099502487562, "grad_norm": 6.8125, "learning_rate": 6.27059041482817e-06, "loss": 3.8274, "step": 1805 }, { "epoch": 0.900497512437811, "grad_norm": 6.4375, "learning_rate": 5.971482861898836e-06, "loss": 3.8697, "step": 1810 }, { "epoch": 0.9029850746268657, "grad_norm": 6.03125, "learning_rate": 5.679464872175666e-06, "loss": 3.9326, "step": 1815 }, { "epoch": 0.9054726368159204, "grad_norm": 6.5, "learning_rate": 5.394558463234378e-06, "loss": 3.8915, "step": 1820 }, { "epoch": 0.9079601990049752, "grad_norm": 5.9375, "learning_rate": 5.116785116451661e-06, "loss": 3.9306, "step": 1825 }, { "epoch": 0.9104477611940298, "grad_norm": 5.96875, "learning_rate": 4.846165775385459e-06, "loss": 3.901, "step": 1830 }, { "epoch": 0.9129353233830846, "grad_norm": 6.28125, "learning_rate": 4.5827208441959424e-06, "loss": 3.9952, "step": 1835 }, { "epoch": 0.9154228855721394, "grad_norm": 5.9375, "learning_rate": 4.3264701861070345e-06, "loss": 4.0501, "step": 1840 }, { "epoch": 0.917910447761194, "grad_norm": 5.90625, "learning_rate": 4.077433121908747e-06, "loss": 3.7784, "step": 1845 }, { "epoch": 0.9203980099502488, "grad_norm": 6.8125, "learning_rate": 3.835628428500515e-06, "loss": 3.928, "step": 1850 }, { "epoch": 0.9228855721393034, "grad_norm": 6.53125, "learning_rate": 3.601074337475352e-06, "loss": 3.9705, "step": 1855 }, { "epoch": 0.9253731343283582, "grad_norm": 7.5, "learning_rate": 3.3737885337452814e-06, "loss": 4.0769, "step": 1860 }, { "epoch": 0.927860696517413, "grad_norm": 6.15625, "learning_rate": 3.153788154207926e-06, "loss": 3.9098, "step": 1865 }, { "epoch": 0.9303482587064676, "grad_norm": 6.875, "learning_rate": 2.9410897864544206e-06, "loss": 4.037, "step": 1870 }, { "epoch": 0.9328358208955224, "grad_norm": 6.21875, "learning_rate": 2.735709467518699e-06, "loss": 3.9169, "step": 1875 }, { "epoch": 0.9353233830845771, "grad_norm": 6.5, "learning_rate": 2.5376626826683956e-06, "loss": 3.9237, "step": 1880 }, { "epoch": 0.9378109452736318, "grad_norm": 6.375, "learning_rate": 2.3469643642372586e-06, "loss": 3.9737, "step": 1885 }, { "epoch": 0.9402985074626866, "grad_norm": 7.15625, "learning_rate": 2.1636288904992585e-06, "loss": 4.0875, "step": 1890 }, { "epoch": 0.9427860696517413, "grad_norm": 6.25, "learning_rate": 1.9876700845845475e-06, "loss": 3.8926, "step": 1895 }, { "epoch": 0.945273631840796, "grad_norm": 6.03125, "learning_rate": 1.8191012134371577e-06, "loss": 3.8843, "step": 1900 }, { "epoch": 0.9477611940298507, "grad_norm": 6.59375, "learning_rate": 1.6579349868147687e-06, "loss": 3.9296, "step": 1905 }, { "epoch": 0.9502487562189055, "grad_norm": 6.9375, "learning_rate": 1.5041835563303742e-06, "loss": 3.8406, "step": 1910 }, { "epoch": 0.9527363184079602, "grad_norm": 6.78125, "learning_rate": 1.3578585145360812e-06, "loss": 4.1326, "step": 1915 }, { "epoch": 0.9552238805970149, "grad_norm": 6.21875, "learning_rate": 1.2189708940490652e-06, "loss": 3.8935, "step": 1920 }, { "epoch": 0.9577114427860697, "grad_norm": 5.6875, "learning_rate": 1.0875311667196908e-06, "loss": 3.7818, "step": 1925 }, { "epoch": 0.9601990049751243, "grad_norm": 5.15625, "learning_rate": 9.635492428420434e-07, "loss": 3.6428, "step": 1930 }, { "epoch": 0.9626865671641791, "grad_norm": 6.4375, "learning_rate": 8.470344704066046e-07, "loss": 3.8555, "step": 1935 }, { "epoch": 0.9651741293532339, "grad_norm": 6.1875, "learning_rate": 7.379956343955386e-07, "loss": 3.8856, "step": 1940 }, { "epoch": 0.9676616915422885, "grad_norm": 6.0, "learning_rate": 6.364409561202323e-07, "loss": 4.0294, "step": 1945 }, { "epoch": 0.9701492537313433, "grad_norm": 7.15625, "learning_rate": 5.42378092601481e-07, "loss": 3.9593, "step": 1950 }, { "epoch": 0.972636815920398, "grad_norm": 6.25, "learning_rate": 4.558141359921386e-07, "loss": 3.9914, "step": 1955 }, { "epoch": 0.9751243781094527, "grad_norm": 8.375, "learning_rate": 3.7675561304238994e-07, "loss": 3.9707, "step": 1960 }, { "epoch": 0.9776119402985075, "grad_norm": 5.65625, "learning_rate": 3.0520848460765527e-07, "loss": 3.7689, "step": 1965 }, { "epoch": 0.9800995024875622, "grad_norm": 6.6875, "learning_rate": 2.4117814519911684e-07, "loss": 3.9225, "step": 1970 }, { "epoch": 0.9825870646766169, "grad_norm": 6.125, "learning_rate": 1.846694225770551e-07, "loss": 3.9233, "step": 1975 }, { "epoch": 0.9850746268656716, "grad_norm": 6.0625, "learning_rate": 1.3568657738678435e-07, "loss": 3.8331, "step": 1980 }, { "epoch": 0.9875621890547264, "grad_norm": 6.40625, "learning_rate": 9.423330283742093e-08, "loss": 3.993, "step": 1985 }, { "epoch": 0.9900497512437811, "grad_norm": 6.4375, "learning_rate": 6.031272442341696e-08, "loss": 3.8852, "step": 1990 }, { "epoch": 0.9925373134328358, "grad_norm": 6.75, "learning_rate": 3.392739968894887e-08, "loss": 3.821, "step": 1995 }, { "epoch": 0.9950248756218906, "grad_norm": 6.59375, "learning_rate": 1.5079318035016164e-08, "loss": 3.975, "step": 2000 }, { "epoch": 0.9975124378109452, "grad_norm": 5.46875, "learning_rate": 3.769900569505769e-09, "loss": 4.0483, "step": 2005 }, { "epoch": 1.0, "grad_norm": 8.5625, "learning_rate": 0.0, "loss": 3.9369, "step": 2010 }, { "epoch": 1.0, "step": 2010, "total_flos": 1275064289820672.0, "train_loss": 4.025861802029966, "train_runtime": 256.6461, "train_samples_per_second": 125.278, "train_steps_per_second": 7.832 } ], "logging_steps": 5, "max_steps": 2010, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1275064289820672.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }