diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.09342927922614132, + "epoch": 0.007431874483897605, "eval_steps": 200, - "global_step": 1792, + "global_step": 2263, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -12953,1545 +12953,5367 @@ "step": 1600 }, { - "epoch": 0.07089772325115017, - "grad_norm": 1.0, + "epoch": 0.00011796626164916834, + "grad_norm": 0.69921875, "learning_rate": 7.585101150577211e-05, - "loss": 1.6539, + "loss": 1.6086, "num_input_tokens_seen": 1888616448, "step": 1601 }, { - "epoch": 0.07101568951279934, - "grad_norm": 0.8125, + "epoch": 0.0002359325232983367, + "grad_norm": 0.640625, "learning_rate": 7.584408903418355e-05, - "loss": 1.7915, + "loss": 1.4421, "num_input_tokens_seen": 1889796096, "step": 1602 }, { - "epoch": 0.07113365577444851, - "grad_norm": 1.015625, + "epoch": 0.000353898784947505, + "grad_norm": 0.69140625, "learning_rate": 7.583716110885992e-05, - "loss": 1.4891, + "loss": 1.3924, "num_input_tokens_seen": 1890975744, "step": 1603 }, { - "epoch": 0.07125162203609768, - "grad_norm": 0.8359375, + "epoch": 0.0004718650465966734, + "grad_norm": 0.83203125, "learning_rate": 7.583022773085532e-05, - "loss": 1.6747, + "loss": 1.4947, "num_input_tokens_seen": 1892155392, "step": 1604 }, { - "epoch": 0.07136958829774684, - "grad_norm": 0.9765625, + "epoch": 0.0005898313082458417, + "grad_norm": 0.79296875, "learning_rate": 7.582328890122466e-05, - "loss": 1.5514, + "loss": 1.5896, "num_input_tokens_seen": 1893335040, "step": 1605 }, { - "epoch": 0.07148755455939601, - "grad_norm": 0.73046875, + "epoch": 0.00070779756989501, + "grad_norm": 0.73828125, "learning_rate": 7.581634462102373e-05, - "loss": 1.6703, + "loss": 1.4465, "num_input_tokens_seen": 1894514688, "step": 1606 }, { - "epoch": 0.07160552082104518, - "grad_norm": 0.84765625, + "epoch": 0.0008257638315441783, + "grad_norm": 0.78515625, "learning_rate": 7.580939489130906e-05, - "loss": 1.5847, + "loss": 1.48, "num_input_tokens_seen": 1895694336, "step": 1607 }, { - "epoch": 0.07172348708269435, - "grad_norm": 0.78125, + "epoch": 0.0009437300931933467, + "grad_norm": 0.7109375, "learning_rate": 7.580243971313811e-05, - "loss": 1.5541, + "loss": 1.5052, "num_input_tokens_seen": 1896873984, "step": 1608 }, { - "epoch": 0.07184145334434351, - "grad_norm": 0.81640625, + "epoch": 0.001061696354842515, + "grad_norm": 0.8046875, "learning_rate": 7.579547908756911e-05, - "loss": 1.6264, + "loss": 1.5373, "num_input_tokens_seen": 1898053632, "step": 1609 }, { - "epoch": 0.07195941960599268, - "grad_norm": 0.7265625, + "epoch": 0.0011796626164916834, + "grad_norm": 0.828125, "learning_rate": 7.578851301566112e-05, - "loss": 1.6484, + "loss": 1.4559, "num_input_tokens_seen": 1899233280, "step": 1610 }, { - "epoch": 0.07207738586764185, - "grad_norm": 0.83203125, + "epoch": 0.0012976288781408518, + "grad_norm": 1.015625, "learning_rate": 7.578154149847404e-05, - "loss": 1.5452, + "loss": 1.5937, "num_input_tokens_seen": 1900412928, "step": 1611 }, { - "epoch": 0.07219535212929103, - "grad_norm": 0.73828125, + "epoch": 0.00141559513979002, + "grad_norm": 0.85546875, "learning_rate": 7.57745645370686e-05, - "loss": 1.6889, + "loss": 1.6112, "num_input_tokens_seen": 1901592576, "step": 1612 }, { - "epoch": 0.07231331839094018, - "grad_norm": 1.015625, + "epoch": 0.0015335614014391884, + "grad_norm": 0.83203125, "learning_rate": 7.576758213250638e-05, - "loss": 1.5947, + "loss": 1.4209, "num_input_tokens_seen": 1902772224, "step": 1613 }, { - "epoch": 0.07243128465258936, - "grad_norm": 0.74609375, + "epoch": 0.0016515276630883566, + "grad_norm": 0.94921875, "learning_rate": 7.576059428584972e-05, - "loss": 1.823, + "loss": 1.6171, "num_input_tokens_seen": 1903951872, "step": 1614 }, { - "epoch": 0.07254925091423853, - "grad_norm": 0.91015625, + "epoch": 0.001769493924737525, + "grad_norm": 0.9453125, "learning_rate": 7.575360099816185e-05, - "loss": 1.7295, + "loss": 1.4181, "num_input_tokens_seen": 1905131520, "step": 1615 }, { - "epoch": 0.0726672171758877, - "grad_norm": 0.703125, + "epoch": 0.0018874601863866935, + "grad_norm": 1.0078125, "learning_rate": 7.574660227050681e-05, - "loss": 1.687, + "loss": 1.529, "num_input_tokens_seen": 1906311168, "step": 1616 }, { - "epoch": 0.07278518343753687, - "grad_norm": 1.015625, + "epoch": 0.0020054264480358617, + "grad_norm": 0.91015625, "learning_rate": 7.573959810394948e-05, - "loss": 1.6457, + "loss": 1.6515, "num_input_tokens_seen": 1907490816, "step": 1617 }, { - "epoch": 0.07290314969918603, - "grad_norm": 0.87109375, + "epoch": 0.00212339270968503, + "grad_norm": 1.015625, "learning_rate": 7.573258849955555e-05, - "loss": 1.6067, + "loss": 1.6023, "num_input_tokens_seen": 1908670464, "step": 1618 }, { - "epoch": 0.0730211159608352, - "grad_norm": 0.8515625, + "epoch": 0.0022413589713341986, + "grad_norm": 0.890625, "learning_rate": 7.572557345839153e-05, - "loss": 1.7924, + "loss": 1.4418, "num_input_tokens_seen": 1909850112, "step": 1619 }, { - "epoch": 0.07313908222248437, - "grad_norm": 0.7109375, + "epoch": 0.0023593252329833668, + "grad_norm": 1.0703125, "learning_rate": 7.571855298152477e-05, - "loss": 1.7058, + "loss": 1.5494, "num_input_tokens_seen": 1911029760, "step": 1620 }, { - "epoch": 0.07325704848413354, - "grad_norm": 0.828125, + "epoch": 0.002477291494632535, + "grad_norm": 0.84375, "learning_rate": 7.571152707002347e-05, - "loss": 1.6567, + "loss": 1.5324, "num_input_tokens_seen": 1912209408, "step": 1621 }, { - "epoch": 0.0733750147457827, - "grad_norm": 0.765625, + "epoch": 0.0025952577562817036, + "grad_norm": 1.0078125, "learning_rate": 7.57044957249566e-05, - "loss": 1.5657, + "loss": 1.5491, "num_input_tokens_seen": 1913389056, "step": 1622 }, { - "epoch": 0.07349298100743187, - "grad_norm": 0.70703125, + "epoch": 0.002713224017930872, + "grad_norm": 0.81640625, "learning_rate": 7.569745894739402e-05, - "loss": 1.7528, + "loss": 1.5509, "num_input_tokens_seen": 1914568704, "step": 1623 }, { - "epoch": 0.07361094726908105, - "grad_norm": 0.75390625, + "epoch": 0.00283119027958004, + "grad_norm": 0.97265625, "learning_rate": 7.569041673840637e-05, - "loss": 1.752, + "loss": 1.5173, "num_input_tokens_seen": 1915748352, "step": 1624 }, { - "epoch": 0.07372891353073022, - "grad_norm": 0.75, + "epoch": 0.0029491565412292082, + "grad_norm": 0.69921875, "learning_rate": 7.568336909906514e-05, - "loss": 1.6214, + "loss": 1.6629, "num_input_tokens_seen": 1916928000, "step": 1625 }, { - "epoch": 0.07384687979237937, - "grad_norm": 0.83984375, + "epoch": 0.003067122802878377, + "grad_norm": 0.91015625, "learning_rate": 7.567631603044264e-05, - "loss": 1.6274, + "loss": 1.5456, "num_input_tokens_seen": 1918107648, "step": 1626 }, { - "epoch": 0.07396484605402855, - "grad_norm": 0.89453125, + "epoch": 0.003185089064527545, + "grad_norm": 0.87890625, "learning_rate": 7.5669257533612e-05, - "loss": 1.6394, + "loss": 1.4868, "num_input_tokens_seen": 1919287296, "step": 1627 }, { - "epoch": 0.07408281231567772, - "grad_norm": 0.796875, + "epoch": 0.0033030553261767133, + "grad_norm": 0.8671875, "learning_rate": 7.566219360964719e-05, - "loss": 1.6565, + "loss": 1.475, "num_input_tokens_seen": 1920466944, "step": 1628 }, { - "epoch": 0.07420077857732689, - "grad_norm": 0.796875, + "epoch": 0.003421021587825882, + "grad_norm": 0.84375, "learning_rate": 7.565512425962298e-05, - "loss": 1.6811, + "loss": 1.4149, "num_input_tokens_seen": 1921646592, "step": 1629 }, { - "epoch": 0.07431874483897605, - "grad_norm": 0.83984375, + "epoch": 0.00353898784947505, + "grad_norm": 0.84375, "learning_rate": 7.5648049484615e-05, - "loss": 1.891, + "loss": 1.4973, "num_input_tokens_seen": 1922826240, "step": 1630 }, { - "epoch": 0.07443671110062522, - "grad_norm": 0.95703125, + "epoch": 0.0036569541111242184, + "grad_norm": 0.859375, "learning_rate": 7.564096928569969e-05, - "loss": 1.5795, + "loss": 1.5576, "num_input_tokens_seen": 1924005888, "step": 1631 }, { - "epoch": 0.07455467736227439, - "grad_norm": 0.76171875, + "epoch": 0.003774920372773387, + "grad_norm": 0.83984375, "learning_rate": 7.56338836639543e-05, - "loss": 1.8111, + "loss": 1.5025, "num_input_tokens_seen": 1925185536, "step": 1632 }, { - "epoch": 0.07467264362392356, - "grad_norm": 0.8359375, + "epoch": 0.003892886634422555, + "grad_norm": 0.87109375, "learning_rate": 7.562679262045692e-05, - "loss": 1.5859, + "loss": 1.5112, "num_input_tokens_seen": 1926365184, "step": 1633 }, { - "epoch": 0.07479060988557272, - "grad_norm": 0.81640625, + "epoch": 0.004010852896071723, + "grad_norm": 0.6875, "learning_rate": 7.561969615628649e-05, - "loss": 1.6349, + "loss": 1.6745, "num_input_tokens_seen": 1927544832, "step": 1634 }, { - "epoch": 0.07490857614722189, - "grad_norm": 0.67578125, + "epoch": 0.004128819157720892, + "grad_norm": 0.85546875, "learning_rate": 7.56125942725227e-05, - "loss": 1.8438, + "loss": 1.5378, "num_input_tokens_seen": 1928724480, "step": 1635 }, { - "epoch": 0.07502654240887106, - "grad_norm": 0.734375, + "epoch": 0.00424678541937006, + "grad_norm": 0.7890625, "learning_rate": 7.560548697024616e-05, - "loss": 1.8446, + "loss": 1.4558, "num_input_tokens_seen": 1929904128, "step": 1636 }, { - "epoch": 0.07514450867052024, - "grad_norm": 0.796875, + "epoch": 0.004364751681019229, + "grad_norm": 0.8984375, "learning_rate": 7.559837425053822e-05, - "loss": 1.6934, + "loss": 1.5246, "num_input_tokens_seen": 1931083776, "step": 1637 }, { - "epoch": 0.0752624749321694, - "grad_norm": 0.8046875, + "epoch": 0.004482717942668397, + "grad_norm": 0.7890625, "learning_rate": 7.559125611448112e-05, - "loss": 1.6434, + "loss": 1.6329, "num_input_tokens_seen": 1932263424, "step": 1638 }, { - "epoch": 0.07538044119381856, - "grad_norm": 0.73046875, + "epoch": 0.004600684204317565, + "grad_norm": 0.8515625, "learning_rate": 7.558413256315788e-05, - "loss": 1.7177, + "loss": 1.6571, "num_input_tokens_seen": 1933443072, "step": 1639 }, { - "epoch": 0.07549840745546774, - "grad_norm": 0.70703125, + "epoch": 0.0047186504659667335, + "grad_norm": 0.7109375, "learning_rate": 7.557700359765238e-05, - "loss": 1.7676, + "loss": 1.75, "num_input_tokens_seen": 1934622720, "step": 1640 }, { - "epoch": 0.07561637371711691, - "grad_norm": 0.734375, + "epoch": 0.004836616727615902, + "grad_norm": 0.78515625, "learning_rate": 7.556986921904927e-05, - "loss": 1.83, + "loss": 1.7082, "num_input_tokens_seen": 1935802368, "step": 1641 }, { - "epoch": 0.07573433997876608, - "grad_norm": 0.76171875, + "epoch": 0.00495458298926507, + "grad_norm": 0.75390625, "learning_rate": 7.556272942843407e-05, - "loss": 1.6012, + "loss": 1.6003, "num_input_tokens_seen": 1936982016, "step": 1642 }, { - "epoch": 0.07585230624041524, - "grad_norm": 0.765625, + "epoch": 0.005072549250914238, + "grad_norm": 0.80859375, "learning_rate": 7.555558422689312e-05, - "loss": 1.6306, + "loss": 1.6011, "num_input_tokens_seen": 1938161664, "step": 1643 }, { - "epoch": 0.07597027250206441, - "grad_norm": 0.73828125, + "epoch": 0.005190515512563407, + "grad_norm": 0.7109375, "learning_rate": 7.554843361551357e-05, - "loss": 1.6861, + "loss": 1.5739, "num_input_tokens_seen": 1939341312, "step": 1644 }, { - "epoch": 0.07608823876371358, - "grad_norm": 0.796875, + "epoch": 0.0053084817742125754, + "grad_norm": 0.7890625, "learning_rate": 7.55412775953834e-05, - "loss": 1.592, + "loss": 1.5394, "num_input_tokens_seen": 1940520960, "step": 1645 }, { - "epoch": 0.07620620502536275, - "grad_norm": 0.65625, + "epoch": 0.005426448035861744, + "grad_norm": 0.9609375, "learning_rate": 7.553411616759141e-05, - "loss": 1.8981, + "loss": 1.3772, "num_input_tokens_seen": 1941700608, "step": 1646 }, { - "epoch": 0.07632417128701191, - "grad_norm": 0.73828125, + "epoch": 0.005544414297510912, + "grad_norm": 0.81640625, "learning_rate": 7.55269493332272e-05, - "loss": 1.8203, + "loss": 1.4369, "num_input_tokens_seen": 1942880256, "step": 1647 }, { - "epoch": 0.07644213754866108, - "grad_norm": 0.75390625, + "epoch": 0.00566238055916008, + "grad_norm": 0.73828125, "learning_rate": 7.551977709338125e-05, - "loss": 1.6233, + "loss": 1.7093, "num_input_tokens_seen": 1944059904, "step": 1648 }, { - "epoch": 0.07656010381031025, - "grad_norm": 0.77734375, + "epoch": 0.005780346820809248, + "grad_norm": 0.7734375, "learning_rate": 7.55125994491448e-05, - "loss": 1.729, + "loss": 1.5003, "num_input_tokens_seen": 1945239552, "step": 1649 }, { - "epoch": 0.07667807007195943, - "grad_norm": 0.875, + "epoch": 0.0058983130824584165, + "grad_norm": 0.81640625, "learning_rate": 7.550541640160996e-05, - "loss": 1.6082, + "loss": 1.6225, "num_input_tokens_seen": 1946419200, "step": 1650 }, { - "epoch": 0.07679603633360858, - "grad_norm": 0.74609375, + "epoch": 0.0060162793441075856, + "grad_norm": 0.86328125, "learning_rate": 7.549822795186963e-05, - "loss": 1.6944, + "loss": 1.4609, "num_input_tokens_seen": 1947598848, "step": 1651 }, { - "epoch": 0.07691400259525775, - "grad_norm": 0.8359375, + "epoch": 0.006134245605756754, + "grad_norm": 0.80859375, "learning_rate": 7.549103410101754e-05, - "loss": 1.6198, + "loss": 1.5634, "num_input_tokens_seen": 1948778496, "step": 1652 }, { - "epoch": 0.07703196885690693, - "grad_norm": 0.78515625, + "epoch": 0.006252211867405922, + "grad_norm": 0.77734375, "learning_rate": 7.548383485014826e-05, - "loss": 1.612, + "loss": 1.5238, "num_input_tokens_seen": 1949958144, "step": 1653 }, { - "epoch": 0.0771499351185561, - "grad_norm": 0.73828125, + "epoch": 0.00637017812905509, + "grad_norm": 0.78125, "learning_rate": 7.547663020035717e-05, - "loss": 1.695, + "loss": 1.6528, "num_input_tokens_seen": 1951137792, "step": 1654 }, { - "epoch": 0.07726790138020526, - "grad_norm": 0.7421875, + "epoch": 0.006488144390704258, + "grad_norm": 0.765625, "learning_rate": 7.546942015274046e-05, - "loss": 1.6304, + "loss": 1.6072, "num_input_tokens_seen": 1952317440, "step": 1655 }, { - "epoch": 0.07738586764185443, - "grad_norm": 0.78125, + "epoch": 0.006606110652353427, + "grad_norm": 0.7890625, "learning_rate": 7.546220470839512e-05, - "loss": 1.7287, + "loss": 1.4707, "num_input_tokens_seen": 1953497088, "step": 1656 }, { - "epoch": 0.0775038339035036, - "grad_norm": 0.7421875, + "epoch": 0.006724076914002596, + "grad_norm": 0.7109375, "learning_rate": 7.545498386841904e-05, - "loss": 1.7101, + "loss": 1.678, "num_input_tokens_seen": 1954676736, "step": 1657 }, { - "epoch": 0.07762180016515277, - "grad_norm": 0.78125, + "epoch": 0.006842043175651764, + "grad_norm": 0.92578125, "learning_rate": 7.544775763391086e-05, - "loss": 1.5402, + "loss": 1.5031, "num_input_tokens_seen": 1955856384, "step": 1658 }, { - "epoch": 0.07773976642680193, - "grad_norm": 0.6484375, + "epoch": 0.006960009437300932, + "grad_norm": 0.75390625, "learning_rate": 7.544052600597009e-05, - "loss": 1.8927, + "loss": 1.4664, "num_input_tokens_seen": 1957036032, "step": 1659 }, { - "epoch": 0.0778577326884511, - "grad_norm": 0.7421875, + "epoch": 0.0070779756989501, + "grad_norm": 1.0078125, "learning_rate": 7.543328898569698e-05, - "loss": 1.7273, + "loss": 1.5593, "num_input_tokens_seen": 1958215680, "step": 1660 }, { - "epoch": 0.07797569895010027, - "grad_norm": 0.828125, + "epoch": 0.0071959419605992685, + "grad_norm": 0.87109375, "learning_rate": 7.542604657419268e-05, - "loss": 1.7994, + "loss": 1.5318, "num_input_tokens_seen": 1959395328, "step": 1661 }, { - "epoch": 0.07809366521174944, - "grad_norm": 0.703125, + "epoch": 0.007313908222248437, + "grad_norm": 0.80859375, "learning_rate": 7.541879877255915e-05, - "loss": 1.7345, + "loss": 1.6062, "num_input_tokens_seen": 1960574976, "step": 1662 }, { - "epoch": 0.0782116314733986, - "grad_norm": 0.8984375, + "epoch": 0.007431874483897605, + "grad_norm": 0.8125, "learning_rate": 7.541154558189914e-05, - "loss": 1.6323, + "loss": 1.5415, "num_input_tokens_seen": 1961754624, "step": 1663 }, { - "epoch": 0.07832959773504777, + "epoch": 0.007549840745546774, "grad_norm": 0.80078125, "learning_rate": 7.540428700331625e-05, - "loss": 1.5545, + "loss": 1.4204, "num_input_tokens_seen": 1962934272, "step": 1664 }, { - "epoch": 0.07844756399669695, - "grad_norm": 0.74609375, + "epoch": 0.007667807007195942, + "grad_norm": 0.76171875, "learning_rate": 7.539702303791486e-05, - "loss": 1.734, + "loss": 1.5433, "num_input_tokens_seen": 1964113920, "step": 1665 }, { - "epoch": 0.07856553025834612, - "grad_norm": 0.8203125, + "epoch": 0.00778577326884511, + "grad_norm": 0.80859375, "learning_rate": 7.53897536868002e-05, - "loss": 1.7101, + "loss": 1.6512, "num_input_tokens_seen": 1965293568, "step": 1666 }, { - "epoch": 0.07868349651999527, - "grad_norm": 0.71875, + "epoch": 0.007903739530494279, + "grad_norm": 0.81640625, "learning_rate": 7.538247895107835e-05, - "loss": 1.6985, + "loss": 1.5139, "num_input_tokens_seen": 1966473216, "step": 1667 }, { - "epoch": 0.07880146278164445, - "grad_norm": 0.80859375, + "epoch": 0.008021705792143447, + "grad_norm": 0.74609375, "learning_rate": 7.53751988318561e-05, - "loss": 1.6059, + "loss": 1.7058, "num_input_tokens_seen": 1967652864, "step": 1668 }, { - "epoch": 0.07891942904329362, - "grad_norm": 0.79296875, + "epoch": 0.008139672053792615, + "grad_norm": 0.828125, "learning_rate": 7.53679133302412e-05, - "loss": 1.6212, + "loss": 1.5792, "num_input_tokens_seen": 1968832512, "step": 1669 }, { - "epoch": 0.07903739530494279, - "grad_norm": 0.80078125, + "epoch": 0.008257638315441783, + "grad_norm": 0.8046875, "learning_rate": 7.536062244734212e-05, - "loss": 1.7851, + "loss": 1.4524, "num_input_tokens_seen": 1970012160, "step": 1670 }, { - "epoch": 0.07915536156659196, - "grad_norm": 0.81640625, + "epoch": 0.008375604577090951, + "grad_norm": 0.75390625, "learning_rate": 7.535332618426816e-05, - "loss": 1.5273, + "loss": 1.6209, "num_input_tokens_seen": 1971191808, "step": 1671 }, { - "epoch": 0.07927332782824112, - "grad_norm": 0.67578125, + "epoch": 0.00849357083874012, + "grad_norm": 0.71484375, "learning_rate": 7.53460245421295e-05, - "loss": 1.5931, + "loss": 1.5364, "num_input_tokens_seen": 1972371456, "step": 1672 }, { - "epoch": 0.07939129408989029, - "grad_norm": 0.90625, + "epoch": 0.008611537100389288, + "grad_norm": 0.734375, "learning_rate": 7.533871752203708e-05, - "loss": 1.8014, + "loss": 1.6611, "num_input_tokens_seen": 1973551104, "step": 1673 }, { - "epoch": 0.07950926035153946, - "grad_norm": 0.8046875, + "epoch": 0.008729503362038458, + "grad_norm": 0.82421875, "learning_rate": 7.533140512510267e-05, - "loss": 1.7501, + "loss": 1.5247, "num_input_tokens_seen": 1974730752, "step": 1674 }, { - "epoch": 0.07962722661318863, - "grad_norm": 0.76953125, + "epoch": 0.008847469623687626, + "grad_norm": 0.703125, "learning_rate": 7.532408735243887e-05, - "loss": 1.7142, + "loss": 1.6105, "num_input_tokens_seen": 1975910400, "step": 1675 }, { - "epoch": 0.07974519287483779, - "grad_norm": 0.76953125, + "epoch": 0.008965435885336794, + "grad_norm": 0.71484375, "learning_rate": 7.531676420515908e-05, - "loss": 1.6984, + "loss": 1.621, "num_input_tokens_seen": 1977090048, "step": 1676 }, { - "epoch": 0.07986315913648696, - "grad_norm": 0.81640625, + "epoch": 0.009083402146985962, + "grad_norm": 0.71484375, "learning_rate": 7.530943568437753e-05, - "loss": 1.6684, + "loss": 1.5137, "num_input_tokens_seen": 1978269696, "step": 1677 }, { - "epoch": 0.07998112539813614, - "grad_norm": 0.6796875, + "epoch": 0.00920136840863513, + "grad_norm": 0.7265625, "learning_rate": 7.530210179120927e-05, - "loss": 1.7934, + "loss": 1.4861, "num_input_tokens_seen": 1979449344, "step": 1678 }, { - "epoch": 0.08009909165978531, - "grad_norm": 0.765625, + "epoch": 0.009319334670284299, + "grad_norm": 0.74609375, "learning_rate": 7.529476252677016e-05, - "loss": 1.6007, + "loss": 1.6147, "num_input_tokens_seen": 1980628992, "step": 1679 }, { - "epoch": 0.08021705792143446, - "grad_norm": 0.73046875, + "epoch": 0.009437300931933467, + "grad_norm": 0.8046875, "learning_rate": 7.528741789217692e-05, - "loss": 1.7508, + "loss": 1.506, "num_input_tokens_seen": 1981808640, "step": 1680 }, { - "epoch": 0.08033502418308364, - "grad_norm": 0.875, - "learning_rate": 7.528006788854697e-05, - "loss": 1.5685, + "epoch": 0.009555267193582635, + "grad_norm": 0.73046875, + "learning_rate": 7.528006788854697e-05, + "loss": 1.5949, "num_input_tokens_seen": 1982988288, "step": 1681 }, { - "epoch": 0.08045299044473281, - "grad_norm": 0.734375, + "epoch": 0.009673233455231803, + "grad_norm": 0.79296875, "learning_rate": 7.527271251699867e-05, - "loss": 1.732, + "loss": 1.5086, "num_input_tokens_seen": 1984167936, "step": 1682 }, { - "epoch": 0.08057095670638198, - "grad_norm": 0.77734375, + "epoch": 0.009791199716880972, + "grad_norm": 0.79296875, "learning_rate": 7.526535177865118e-05, - "loss": 1.6582, + "loss": 1.5301, "num_input_tokens_seen": 1985347584, "step": 1683 }, { - "epoch": 0.08068892296803114, - "grad_norm": 0.7421875, + "epoch": 0.00990916597853014, + "grad_norm": 0.76953125, "learning_rate": 7.525798567462439e-05, - "loss": 1.6176, + "loss": 1.6647, "num_input_tokens_seen": 1986527232, "step": 1684 }, { - "epoch": 0.08080688922968031, - "grad_norm": 0.7265625, + "epoch": 0.010027132240179308, + "grad_norm": 0.77734375, "learning_rate": 7.525061420603911e-05, - "loss": 1.6954, + "loss": 1.6048, "num_input_tokens_seen": 1987706880, "step": 1685 }, { - "epoch": 0.08092485549132948, - "grad_norm": 0.76953125, + "epoch": 0.010145098501828476, + "grad_norm": 0.87109375, "learning_rate": 7.524323737401688e-05, - "loss": 1.7177, + "loss": 1.7124, "num_input_tokens_seen": 1988886528, "step": 1686 }, { - "epoch": 0.08104282175297865, - "grad_norm": 0.703125, + "epoch": 0.010263064763477646, + "grad_norm": 0.765625, "learning_rate": 7.523585517968013e-05, - "loss": 1.6539, + "loss": 1.6371, "num_input_tokens_seen": 1990066176, "step": 1687 }, { - "epoch": 0.08116078801462781, - "grad_norm": 0.796875, + "epoch": 0.010381031025126814, + "grad_norm": 0.79296875, "learning_rate": 7.522846762415207e-05, - "loss": 1.6314, + "loss": 1.5074, "num_input_tokens_seen": 1991245824, "step": 1688 }, { - "epoch": 0.08127875427627698, - "grad_norm": 0.76171875, + "epoch": 0.010498997286775983, + "grad_norm": 0.734375, "learning_rate": 7.52210747085567e-05, - "loss": 1.6346, + "loss": 1.5505, "num_input_tokens_seen": 1992425472, "step": 1689 }, { - "epoch": 0.08139672053792615, - "grad_norm": 0.80078125, + "epoch": 0.010616963548425151, + "grad_norm": 0.81640625, "learning_rate": 7.521367643401889e-05, - "loss": 1.5953, + "loss": 1.5206, "num_input_tokens_seen": 1993605120, "step": 1690 }, { - "epoch": 0.08151468679957533, - "grad_norm": 0.6953125, + "epoch": 0.010734929810074319, + "grad_norm": 0.66796875, "learning_rate": 7.52062728016643e-05, - "loss": 1.7221, + "loss": 1.6137, "num_input_tokens_seen": 1994784768, "step": 1691 }, { - "epoch": 0.08163265306122448, - "grad_norm": 0.6796875, + "epoch": 0.010852896071723487, + "grad_norm": 0.765625, "learning_rate": 7.519886381261938e-05, - "loss": 1.7315, + "loss": 1.5537, "num_input_tokens_seen": 1995964416, "step": 1692 }, { - "epoch": 0.08175061932287365, - "grad_norm": 0.6171875, + "epoch": 0.010970862333372655, + "grad_norm": 0.74609375, "learning_rate": 7.519144946801145e-05, - "loss": 1.8649, + "loss": 1.5487, "num_input_tokens_seen": 1997144064, "step": 1693 }, { - "epoch": 0.08186858558452283, - "grad_norm": 0.79296875, + "epoch": 0.011088828595021824, + "grad_norm": 0.80078125, "learning_rate": 7.518402976896861e-05, - "loss": 1.5899, + "loss": 1.5808, "num_input_tokens_seen": 1998323712, "step": 1694 }, { - "epoch": 0.081986551846172, - "grad_norm": 0.76171875, + "epoch": 0.011206794856670992, + "grad_norm": 0.7890625, "learning_rate": 7.517660471661976e-05, - "loss": 1.6901, + "loss": 1.5318, "num_input_tokens_seen": 1999503360, "step": 1695 }, { - "epoch": 0.08210451810782117, - "grad_norm": 0.69140625, + "epoch": 0.01132476111832016, + "grad_norm": 0.703125, "learning_rate": 7.516917431209462e-05, - "loss": 1.7991, + "loss": 1.6044, "num_input_tokens_seen": 2000683008, "step": 1696 }, { - "epoch": 0.08222248436947033, - "grad_norm": 0.75, + "epoch": 0.011442727379969328, + "grad_norm": 0.72265625, "learning_rate": 7.51617385565238e-05, - "loss": 1.6787, + "loss": 1.6691, "num_input_tokens_seen": 2001862656, "step": 1697 }, { - "epoch": 0.0823404506311195, - "grad_norm": 0.7421875, + "epoch": 0.011560693641618497, + "grad_norm": 0.8359375, "learning_rate": 7.51542974510386e-05, - "loss": 1.6084, + "loss": 1.4109, "num_input_tokens_seen": 2003042304, "step": 1698 }, { - "epoch": 0.08245841689276867, - "grad_norm": 0.76171875, + "epoch": 0.011678659903267665, + "grad_norm": 0.8359375, "learning_rate": 7.514685099677122e-05, - "loss": 1.667, + "loss": 1.5029, "num_input_tokens_seen": 2004221952, "step": 1699 }, { - "epoch": 0.08257638315441784, - "grad_norm": 0.671875, + "epoch": 0.011796626164916833, + "grad_norm": 0.85546875, "learning_rate": 7.513939919485466e-05, - "loss": 1.6747, + "loss": 1.5524, "num_input_tokens_seen": 2005401600, "step": 1700 }, { - "epoch": 0.082694349416067, - "grad_norm": 0.6953125, + "epoch": 0.011914592426566003, + "grad_norm": 0.82421875, "learning_rate": 7.51319420464227e-05, - "loss": 1.7835, + "loss": 1.4823, "num_input_tokens_seen": 2006581248, "step": 1701 }, { - "epoch": 0.08281231567771617, - "grad_norm": 0.7890625, + "epoch": 0.012032558688215171, + "grad_norm": 0.75390625, "learning_rate": 7.512447955260998e-05, - "loss": 1.6754, + "loss": 1.66, "num_input_tokens_seen": 2007760896, "step": 1702 }, { - "epoch": 0.08293028193936534, - "grad_norm": 0.6953125, + "epoch": 0.01215052494986434, + "grad_norm": 0.80078125, "learning_rate": 7.51170117145519e-05, - "loss": 1.7063, + "loss": 1.6474, "num_input_tokens_seen": 2008940544, "step": 1703 }, { - "epoch": 0.08304824820101452, - "grad_norm": 0.640625, + "epoch": 0.012268491211513508, + "grad_norm": 0.6875, "learning_rate": 7.510953853338474e-05, - "loss": 1.7616, + "loss": 1.6331, "num_input_tokens_seen": 2010120192, "step": 1704 }, { - "epoch": 0.08316621446266367, - "grad_norm": 0.87890625, + "epoch": 0.012386457473162676, + "grad_norm": 0.79296875, "learning_rate": 7.510206001024554e-05, - "loss": 1.5359, + "loss": 1.7039, "num_input_tokens_seen": 2011299840, "step": 1705 }, { - "epoch": 0.08328418072431285, - "grad_norm": 0.7265625, + "epoch": 0.012504423734811844, + "grad_norm": 0.765625, "learning_rate": 7.509457614627217e-05, - "loss": 1.5811, + "loss": 1.5544, "num_input_tokens_seen": 2012479488, "step": 1706 }, { - "epoch": 0.08340214698596202, - "grad_norm": 0.70703125, + "epoch": 0.012622389996461012, + "grad_norm": 0.765625, "learning_rate": 7.50870869426033e-05, - "loss": 1.7884, + "loss": 1.5711, "num_input_tokens_seen": 2013659136, "step": 1707 }, { - "epoch": 0.08352011324761119, - "grad_norm": 0.69140625, + "epoch": 0.01274035625811018, + "grad_norm": 0.77734375, "learning_rate": 7.507959240037844e-05, - "loss": 1.6839, + "loss": 1.5601, "num_input_tokens_seen": 2014838784, "step": 1708 }, { - "epoch": 0.08363807950926035, - "grad_norm": 0.72265625, + "epoch": 0.012858322519759349, + "grad_norm": 0.70703125, "learning_rate": 7.50720925207379e-05, - "loss": 1.7354, + "loss": 1.7537, "num_input_tokens_seen": 2016018432, "step": 1709 }, { - "epoch": 0.08375604577090952, - "grad_norm": 0.72265625, + "epoch": 0.012976288781408517, + "grad_norm": 0.7421875, "learning_rate": 7.506458730482277e-05, - "loss": 1.7445, + "loss": 1.5504, "num_input_tokens_seen": 2017198080, "step": 1710 }, { - "epoch": 0.08387401203255869, - "grad_norm": 0.71875, + "epoch": 0.013094255043057685, + "grad_norm": 0.71484375, "learning_rate": 7.505707675377502e-05, - "loss": 1.7734, + "loss": 1.6148, "num_input_tokens_seen": 2018377728, "step": 1711 }, { - "epoch": 0.08399197829420786, - "grad_norm": 0.81640625, + "epoch": 0.013212221304706853, + "grad_norm": 0.71484375, "learning_rate": 7.504956086873735e-05, - "loss": 1.5439, + "loss": 1.541, "num_input_tokens_seen": 2019557376, "step": 1712 }, { - "epoch": 0.08410994455585702, - "grad_norm": 0.76171875, + "epoch": 0.013330187566356021, + "grad_norm": 0.765625, "learning_rate": 7.504203965085335e-05, - "loss": 1.6082, + "loss": 1.6243, "num_input_tokens_seen": 2020737024, "step": 1713 }, { - "epoch": 0.08422791081750619, - "grad_norm": 0.9375, + "epoch": 0.013448153828005191, + "grad_norm": 0.796875, "learning_rate": 7.503451310126738e-05, - "loss": 1.716, + "loss": 1.4932, "num_input_tokens_seen": 2021916672, "step": 1714 }, { - "epoch": 0.08434587707915536, - "grad_norm": 0.6953125, + "epoch": 0.01356612008965436, + "grad_norm": 0.765625, "learning_rate": 7.50269812211246e-05, - "loss": 1.6431, + "loss": 1.7139, "num_input_tokens_seen": 2023096320, "step": 1715 }, { - "epoch": 0.08446384334080453, - "grad_norm": 0.8984375, + "epoch": 0.013684086351303528, + "grad_norm": 0.8515625, "learning_rate": 7.5019444011571e-05, - "loss": 1.5986, + "loss": 1.5194, "num_input_tokens_seen": 2024275968, "step": 1716 }, { - "epoch": 0.08458180960245369, - "grad_norm": 0.77734375, + "epoch": 0.013802052612952696, + "grad_norm": 0.76171875, "learning_rate": 7.501190147375338e-05, - "loss": 1.6777, + "loss": 1.6824, "num_input_tokens_seen": 2025455616, "step": 1717 }, { - "epoch": 0.08469977586410286, - "grad_norm": 0.81640625, + "epoch": 0.013920018874601864, + "grad_norm": 0.73046875, "learning_rate": 7.500435360881937e-05, - "loss": 1.5908, + "loss": 1.6044, "num_input_tokens_seen": 2026635264, "step": 1718 }, { - "epoch": 0.08481774212575204, - "grad_norm": 0.76953125, + "epoch": 0.014037985136251032, + "grad_norm": 0.66796875, "learning_rate": 7.499680041791737e-05, - "loss": 1.7021, + "loss": 1.5265, "num_input_tokens_seen": 2027814912, "step": 1719 }, { - "epoch": 0.08493570838740121, - "grad_norm": 0.78515625, + "epoch": 0.0141559513979002, + "grad_norm": 0.82421875, "learning_rate": 7.49892419021966e-05, - "loss": 1.5912, + "loss": 1.5371, "num_input_tokens_seen": 2028994560, "step": 1720 }, { - "epoch": 0.08505367464905036, - "grad_norm": 0.76171875, + "epoch": 0.014273917659549369, + "grad_norm": 0.7109375, "learning_rate": 7.498167806280712e-05, - "loss": 1.6427, + "loss": 1.5835, "num_input_tokens_seen": 2030174208, "step": 1721 }, { - "epoch": 0.08517164091069954, - "grad_norm": 0.71484375, + "epoch": 0.014391883921198537, + "grad_norm": 0.7109375, "learning_rate": 7.497410890089976e-05, - "loss": 1.6574, + "loss": 1.5378, "num_input_tokens_seen": 2031353856, "step": 1722 }, { - "epoch": 0.08528960717234871, - "grad_norm": 0.765625, + "epoch": 0.014509850182847705, + "grad_norm": 0.69140625, "learning_rate": 7.496653441762621e-05, - "loss": 1.6093, + "loss": 1.6016, "num_input_tokens_seen": 2032533504, "step": 1723 }, { - "epoch": 0.08540757343399788, - "grad_norm": 0.73828125, + "epoch": 0.014627816444496873, + "grad_norm": 0.7421875, "learning_rate": 7.495895461413891e-05, - "loss": 1.6188, + "loss": 1.5343, "num_input_tokens_seen": 2033713152, "step": 1724 }, { - "epoch": 0.08552553969564705, - "grad_norm": 0.8359375, + "epoch": 0.014745782706146042, + "grad_norm": 0.71484375, "learning_rate": 7.495136949159117e-05, - "loss": 1.4764, + "loss": 1.6498, "num_input_tokens_seen": 2034892800, "step": 1725 }, { - "epoch": 0.08564350595729621, - "grad_norm": 0.8046875, + "epoch": 0.01486374896779521, + "grad_norm": 0.80859375, "learning_rate": 7.494377905113704e-05, - "loss": 1.6112, + "loss": 1.4332, "num_input_tokens_seen": 2036072448, "step": 1726 }, { - "epoch": 0.08576147221894538, - "grad_norm": 0.88671875, + "epoch": 0.014981715229444378, + "grad_norm": 0.66796875, "learning_rate": 7.493618329393145e-05, - "loss": 1.6492, + "loss": 1.6878, "num_input_tokens_seen": 2037252096, "step": 1727 }, { - "epoch": 0.08587943848059455, - "grad_norm": 0.8515625, + "epoch": 0.015099681491093548, + "grad_norm": 0.69921875, "learning_rate": 7.49285822211301e-05, - "loss": 1.5195, + "loss": 1.557, "num_input_tokens_seen": 2038431744, "step": 1728 }, { - "epoch": 0.08599740474224372, - "grad_norm": 0.78125, + "epoch": 0.015217647752742716, + "grad_norm": 0.7109375, "learning_rate": 7.492097583388949e-05, - "loss": 1.6065, + "loss": 1.6027, "num_input_tokens_seen": 2039611392, "step": 1729 }, { - "epoch": 0.08611537100389288, - "grad_norm": 0.90234375, + "epoch": 0.015335614014391884, + "grad_norm": 0.73828125, "learning_rate": 7.491336413336695e-05, - "loss": 1.6643, + "loss": 1.5635, "num_input_tokens_seen": 2040791040, "step": 1730 }, { - "epoch": 0.08623333726554205, - "grad_norm": 0.703125, + "epoch": 0.015453580276041053, + "grad_norm": 0.7890625, "learning_rate": 7.490574712072063e-05, - "loss": 1.6163, + "loss": 1.5244, "num_input_tokens_seen": 2041970688, "step": 1731 }, { - "epoch": 0.08635130352719123, - "grad_norm": 0.7890625, + "epoch": 0.01557154653769022, + "grad_norm": 0.75, "learning_rate": 7.489812479710944e-05, - "loss": 1.5303, + "loss": 1.4318, "num_input_tokens_seen": 2043150336, "step": 1732 }, { - "epoch": 0.0864692697888404, - "grad_norm": 0.86328125, + "epoch": 0.01568951279933939, + "grad_norm": 0.7265625, "learning_rate": 7.489049716369316e-05, - "loss": 1.5475, + "loss": 1.5501, "num_input_tokens_seen": 2044329984, "step": 1733 }, { - "epoch": 0.08658723605048955, - "grad_norm": 0.6875, + "epoch": 0.015807479060988557, + "grad_norm": 0.62890625, "learning_rate": 7.488286422163232e-05, - "loss": 1.6679, + "loss": 1.6179, "num_input_tokens_seen": 2045509632, "step": 1734 }, { - "epoch": 0.08670520231213873, - "grad_norm": 0.8359375, + "epoch": 0.015925445322637725, + "grad_norm": 0.6796875, "learning_rate": 7.48752259720883e-05, - "loss": 1.5547, + "loss": 1.5711, "num_input_tokens_seen": 2046689280, "step": 1735 }, { - "epoch": 0.0868231685737879, - "grad_norm": 0.70703125, + "epoch": 0.016043411584286894, + "grad_norm": 0.71875, "learning_rate": 7.486758241622327e-05, - "loss": 1.7041, + "loss": 1.5267, "num_input_tokens_seen": 2047868928, "step": 1736 }, { - "epoch": 0.08694113483543707, - "grad_norm": 0.7734375, + "epoch": 0.016161377845936062, + "grad_norm": 0.72265625, "learning_rate": 7.48599335552002e-05, - "loss": 1.7112, + "loss": 1.622, "num_input_tokens_seen": 2049048576, "step": 1737 }, { - "epoch": 0.08705910109708623, - "grad_norm": 0.625, + "epoch": 0.01627934410758523, + "grad_norm": 0.98828125, "learning_rate": 7.485227939018287e-05, - "loss": 1.8023, + "loss": 1.5862, "num_input_tokens_seen": 2050228224, "step": 1738 }, { - "epoch": 0.0871770673587354, - "grad_norm": 0.859375, + "epoch": 0.0163973103692344, + "grad_norm": 0.78125, "learning_rate": 7.484461992233592e-05, - "loss": 1.5649, + "loss": 1.5703, "num_input_tokens_seen": 2051407872, "step": 1739 }, { - "epoch": 0.08729503362038457, - "grad_norm": 0.78125, + "epoch": 0.016515276630883566, + "grad_norm": 0.80078125, "learning_rate": 7.48369551528247e-05, - "loss": 1.6138, + "loss": 1.6746, "num_input_tokens_seen": 2052587520, "step": 1740 }, { - "epoch": 0.08741299988203374, - "grad_norm": 0.69140625, + "epoch": 0.016633242892532735, + "grad_norm": 0.85546875, "learning_rate": 7.482928508281543e-05, - "loss": 1.7319, + "loss": 1.4853, "num_input_tokens_seen": 2053767168, "step": 1741 }, { - "epoch": 0.0875309661436829, - "grad_norm": 0.77734375, + "epoch": 0.016751209154181903, + "grad_norm": 0.7265625, "learning_rate": 7.482160971347514e-05, - "loss": 1.6588, + "loss": 1.5609, "num_input_tokens_seen": 2054946816, "step": 1742 }, { - "epoch": 0.08764893240533207, - "grad_norm": 0.7265625, + "epoch": 0.01686917541583107, + "grad_norm": 0.796875, "learning_rate": 7.481392904597161e-05, - "loss": 1.5877, + "loss": 1.4062, "num_input_tokens_seen": 2056126464, "step": 1743 }, { - "epoch": 0.08776689866698124, - "grad_norm": 0.734375, + "epoch": 0.01698714167748024, + "grad_norm": 0.6796875, "learning_rate": 7.480624308147352e-05, - "loss": 1.6643, + "loss": 1.6638, "num_input_tokens_seen": 2057306112, "step": 1744 }, { - "epoch": 0.08788486492863042, - "grad_norm": 0.640625, + "epoch": 0.017105107939129408, + "grad_norm": 0.84375, "learning_rate": 7.479855182115027e-05, - "loss": 1.6394, + "loss": 1.4458, "num_input_tokens_seen": 2058485760, "step": 1745 }, { - "epoch": 0.08800283119027957, - "grad_norm": 0.71484375, + "epoch": 0.017223074200778576, + "grad_norm": 0.65625, "learning_rate": 7.479085526617209e-05, - "loss": 1.5986, + "loss": 1.7725, "num_input_tokens_seen": 2059665408, "step": 1746 }, { - "epoch": 0.08812079745192875, - "grad_norm": 0.6875, + "epoch": 0.017341040462427744, + "grad_norm": 0.80078125, "learning_rate": 7.478315341771003e-05, - "loss": 1.7305, + "loss": 1.5117, "num_input_tokens_seen": 2060845056, "step": 1747 }, { - "epoch": 0.08823876371357792, - "grad_norm": 0.84375, + "epoch": 0.017459006724076916, + "grad_norm": 0.68359375, "learning_rate": 7.477544627693595e-05, - "loss": 1.8422, + "loss": 1.5183, "num_input_tokens_seen": 2062024704, "step": 1748 }, { - "epoch": 0.08835672997522709, - "grad_norm": 0.703125, + "epoch": 0.017576972985726084, + "grad_norm": 0.828125, "learning_rate": 7.476773384502251e-05, - "loss": 1.6941, + "loss": 1.5633, "num_input_tokens_seen": 2063204352, "step": 1749 }, { - "epoch": 0.08847469623687626, - "grad_norm": 0.6953125, + "epoch": 0.017694939247375252, + "grad_norm": 0.66796875, "learning_rate": 7.476001612314314e-05, - "loss": 1.8083, + "loss": 1.5814, "num_input_tokens_seen": 2064384000, "step": 1750 }, { - "epoch": 0.08859266249852542, - "grad_norm": 0.828125, + "epoch": 0.01781290550902442, + "grad_norm": 0.80859375, "learning_rate": 7.475229311247212e-05, - "loss": 1.599, + "loss": 1.5078, "num_input_tokens_seen": 2065563648, "step": 1751 }, { - "epoch": 0.08871062876017459, - "grad_norm": 0.66796875, + "epoch": 0.01793087177067359, + "grad_norm": 0.7421875, "learning_rate": 7.474456481418452e-05, - "loss": 1.6906, + "loss": 1.5594, "num_input_tokens_seen": 2066743296, "step": 1752 }, { - "epoch": 0.08882859502182376, - "grad_norm": 0.87890625, + "epoch": 0.018048838032322757, + "grad_norm": 0.83203125, "learning_rate": 7.473683122945622e-05, - "loss": 1.5989, + "loss": 1.5369, "num_input_tokens_seen": 2067922944, "step": 1753 }, { - "epoch": 0.08894656128347293, - "grad_norm": 0.72265625, + "epoch": 0.018166804293971925, + "grad_norm": 0.69921875, "learning_rate": 7.472909235946388e-05, - "loss": 1.6321, + "loss": 1.5476, "num_input_tokens_seen": 2069102592, "step": 1754 }, { - "epoch": 0.08906452754512209, - "grad_norm": 0.796875, + "epoch": 0.018284770555621093, + "grad_norm": 0.71875, "learning_rate": 7.472134820538498e-05, - "loss": 1.542, + "loss": 1.525, "num_input_tokens_seen": 2070282240, "step": 1755 }, { - "epoch": 0.08918249380677126, - "grad_norm": 0.71875, + "epoch": 0.01840273681727026, + "grad_norm": 0.75, "learning_rate": 7.471359876839782e-05, - "loss": 1.6988, + "loss": 1.6491, "num_input_tokens_seen": 2071461888, "step": 1756 }, { - "epoch": 0.08930046006842043, - "grad_norm": 0.8125, + "epoch": 0.01852070307891943, + "grad_norm": 0.7109375, "learning_rate": 7.470584404968149e-05, - "loss": 1.6583, + "loss": 1.5732, "num_input_tokens_seen": 2072641536, "step": 1757 }, { - "epoch": 0.0894184263300696, - "grad_norm": 0.76953125, + "epoch": 0.018638669340568598, + "grad_norm": 0.71875, "learning_rate": 7.469808405041587e-05, - "loss": 1.6195, + "loss": 1.5811, "num_input_tokens_seen": 2073821184, "step": 1758 }, { - "epoch": 0.08953639259171876, - "grad_norm": 0.76171875, + "epoch": 0.018756635602217766, + "grad_norm": 0.65625, "learning_rate": 7.469031877178167e-05, - "loss": 1.8069, + "loss": 1.6869, "num_input_tokens_seen": 2075000832, "step": 1759 }, { - "epoch": 0.08965435885336794, - "grad_norm": 0.875, + "epoch": 0.018874601863866934, + "grad_norm": 0.64453125, "learning_rate": 7.468254821496037e-05, - "loss": 1.5791, + "loss": 1.7133, "num_input_tokens_seen": 2076180480, "step": 1760 }, { - "epoch": 0.08977232511501711, - "grad_norm": 0.7109375, + "epoch": 0.018992568125516102, + "grad_norm": 0.6953125, "learning_rate": 7.46747723811343e-05, - "loss": 1.6165, + "loss": 1.5979, "num_input_tokens_seen": 2077360128, "step": 1761 }, { - "epoch": 0.08989029137666628, - "grad_norm": 0.84375, + "epoch": 0.01911053438716527, + "grad_norm": 0.64453125, "learning_rate": 7.466699127148653e-05, - "loss": 1.7364, + "loss": 1.6148, "num_input_tokens_seen": 2078539776, "step": 1762 }, { - "epoch": 0.09000825763831544, - "grad_norm": 0.73046875, + "epoch": 0.01922850064881444, + "grad_norm": 0.67578125, "learning_rate": 7.465920488720099e-05, - "loss": 1.7297, + "loss": 1.5162, "num_input_tokens_seen": 2079719424, "step": 1763 }, { - "epoch": 0.09012622389996461, - "grad_norm": 0.77734375, + "epoch": 0.019346466910463607, + "grad_norm": 0.7265625, "learning_rate": 7.465141322946241e-05, - "loss": 1.6437, + "loss": 1.4266, "num_input_tokens_seen": 2080899072, "step": 1764 }, { - "epoch": 0.09024419016161378, - "grad_norm": 0.71875, + "epoch": 0.019464433172112775, + "grad_norm": 0.703125, "learning_rate": 7.464361629945625e-05, - "loss": 1.7119, + "loss": 1.6248, "num_input_tokens_seen": 2082078720, "step": 1765 }, { - "epoch": 0.09036215642326295, - "grad_norm": 0.69921875, + "epoch": 0.019582399433761943, + "grad_norm": 0.70703125, "learning_rate": 7.463581409836888e-05, - "loss": 1.8534, + "loss": 1.4722, "num_input_tokens_seen": 2083258368, "step": 1766 }, { - "epoch": 0.09048012268491211, - "grad_norm": 0.796875, + "epoch": 0.01970036569541111, + "grad_norm": 0.73046875, "learning_rate": 7.462800662738739e-05, - "loss": 1.629, + "loss": 1.5744, "num_input_tokens_seen": 2084438016, "step": 1767 }, { - "epoch": 0.09059808894656128, - "grad_norm": 0.67578125, + "epoch": 0.01981833195706028, + "grad_norm": 0.703125, "learning_rate": 7.46201938876997e-05, - "loss": 1.7401, + "loss": 1.5756, "num_input_tokens_seen": 2085617664, "step": 1768 }, { - "epoch": 0.09071605520821045, - "grad_norm": 0.70703125, + "epoch": 0.019936298218709448, + "grad_norm": 0.68359375, "learning_rate": 7.461237588049453e-05, - "loss": 1.6565, + "loss": 1.4826, "num_input_tokens_seen": 2086797312, "step": 1769 }, { - "epoch": 0.09083402146985962, - "grad_norm": 0.80078125, + "epoch": 0.020054264480358616, + "grad_norm": 0.74609375, "learning_rate": 7.460455260696142e-05, - "loss": 1.5342, + "loss": 1.7039, "num_input_tokens_seen": 2087976960, "step": 1770 }, { - "epoch": 0.09095198773150878, - "grad_norm": 0.73828125, + "epoch": 0.020172230742007784, + "grad_norm": 0.75390625, "learning_rate": 7.459672406829068e-05, - "loss": 1.67, + "loss": 1.4206, "num_input_tokens_seen": 2089156608, "step": 1771 }, { - "epoch": 0.09106995399315795, - "grad_norm": 0.7109375, + "epoch": 0.020290197003656953, + "grad_norm": 0.71875, "learning_rate": 7.458889026567342e-05, - "loss": 1.7059, + "loss": 1.6331, "num_input_tokens_seen": 2090336256, "step": 1772 }, { - "epoch": 0.09118792025480713, - "grad_norm": 0.72265625, + "epoch": 0.02040816326530612, + "grad_norm": 0.7421875, "learning_rate": 7.458105120030159e-05, - "loss": 1.679, + "loss": 1.5279, "num_input_tokens_seen": 2091515904, "step": 1773 }, { - "epoch": 0.0913058865164563, + "epoch": 0.020526129526955292, "grad_norm": 0.6640625, "learning_rate": 7.457320687336792e-05, - "loss": 1.7183, + "loss": 1.6853, "num_input_tokens_seen": 2092695552, "step": 1774 }, { - "epoch": 0.09142385277810545, - "grad_norm": 0.73828125, + "epoch": 0.02064409578860446, + "grad_norm": 0.80859375, "learning_rate": 7.45653572860659e-05, - "loss": 1.6219, + "loss": 1.5579, "num_input_tokens_seen": 2093875200, "step": 1775 }, { - "epoch": 0.09154181903975463, - "grad_norm": 0.63671875, + "epoch": 0.02076206205025363, + "grad_norm": 0.6953125, "learning_rate": 7.455750243958989e-05, - "loss": 1.7332, + "loss": 1.6844, "num_input_tokens_seen": 2095054848, "step": 1776 }, { - "epoch": 0.0916597853014038, - "grad_norm": 0.8359375, + "epoch": 0.020880028311902797, + "grad_norm": 0.76953125, "learning_rate": 7.4549642335135e-05, - "loss": 1.4434, + "loss": 1.6191, "num_input_tokens_seen": 2096234496, "step": 1777 }, { - "epoch": 0.09177775156305297, - "grad_norm": 0.68359375, + "epoch": 0.020997994573551965, + "grad_norm": 0.83984375, "learning_rate": 7.454177697389717e-05, - "loss": 1.6202, + "loss": 1.5507, "num_input_tokens_seen": 2097414144, "step": 1778 }, { - "epoch": 0.09189571782470214, - "grad_norm": 0.7734375, + "epoch": 0.021115960835201134, + "grad_norm": 0.82421875, "learning_rate": 7.453390635707312e-05, - "loss": 1.7762, + "loss": 1.4706, "num_input_tokens_seen": 2098593792, "step": 1779 }, { - "epoch": 0.0920136840863513, - "grad_norm": 0.70703125, + "epoch": 0.021233927096850302, + "grad_norm": 0.6484375, "learning_rate": 7.452603048586039e-05, - "loss": 1.6236, + "loss": 1.5721, "num_input_tokens_seen": 2099773440, "step": 1780 }, { - "epoch": 0.09213165034800047, - "grad_norm": 0.734375, + "epoch": 0.02135189335849947, + "grad_norm": 0.8671875, "learning_rate": 7.451814936145728e-05, - "loss": 1.5959, + "loss": 1.344, "num_input_tokens_seen": 2100953088, "step": 1781 }, { - "epoch": 0.09224961660964964, - "grad_norm": 0.72265625, + "epoch": 0.021469859620148638, + "grad_norm": 0.6953125, "learning_rate": 7.451026298506294e-05, - "loss": 1.6744, + "loss": 1.5718, "num_input_tokens_seen": 2102132736, "step": 1782 }, { - "epoch": 0.09236758287129881, - "grad_norm": 0.71484375, + "epoch": 0.021587825881797806, + "grad_norm": 0.79296875, "learning_rate": 7.450237135787728e-05, - "loss": 1.6593, + "loss": 1.5887, "num_input_tokens_seen": 2103312384, "step": 1783 }, { - "epoch": 0.09248554913294797, - "grad_norm": 0.68359375, + "epoch": 0.021705792143446975, + "grad_norm": 0.77734375, "learning_rate": 7.449447448110104e-05, - "loss": 1.5661, + "loss": 1.5317, "num_input_tokens_seen": 2104492032, "step": 1784 }, { - "epoch": 0.09260351539459714, - "grad_norm": 0.71484375, + "epoch": 0.021823758405096143, + "grad_norm": 0.78515625, "learning_rate": 7.448657235593572e-05, - "loss": 1.7246, + "loss": 1.645, "num_input_tokens_seen": 2105671680, "step": 1785 }, { - "epoch": 0.09272148165624632, - "grad_norm": 0.75, + "epoch": 0.02194172466674531, + "grad_norm": 0.76953125, "learning_rate": 7.447866498358366e-05, - "loss": 1.6043, + "loss": 1.533, "num_input_tokens_seen": 2106851328, "step": 1786 }, { - "epoch": 0.09283944791789549, - "grad_norm": 0.72265625, + "epoch": 0.02205969092839448, + "grad_norm": 0.71875, "learning_rate": 7.447075236524797e-05, - "loss": 1.6484, + "loss": 1.6696, "num_input_tokens_seen": 2108030976, "step": 1787 }, { - "epoch": 0.09295741417954465, - "grad_norm": 0.75, + "epoch": 0.022177657190043647, + "grad_norm": 0.87890625, "learning_rate": 7.446283450213256e-05, - "loss": 1.5601, + "loss": 1.4246, "num_input_tokens_seen": 2109210624, "step": 1788 }, { - "epoch": 0.09307538044119382, - "grad_norm": 0.74609375, + "epoch": 0.022295623451692816, + "grad_norm": 0.703125, "learning_rate": 7.445491139544218e-05, - "loss": 1.619, + "loss": 1.6228, "num_input_tokens_seen": 2110390272, "step": 1789 }, { - "epoch": 0.09319334670284299, - "grad_norm": 0.6953125, + "epoch": 0.022413589713341984, + "grad_norm": 0.828125, "learning_rate": 7.444698304638229e-05, - "loss": 1.6221, + "loss": 1.6991, "num_input_tokens_seen": 2111569920, "step": 1790 }, { - "epoch": 0.09331131296449216, - "grad_norm": 0.73828125, + "epoch": 0.022531555974991152, + "grad_norm": 0.8359375, "learning_rate": 7.443904945615924e-05, - "loss": 1.548, + "loss": 1.5197, "num_input_tokens_seen": 2112749568, "step": 1791 }, { - "epoch": 0.09342927922614132, - "grad_norm": 0.7109375, + "epoch": 0.02264952223664032, + "grad_norm": 0.7890625, "learning_rate": 7.443111062598013e-05, - "loss": 1.6161, + "loss": 1.5836, "num_input_tokens_seen": 2113929216, "step": 1792 + }, + { + "epoch": 0.02276748849828949, + "grad_norm": 0.8203125, + "learning_rate": 7.442316655705285e-05, + "loss": 1.6029, + "num_input_tokens_seen": 2115108864, + "step": 1793 + }, + { + "epoch": 0.022885454759938657, + "grad_norm": 0.8125, + "learning_rate": 7.441521725058613e-05, + "loss": 1.6011, + "num_input_tokens_seen": 2116288512, + "step": 1794 + }, + { + "epoch": 0.023003421021587825, + "grad_norm": 0.75390625, + "learning_rate": 7.440726270778945e-05, + "loss": 1.6227, + "num_input_tokens_seen": 2117468160, + "step": 1795 + }, + { + "epoch": 0.023121387283236993, + "grad_norm": 0.8046875, + "learning_rate": 7.43993029298731e-05, + "loss": 1.4413, + "num_input_tokens_seen": 2118647808, + "step": 1796 + }, + { + "epoch": 0.02323935354488616, + "grad_norm": 0.69140625, + "learning_rate": 7.43913379180482e-05, + "loss": 1.5052, + "num_input_tokens_seen": 2119827456, + "step": 1797 + }, + { + "epoch": 0.02335731980653533, + "grad_norm": 0.71484375, + "learning_rate": 7.438336767352662e-05, + "loss": 1.5939, + "num_input_tokens_seen": 2121007104, + "step": 1798 + }, + { + "epoch": 0.023475286068184498, + "grad_norm": 0.68359375, + "learning_rate": 7.437539219752105e-05, + "loss": 1.649, + "num_input_tokens_seen": 2122186752, + "step": 1799 + }, + { + "epoch": 0.023593252329833666, + "grad_norm": 0.76953125, + "learning_rate": 7.436741149124496e-05, + "loss": 1.5859, + "num_input_tokens_seen": 2123366400, + "step": 1800 + }, + { + "epoch": 0.023593252329833666, + "eval_wikipedia_loss": 2.2597548961639404, + "eval_wikipedia_runtime": 162.0396, + "eval_wikipedia_samples_per_second": 4.332, + "eval_wikipedia_steps_per_second": 0.185, + "num_input_tokens_seen": 2123366400, + "step": 1800 + }, + { + "epoch": 0.023593252329833666, + "eval_toxicity_loss": 4.024182319641113, + "eval_toxicity_runtime": 0.9591, + "eval_toxicity_samples_per_second": 2.085, + "eval_toxicity_steps_per_second": 1.043, + "num_input_tokens_seen": 2123366400, + "step": 1800 + }, + { + "epoch": 0.023711218591482838, + "grad_norm": 0.73046875, + "learning_rate": 7.435942555591265e-05, + "loss": 1.5798, + "num_input_tokens_seen": 2124546048, + "step": 1801 + }, + { + "epoch": 0.023829184853132006, + "grad_norm": 0.8203125, + "learning_rate": 7.435143439273918e-05, + "loss": 1.6395, + "num_input_tokens_seen": 2125725696, + "step": 1802 + }, + { + "epoch": 0.023947151114781174, + "grad_norm": 0.7421875, + "learning_rate": 7.434343800294041e-05, + "loss": 1.6122, + "num_input_tokens_seen": 2126905344, + "step": 1803 + }, + { + "epoch": 0.024065117376430342, + "grad_norm": 0.76171875, + "learning_rate": 7.433543638773302e-05, + "loss": 1.5967, + "num_input_tokens_seen": 2128084992, + "step": 1804 + }, + { + "epoch": 0.02418308363807951, + "grad_norm": 0.7734375, + "learning_rate": 7.432742954833445e-05, + "loss": 1.5463, + "num_input_tokens_seen": 2129264640, + "step": 1805 + }, + { + "epoch": 0.02430104989972868, + "grad_norm": 0.76171875, + "learning_rate": 7.431941748596297e-05, + "loss": 1.5567, + "num_input_tokens_seen": 2130444288, + "step": 1806 + }, + { + "epoch": 0.024419016161377847, + "grad_norm": 0.70703125, + "learning_rate": 7.431140020183763e-05, + "loss": 1.5494, + "num_input_tokens_seen": 2131623936, + "step": 1807 + }, + { + "epoch": 0.024536982423027015, + "grad_norm": 0.76171875, + "learning_rate": 7.430337769717824e-05, + "loss": 1.6508, + "num_input_tokens_seen": 2132803584, + "step": 1808 + }, + { + "epoch": 0.024654948684676183, + "grad_norm": 0.65234375, + "learning_rate": 7.429534997320546e-05, + "loss": 1.7077, + "num_input_tokens_seen": 2133983232, + "step": 1809 + }, + { + "epoch": 0.02477291494632535, + "grad_norm": 0.8046875, + "learning_rate": 7.428731703114075e-05, + "loss": 1.4564, + "num_input_tokens_seen": 2135162880, + "step": 1810 + }, + { + "epoch": 0.02489088120797452, + "grad_norm": 0.68359375, + "learning_rate": 7.427927887220629e-05, + "loss": 1.6478, + "num_input_tokens_seen": 2136342528, + "step": 1811 + }, + { + "epoch": 0.025008847469623688, + "grad_norm": 0.6953125, + "learning_rate": 7.427123549762511e-05, + "loss": 1.6442, + "num_input_tokens_seen": 2137522176, + "step": 1812 + }, + { + "epoch": 0.025126813731272856, + "grad_norm": 0.77734375, + "learning_rate": 7.426318690862104e-05, + "loss": 1.4693, + "num_input_tokens_seen": 2138701824, + "step": 1813 + }, + { + "epoch": 0.025244779992922024, + "grad_norm": 0.69140625, + "learning_rate": 7.425513310641867e-05, + "loss": 1.5982, + "num_input_tokens_seen": 2139881472, + "step": 1814 + }, + { + "epoch": 0.025362746254571193, + "grad_norm": 0.7421875, + "learning_rate": 7.42470740922434e-05, + "loss": 1.4796, + "num_input_tokens_seen": 2141061120, + "step": 1815 + }, + { + "epoch": 0.02548071251622036, + "grad_norm": 0.8203125, + "learning_rate": 7.423900986732143e-05, + "loss": 1.4444, + "num_input_tokens_seen": 2142240768, + "step": 1816 + }, + { + "epoch": 0.02559867877786953, + "grad_norm": 0.6640625, + "learning_rate": 7.423094043287974e-05, + "loss": 1.5656, + "num_input_tokens_seen": 2143420416, + "step": 1817 + }, + { + "epoch": 0.025716645039518697, + "grad_norm": 0.78515625, + "learning_rate": 7.422286579014609e-05, + "loss": 1.4905, + "num_input_tokens_seen": 2144600064, + "step": 1818 + }, + { + "epoch": 0.025834611301167865, + "grad_norm": 0.68359375, + "learning_rate": 7.421478594034907e-05, + "loss": 1.688, + "num_input_tokens_seen": 2145779712, + "step": 1819 + }, + { + "epoch": 0.025952577562817034, + "grad_norm": 0.82421875, + "learning_rate": 7.420670088471803e-05, + "loss": 1.5098, + "num_input_tokens_seen": 2146959360, + "step": 1820 + }, + { + "epoch": 0.026070543824466202, + "grad_norm": 0.7109375, + "learning_rate": 7.419861062448314e-05, + "loss": 1.5118, + "num_input_tokens_seen": 2148139008, + "step": 1821 + }, + { + "epoch": 0.02618851008611537, + "grad_norm": 0.7734375, + "learning_rate": 7.419051516087535e-05, + "loss": 1.5599, + "num_input_tokens_seen": 2149318656, + "step": 1822 + }, + { + "epoch": 0.026306476347764538, + "grad_norm": 0.72265625, + "learning_rate": 7.418241449512638e-05, + "loss": 1.5937, + "num_input_tokens_seen": 2150498304, + "step": 1823 + }, + { + "epoch": 0.026424442609413706, + "grad_norm": 0.72265625, + "learning_rate": 7.417430862846875e-05, + "loss": 1.5222, + "num_input_tokens_seen": 2151677952, + "step": 1824 + }, + { + "epoch": 0.026542408871062875, + "grad_norm": 0.76171875, + "learning_rate": 7.416619756213581e-05, + "loss": 1.5716, + "num_input_tokens_seen": 2152857600, + "step": 1825 + }, + { + "epoch": 0.026660375132712043, + "grad_norm": 0.74609375, + "learning_rate": 7.415808129736164e-05, + "loss": 1.5728, + "num_input_tokens_seen": 2154037248, + "step": 1826 + }, + { + "epoch": 0.02677834139436121, + "grad_norm": 0.6484375, + "learning_rate": 7.414995983538116e-05, + "loss": 1.7184, + "num_input_tokens_seen": 2155216896, + "step": 1827 + }, + { + "epoch": 0.026896307656010383, + "grad_norm": 0.72265625, + "learning_rate": 7.414183317743008e-05, + "loss": 1.5229, + "num_input_tokens_seen": 2156396544, + "step": 1828 + }, + { + "epoch": 0.02701427391765955, + "grad_norm": 0.6875, + "learning_rate": 7.413370132474485e-05, + "loss": 1.5489, + "num_input_tokens_seen": 2157576192, + "step": 1829 + }, + { + "epoch": 0.02713224017930872, + "grad_norm": 0.6484375, + "learning_rate": 7.412556427856276e-05, + "loss": 1.642, + "num_input_tokens_seen": 2158755840, + "step": 1830 + }, + { + "epoch": 0.027250206440957887, + "grad_norm": 0.6640625, + "learning_rate": 7.411742204012188e-05, + "loss": 1.6251, + "num_input_tokens_seen": 2159935488, + "step": 1831 + }, + { + "epoch": 0.027368172702607056, + "grad_norm": 0.73046875, + "learning_rate": 7.410927461066107e-05, + "loss": 1.5202, + "num_input_tokens_seen": 2161115136, + "step": 1832 + }, + { + "epoch": 0.027486138964256224, + "grad_norm": 0.82421875, + "learning_rate": 7.410112199141994e-05, + "loss": 1.5337, + "num_input_tokens_seen": 2162294784, + "step": 1833 + }, + { + "epoch": 0.027604105225905392, + "grad_norm": 0.80859375, + "learning_rate": 7.409296418363897e-05, + "loss": 1.4675, + "num_input_tokens_seen": 2163474432, + "step": 1834 + }, + { + "epoch": 0.02772207148755456, + "grad_norm": 0.7109375, + "learning_rate": 7.408480118855935e-05, + "loss": 1.6147, + "num_input_tokens_seen": 2164654080, + "step": 1835 + }, + { + "epoch": 0.02784003774920373, + "grad_norm": 0.95703125, + "learning_rate": 7.407663300742309e-05, + "loss": 1.4314, + "num_input_tokens_seen": 2165833728, + "step": 1836 + }, + { + "epoch": 0.027958004010852897, + "grad_norm": 0.80078125, + "learning_rate": 7.406845964147303e-05, + "loss": 1.5776, + "num_input_tokens_seen": 2167013376, + "step": 1837 + }, + { + "epoch": 0.028075970272502065, + "grad_norm": 0.78125, + "learning_rate": 7.406028109195273e-05, + "loss": 1.6119, + "num_input_tokens_seen": 2168193024, + "step": 1838 + }, + { + "epoch": 0.028193936534151233, + "grad_norm": 0.828125, + "learning_rate": 7.405209736010656e-05, + "loss": 1.5874, + "num_input_tokens_seen": 2169372672, + "step": 1839 + }, + { + "epoch": 0.0283119027958004, + "grad_norm": 0.75, + "learning_rate": 7.404390844717971e-05, + "loss": 1.583, + "num_input_tokens_seen": 2170552320, + "step": 1840 + }, + { + "epoch": 0.02842986905744957, + "grad_norm": 0.6328125, + "learning_rate": 7.403571435441814e-05, + "loss": 1.6481, + "num_input_tokens_seen": 2171731968, + "step": 1841 + }, + { + "epoch": 0.028547835319098738, + "grad_norm": 0.71875, + "learning_rate": 7.402751508306858e-05, + "loss": 1.5372, + "num_input_tokens_seen": 2172911616, + "step": 1842 + }, + { + "epoch": 0.028665801580747906, + "grad_norm": 0.6484375, + "learning_rate": 7.401931063437855e-05, + "loss": 1.5178, + "num_input_tokens_seen": 2174091264, + "step": 1843 + }, + { + "epoch": 0.028783767842397074, + "grad_norm": 0.68359375, + "learning_rate": 7.40111010095964e-05, + "loss": 1.5221, + "num_input_tokens_seen": 2175270912, + "step": 1844 + }, + { + "epoch": 0.028901734104046242, + "grad_norm": 0.65625, + "learning_rate": 7.400288620997123e-05, + "loss": 1.7355, + "num_input_tokens_seen": 2176450560, + "step": 1845 + }, + { + "epoch": 0.02901970036569541, + "grad_norm": 0.68359375, + "learning_rate": 7.399466623675292e-05, + "loss": 1.5432, + "num_input_tokens_seen": 2177630208, + "step": 1846 + }, + { + "epoch": 0.02913766662734458, + "grad_norm": 0.6796875, + "learning_rate": 7.398644109119214e-05, + "loss": 1.4819, + "num_input_tokens_seen": 2178809856, + "step": 1847 + }, + { + "epoch": 0.029255632888993747, + "grad_norm": 0.703125, + "learning_rate": 7.39782107745404e-05, + "loss": 1.5471, + "num_input_tokens_seen": 2179989504, + "step": 1848 + }, + { + "epoch": 0.029373599150642915, + "grad_norm": 0.7265625, + "learning_rate": 7.396997528804994e-05, + "loss": 1.6242, + "num_input_tokens_seen": 2181169152, + "step": 1849 + }, + { + "epoch": 0.029491565412292083, + "grad_norm": 0.6328125, + "learning_rate": 7.396173463297379e-05, + "loss": 1.5332, + "num_input_tokens_seen": 2182348800, + "step": 1850 + }, + { + "epoch": 0.02960953167394125, + "grad_norm": 0.66796875, + "learning_rate": 7.395348881056578e-05, + "loss": 1.4848, + "num_input_tokens_seen": 2183528448, + "step": 1851 + }, + { + "epoch": 0.02972749793559042, + "grad_norm": 0.71484375, + "learning_rate": 7.394523782208053e-05, + "loss": 1.5674, + "num_input_tokens_seen": 2184708096, + "step": 1852 + }, + { + "epoch": 0.029845464197239588, + "grad_norm": 0.703125, + "learning_rate": 7.393698166877346e-05, + "loss": 1.7141, + "num_input_tokens_seen": 2185887744, + "step": 1853 + }, + { + "epoch": 0.029963430458888756, + "grad_norm": 0.68359375, + "learning_rate": 7.39287203519007e-05, + "loss": 1.5141, + "num_input_tokens_seen": 2187067392, + "step": 1854 + }, + { + "epoch": 0.030081396720537928, + "grad_norm": 0.6640625, + "learning_rate": 7.39204538727193e-05, + "loss": 1.6283, + "num_input_tokens_seen": 2188247040, + "step": 1855 + }, + { + "epoch": 0.030199362982187096, + "grad_norm": 0.66015625, + "learning_rate": 7.391218223248695e-05, + "loss": 1.7063, + "num_input_tokens_seen": 2189426688, + "step": 1856 + }, + { + "epoch": 0.030317329243836264, + "grad_norm": 0.765625, + "learning_rate": 7.390390543246224e-05, + "loss": 1.5577, + "num_input_tokens_seen": 2190606336, + "step": 1857 + }, + { + "epoch": 0.030435295505485432, + "grad_norm": 0.7578125, + "learning_rate": 7.389562347390447e-05, + "loss": 1.5518, + "num_input_tokens_seen": 2191785984, + "step": 1858 + }, + { + "epoch": 0.0305532617671346, + "grad_norm": 0.7890625, + "learning_rate": 7.388733635807378e-05, + "loss": 1.5015, + "num_input_tokens_seen": 2192965632, + "step": 1859 + }, + { + "epoch": 0.03067122802878377, + "grad_norm": 0.6640625, + "learning_rate": 7.387904408623103e-05, + "loss": 1.4798, + "num_input_tokens_seen": 2194145280, + "step": 1860 + }, + { + "epoch": 0.030789194290432937, + "grad_norm": 0.7578125, + "learning_rate": 7.387074665963794e-05, + "loss": 1.5943, + "num_input_tokens_seen": 2195324928, + "step": 1861 + }, + { + "epoch": 0.030907160552082105, + "grad_norm": 0.6796875, + "learning_rate": 7.386244407955695e-05, + "loss": 1.6022, + "num_input_tokens_seen": 2196504576, + "step": 1862 + }, + { + "epoch": 0.031025126813731273, + "grad_norm": 0.7421875, + "learning_rate": 7.385413634725132e-05, + "loss": 1.5654, + "num_input_tokens_seen": 2197684224, + "step": 1863 + }, + { + "epoch": 0.03114309307538044, + "grad_norm": 0.77734375, + "learning_rate": 7.384582346398509e-05, + "loss": 1.597, + "num_input_tokens_seen": 2198863872, + "step": 1864 + }, + { + "epoch": 0.031261059337029606, + "grad_norm": 0.765625, + "learning_rate": 7.383750543102308e-05, + "loss": 1.4255, + "num_input_tokens_seen": 2200043520, + "step": 1865 + }, + { + "epoch": 0.03137902559867878, + "grad_norm": 0.72265625, + "learning_rate": 7.382918224963087e-05, + "loss": 1.6604, + "num_input_tokens_seen": 2201223168, + "step": 1866 + }, + { + "epoch": 0.03149699186032794, + "grad_norm": 0.765625, + "learning_rate": 7.382085392107486e-05, + "loss": 1.6227, + "num_input_tokens_seen": 2202402816, + "step": 1867 + }, + { + "epoch": 0.031614958121977114, + "grad_norm": 0.94140625, + "learning_rate": 7.381252044662223e-05, + "loss": 1.5137, + "num_input_tokens_seen": 2203582464, + "step": 1868 + }, + { + "epoch": 0.031732924383626286, + "grad_norm": 0.71875, + "learning_rate": 7.380418182754093e-05, + "loss": 1.6039, + "num_input_tokens_seen": 2204762112, + "step": 1869 + }, + { + "epoch": 0.03185089064527545, + "grad_norm": 0.76953125, + "learning_rate": 7.379583806509967e-05, + "loss": 1.4421, + "num_input_tokens_seen": 2205941760, + "step": 1870 + }, + { + "epoch": 0.03196885690692462, + "grad_norm": 0.66796875, + "learning_rate": 7.378748916056798e-05, + "loss": 1.6332, + "num_input_tokens_seen": 2207121408, + "step": 1871 + }, + { + "epoch": 0.03208682316857379, + "grad_norm": 0.71484375, + "learning_rate": 7.377913511521617e-05, + "loss": 1.5797, + "num_input_tokens_seen": 2208301056, + "step": 1872 + }, + { + "epoch": 0.03220478943022296, + "grad_norm": 0.66796875, + "learning_rate": 7.377077593031531e-05, + "loss": 1.598, + "num_input_tokens_seen": 2209480704, + "step": 1873 + }, + { + "epoch": 0.032322755691872124, + "grad_norm": 0.73046875, + "learning_rate": 7.376241160713727e-05, + "loss": 1.5616, + "num_input_tokens_seen": 2210660352, + "step": 1874 + }, + { + "epoch": 0.032440721953521295, + "grad_norm": 0.765625, + "learning_rate": 7.37540421469547e-05, + "loss": 1.4614, + "num_input_tokens_seen": 2211840000, + "step": 1875 + }, + { + "epoch": 0.03255868821517046, + "grad_norm": 0.6796875, + "learning_rate": 7.374566755104098e-05, + "loss": 1.5495, + "num_input_tokens_seen": 2213019648, + "step": 1876 + }, + { + "epoch": 0.03267665447681963, + "grad_norm": 0.81640625, + "learning_rate": 7.373728782067038e-05, + "loss": 1.358, + "num_input_tokens_seen": 2214199296, + "step": 1877 + }, + { + "epoch": 0.0327946207384688, + "grad_norm": 0.79296875, + "learning_rate": 7.372890295711786e-05, + "loss": 1.4047, + "num_input_tokens_seen": 2215378944, + "step": 1878 + }, + { + "epoch": 0.03291258700011797, + "grad_norm": 0.7578125, + "learning_rate": 7.37205129616592e-05, + "loss": 1.4799, + "num_input_tokens_seen": 2216558592, + "step": 1879 + }, + { + "epoch": 0.03303055326176713, + "grad_norm": 0.71875, + "learning_rate": 7.371211783557095e-05, + "loss": 1.3531, + "num_input_tokens_seen": 2217738240, + "step": 1880 + }, + { + "epoch": 0.033148519523416305, + "grad_norm": 0.70703125, + "learning_rate": 7.370371758013042e-05, + "loss": 1.5988, + "num_input_tokens_seen": 2218917888, + "step": 1881 + }, + { + "epoch": 0.03326648578506547, + "grad_norm": 0.69140625, + "learning_rate": 7.369531219661575e-05, + "loss": 1.5523, + "num_input_tokens_seen": 2220097536, + "step": 1882 + }, + { + "epoch": 0.03338445204671464, + "grad_norm": 0.6484375, + "learning_rate": 7.368690168630582e-05, + "loss": 1.5946, + "num_input_tokens_seen": 2221277184, + "step": 1883 + }, + { + "epoch": 0.033502418308363806, + "grad_norm": 0.7109375, + "learning_rate": 7.367848605048031e-05, + "loss": 1.4925, + "num_input_tokens_seen": 2222456832, + "step": 1884 + }, + { + "epoch": 0.03362038457001298, + "grad_norm": 0.76171875, + "learning_rate": 7.367006529041967e-05, + "loss": 1.5686, + "num_input_tokens_seen": 2223636480, + "step": 1885 + }, + { + "epoch": 0.03373835083166214, + "grad_norm": 0.8359375, + "learning_rate": 7.366163940740511e-05, + "loss": 1.5127, + "num_input_tokens_seen": 2224816128, + "step": 1886 + }, + { + "epoch": 0.033856317093311314, + "grad_norm": 0.83203125, + "learning_rate": 7.365320840271867e-05, + "loss": 1.5116, + "num_input_tokens_seen": 2225995776, + "step": 1887 + }, + { + "epoch": 0.03397428335496048, + "grad_norm": 0.796875, + "learning_rate": 7.364477227764314e-05, + "loss": 1.486, + "num_input_tokens_seen": 2227175424, + "step": 1888 + }, + { + "epoch": 0.03409224961660965, + "grad_norm": 1.046875, + "learning_rate": 7.363633103346207e-05, + "loss": 1.6249, + "num_input_tokens_seen": 2228355072, + "step": 1889 + }, + { + "epoch": 0.034210215878258815, + "grad_norm": 0.8984375, + "learning_rate": 7.362788467145983e-05, + "loss": 1.6136, + "num_input_tokens_seen": 2229534720, + "step": 1890 + }, + { + "epoch": 0.03432818213990799, + "grad_norm": 0.84765625, + "learning_rate": 7.361943319292153e-05, + "loss": 1.5335, + "num_input_tokens_seen": 2230714368, + "step": 1891 + }, + { + "epoch": 0.03444614840155715, + "grad_norm": 0.7734375, + "learning_rate": 7.36109765991331e-05, + "loss": 1.4603, + "num_input_tokens_seen": 2231894016, + "step": 1892 + }, + { + "epoch": 0.03456411466320632, + "grad_norm": 0.83984375, + "learning_rate": 7.360251489138119e-05, + "loss": 1.4003, + "num_input_tokens_seen": 2233073664, + "step": 1893 + }, + { + "epoch": 0.03468208092485549, + "grad_norm": 0.73828125, + "learning_rate": 7.359404807095328e-05, + "loss": 1.5943, + "num_input_tokens_seen": 2234253312, + "step": 1894 + }, + { + "epoch": 0.03480004718650466, + "grad_norm": 0.8203125, + "learning_rate": 7.358557613913761e-05, + "loss": 1.5785, + "num_input_tokens_seen": 2235432960, + "step": 1895 + }, + { + "epoch": 0.03491801344815383, + "grad_norm": 0.8359375, + "learning_rate": 7.357709909722319e-05, + "loss": 1.3838, + "num_input_tokens_seen": 2236612608, + "step": 1896 + }, + { + "epoch": 0.035035979709802996, + "grad_norm": 0.69140625, + "learning_rate": 7.356861694649985e-05, + "loss": 1.5458, + "num_input_tokens_seen": 2237792256, + "step": 1897 + }, + { + "epoch": 0.03515394597145217, + "grad_norm": 0.73828125, + "learning_rate": 7.35601296882581e-05, + "loss": 1.5471, + "num_input_tokens_seen": 2238971904, + "step": 1898 + }, + { + "epoch": 0.03527191223310133, + "grad_norm": 0.8046875, + "learning_rate": 7.355163732378937e-05, + "loss": 1.4163, + "num_input_tokens_seen": 2240151552, + "step": 1899 + }, + { + "epoch": 0.035389878494750504, + "grad_norm": 0.703125, + "learning_rate": 7.35431398543857e-05, + "loss": 1.5865, + "num_input_tokens_seen": 2241331200, + "step": 1900 + }, + { + "epoch": 0.03550784475639967, + "grad_norm": 0.8203125, + "learning_rate": 7.353463728134005e-05, + "loss": 1.3711, + "num_input_tokens_seen": 2242510848, + "step": 1901 + }, + { + "epoch": 0.03562581101804884, + "grad_norm": 0.68359375, + "learning_rate": 7.352612960594609e-05, + "loss": 1.5282, + "num_input_tokens_seen": 2243690496, + "step": 1902 + }, + { + "epoch": 0.035743777279698005, + "grad_norm": 0.765625, + "learning_rate": 7.351761682949827e-05, + "loss": 1.6066, + "num_input_tokens_seen": 2244870144, + "step": 1903 + }, + { + "epoch": 0.03586174354134718, + "grad_norm": 0.76953125, + "learning_rate": 7.350909895329183e-05, + "loss": 1.5216, + "num_input_tokens_seen": 2246049792, + "step": 1904 + }, + { + "epoch": 0.03597970980299634, + "grad_norm": 0.7421875, + "learning_rate": 7.350057597862277e-05, + "loss": 1.5401, + "num_input_tokens_seen": 2247229440, + "step": 1905 + }, + { + "epoch": 0.03609767606464551, + "grad_norm": 0.77734375, + "learning_rate": 7.349204790678788e-05, + "loss": 1.4897, + "num_input_tokens_seen": 2248409088, + "step": 1906 + }, + { + "epoch": 0.03621564232629468, + "grad_norm": 0.8359375, + "learning_rate": 7.348351473908469e-05, + "loss": 1.5963, + "num_input_tokens_seen": 2249588736, + "step": 1907 + }, + { + "epoch": 0.03633360858794385, + "grad_norm": 0.765625, + "learning_rate": 7.34749764768116e-05, + "loss": 1.6018, + "num_input_tokens_seen": 2250768384, + "step": 1908 + }, + { + "epoch": 0.036451574849593014, + "grad_norm": 0.84765625, + "learning_rate": 7.346643312126766e-05, + "loss": 1.5299, + "num_input_tokens_seen": 2251948032, + "step": 1909 + }, + { + "epoch": 0.036569541111242186, + "grad_norm": 0.68359375, + "learning_rate": 7.345788467375278e-05, + "loss": 1.4308, + "num_input_tokens_seen": 2253127680, + "step": 1910 + }, + { + "epoch": 0.03668750737289135, + "grad_norm": 0.984375, + "learning_rate": 7.344933113556762e-05, + "loss": 1.6228, + "num_input_tokens_seen": 2254307328, + "step": 1911 + }, + { + "epoch": 0.03680547363454052, + "grad_norm": 0.78515625, + "learning_rate": 7.344077250801361e-05, + "loss": 1.5428, + "num_input_tokens_seen": 2255486976, + "step": 1912 + }, + { + "epoch": 0.03692343989618969, + "grad_norm": 0.8828125, + "learning_rate": 7.343220879239296e-05, + "loss": 1.4764, + "num_input_tokens_seen": 2256666624, + "step": 1913 + }, + { + "epoch": 0.03704140615783886, + "grad_norm": 0.9140625, + "learning_rate": 7.342363999000865e-05, + "loss": 1.4045, + "num_input_tokens_seen": 2257846272, + "step": 1914 + }, + { + "epoch": 0.037159372419488024, + "grad_norm": 0.87109375, + "learning_rate": 7.341506610216445e-05, + "loss": 1.4639, + "num_input_tokens_seen": 2259025920, + "step": 1915 + }, + { + "epoch": 0.037277338681137195, + "grad_norm": 0.8125, + "learning_rate": 7.340648713016487e-05, + "loss": 1.4876, + "num_input_tokens_seen": 2260205568, + "step": 1916 + }, + { + "epoch": 0.03739530494278636, + "grad_norm": 0.7890625, + "learning_rate": 7.339790307531523e-05, + "loss": 1.4313, + "num_input_tokens_seen": 2261385216, + "step": 1917 + }, + { + "epoch": 0.03751327120443553, + "grad_norm": 0.77734375, + "learning_rate": 7.33893139389216e-05, + "loss": 1.6842, + "num_input_tokens_seen": 2262564864, + "step": 1918 + }, + { + "epoch": 0.0376312374660847, + "grad_norm": 0.765625, + "learning_rate": 7.338071972229083e-05, + "loss": 1.3915, + "num_input_tokens_seen": 2263744512, + "step": 1919 + }, + { + "epoch": 0.03774920372773387, + "grad_norm": 0.765625, + "learning_rate": 7.337212042673055e-05, + "loss": 1.4723, + "num_input_tokens_seen": 2264924160, + "step": 1920 + }, + { + "epoch": 0.03786716998938304, + "grad_norm": 0.828125, + "learning_rate": 7.336351605354916e-05, + "loss": 1.3676, + "num_input_tokens_seen": 2266103808, + "step": 1921 + }, + { + "epoch": 0.037985136251032205, + "grad_norm": 0.66796875, + "learning_rate": 7.335490660405581e-05, + "loss": 1.512, + "num_input_tokens_seen": 2267283456, + "step": 1922 + }, + { + "epoch": 0.038103102512681376, + "grad_norm": 0.703125, + "learning_rate": 7.334629207956046e-05, + "loss": 1.4637, + "num_input_tokens_seen": 2268463104, + "step": 1923 + }, + { + "epoch": 0.03822106877433054, + "grad_norm": 0.66796875, + "learning_rate": 7.333767248137382e-05, + "loss": 1.5311, + "num_input_tokens_seen": 2269642752, + "step": 1924 + }, + { + "epoch": 0.03833903503597971, + "grad_norm": 0.6875, + "learning_rate": 7.332904781080736e-05, + "loss": 1.4918, + "num_input_tokens_seen": 2270822400, + "step": 1925 + }, + { + "epoch": 0.03845700129762888, + "grad_norm": 0.67578125, + "learning_rate": 7.332041806917337e-05, + "loss": 1.5913, + "num_input_tokens_seen": 2272002048, + "step": 1926 + }, + { + "epoch": 0.03857496755927805, + "grad_norm": 0.6953125, + "learning_rate": 7.331178325778485e-05, + "loss": 1.3829, + "num_input_tokens_seen": 2273181696, + "step": 1927 + }, + { + "epoch": 0.038692933820927214, + "grad_norm": 0.6875, + "learning_rate": 7.330314337795561e-05, + "loss": 1.4821, + "num_input_tokens_seen": 2274361344, + "step": 1928 + }, + { + "epoch": 0.038810900082576386, + "grad_norm": 0.7421875, + "learning_rate": 7.329449843100022e-05, + "loss": 1.5275, + "num_input_tokens_seen": 2275540992, + "step": 1929 + }, + { + "epoch": 0.03892886634422555, + "grad_norm": 0.63671875, + "learning_rate": 7.328584841823401e-05, + "loss": 1.634, + "num_input_tokens_seen": 2276720640, + "step": 1930 + }, + { + "epoch": 0.03904683260587472, + "grad_norm": 0.7109375, + "learning_rate": 7.327719334097312e-05, + "loss": 1.6776, + "num_input_tokens_seen": 2277900288, + "step": 1931 + }, + { + "epoch": 0.03916479886752389, + "grad_norm": 0.70703125, + "learning_rate": 7.326853320053442e-05, + "loss": 1.4269, + "num_input_tokens_seen": 2279079936, + "step": 1932 + }, + { + "epoch": 0.03928276512917306, + "grad_norm": 0.66015625, + "learning_rate": 7.325986799823555e-05, + "loss": 1.4935, + "num_input_tokens_seen": 2280259584, + "step": 1933 + }, + { + "epoch": 0.03940073139082222, + "grad_norm": 0.66015625, + "learning_rate": 7.325119773539497e-05, + "loss": 1.5445, + "num_input_tokens_seen": 2281439232, + "step": 1934 + }, + { + "epoch": 0.039518697652471395, + "grad_norm": 0.71484375, + "learning_rate": 7.324252241333185e-05, + "loss": 1.4365, + "num_input_tokens_seen": 2282618880, + "step": 1935 + }, + { + "epoch": 0.03963666391412056, + "grad_norm": 0.6484375, + "learning_rate": 7.323384203336615e-05, + "loss": 1.5626, + "num_input_tokens_seen": 2283798528, + "step": 1936 + }, + { + "epoch": 0.03975463017576973, + "grad_norm": 0.75390625, + "learning_rate": 7.32251565968186e-05, + "loss": 1.4364, + "num_input_tokens_seen": 2284978176, + "step": 1937 + }, + { + "epoch": 0.039872596437418896, + "grad_norm": 0.65234375, + "learning_rate": 7.321646610501073e-05, + "loss": 1.5292, + "num_input_tokens_seen": 2286157824, + "step": 1938 + }, + { + "epoch": 0.03999056269906807, + "grad_norm": 0.71484375, + "learning_rate": 7.320777055926478e-05, + "loss": 1.3565, + "num_input_tokens_seen": 2287337472, + "step": 1939 + }, + { + "epoch": 0.04010852896071723, + "grad_norm": 0.69921875, + "learning_rate": 7.319906996090383e-05, + "loss": 1.5397, + "num_input_tokens_seen": 2288517120, + "step": 1940 + }, + { + "epoch": 0.040226495222366404, + "grad_norm": 0.6640625, + "learning_rate": 7.319036431125165e-05, + "loss": 1.4882, + "num_input_tokens_seen": 2289696768, + "step": 1941 + }, + { + "epoch": 0.04034446148401557, + "grad_norm": 0.67578125, + "learning_rate": 7.318165361163284e-05, + "loss": 1.6145, + "num_input_tokens_seen": 2290876416, + "step": 1942 + }, + { + "epoch": 0.04046242774566474, + "grad_norm": 0.6484375, + "learning_rate": 7.317293786337273e-05, + "loss": 1.5312, + "num_input_tokens_seen": 2292056064, + "step": 1943 + }, + { + "epoch": 0.040580394007313905, + "grad_norm": 0.76171875, + "learning_rate": 7.316421706779746e-05, + "loss": 1.5138, + "num_input_tokens_seen": 2293235712, + "step": 1944 + }, + { + "epoch": 0.04069836026896308, + "grad_norm": 0.703125, + "learning_rate": 7.31554912262339e-05, + "loss": 1.4698, + "num_input_tokens_seen": 2294415360, + "step": 1945 + }, + { + "epoch": 0.04081632653061224, + "grad_norm": 0.65234375, + "learning_rate": 7.314676034000968e-05, + "loss": 1.4988, + "num_input_tokens_seen": 2295595008, + "step": 1946 + }, + { + "epoch": 0.04093429279226141, + "grad_norm": 0.65234375, + "learning_rate": 7.313802441045326e-05, + "loss": 1.5906, + "num_input_tokens_seen": 2296774656, + "step": 1947 + }, + { + "epoch": 0.041052259053910585, + "grad_norm": 0.67578125, + "learning_rate": 7.31292834388938e-05, + "loss": 1.7172, + "num_input_tokens_seen": 2297954304, + "step": 1948 + }, + { + "epoch": 0.04117022531555975, + "grad_norm": 0.66796875, + "learning_rate": 7.312053742666126e-05, + "loss": 1.5577, + "num_input_tokens_seen": 2299133952, + "step": 1949 + }, + { + "epoch": 0.04128819157720892, + "grad_norm": 0.6953125, + "learning_rate": 7.311178637508634e-05, + "loss": 1.5534, + "num_input_tokens_seen": 2300313600, + "step": 1950 + }, + { + "epoch": 0.041406157838858086, + "grad_norm": 0.62109375, + "learning_rate": 7.310303028550056e-05, + "loss": 1.4872, + "num_input_tokens_seen": 2301493248, + "step": 1951 + }, + { + "epoch": 0.04152412410050726, + "grad_norm": 0.73046875, + "learning_rate": 7.309426915923615e-05, + "loss": 1.4101, + "num_input_tokens_seen": 2302672896, + "step": 1952 + }, + { + "epoch": 0.04164209036215642, + "grad_norm": 0.69140625, + "learning_rate": 7.308550299762613e-05, + "loss": 1.609, + "num_input_tokens_seen": 2303852544, + "step": 1953 + }, + { + "epoch": 0.041760056623805594, + "grad_norm": 0.63671875, + "learning_rate": 7.307673180200429e-05, + "loss": 1.6246, + "num_input_tokens_seen": 2305032192, + "step": 1954 + }, + { + "epoch": 0.04187802288545476, + "grad_norm": 0.74609375, + "learning_rate": 7.306795557370519e-05, + "loss": 1.5308, + "num_input_tokens_seen": 2306211840, + "step": 1955 + }, + { + "epoch": 0.04199598914710393, + "grad_norm": 0.68359375, + "learning_rate": 7.305917431406413e-05, + "loss": 1.5942, + "num_input_tokens_seen": 2307391488, + "step": 1956 + }, + { + "epoch": 0.042113955408753095, + "grad_norm": 0.7734375, + "learning_rate": 7.30503880244172e-05, + "loss": 1.6072, + "num_input_tokens_seen": 2308571136, + "step": 1957 + }, + { + "epoch": 0.04223192167040227, + "grad_norm": 0.734375, + "learning_rate": 7.304159670610126e-05, + "loss": 1.3998, + "num_input_tokens_seen": 2309750784, + "step": 1958 + }, + { + "epoch": 0.04234988793205143, + "grad_norm": 0.71875, + "learning_rate": 7.30328003604539e-05, + "loss": 1.5292, + "num_input_tokens_seen": 2310930432, + "step": 1959 + }, + { + "epoch": 0.042467854193700603, + "grad_norm": 0.6875, + "learning_rate": 7.302399898881352e-05, + "loss": 1.4782, + "num_input_tokens_seen": 2312110080, + "step": 1960 + }, + { + "epoch": 0.04258582045534977, + "grad_norm": 0.703125, + "learning_rate": 7.301519259251925e-05, + "loss": 1.3287, + "num_input_tokens_seen": 2313289728, + "step": 1961 + }, + { + "epoch": 0.04270378671699894, + "grad_norm": 0.734375, + "learning_rate": 7.3006381172911e-05, + "loss": 1.5527, + "num_input_tokens_seen": 2314469376, + "step": 1962 + }, + { + "epoch": 0.042821752978648105, + "grad_norm": 0.734375, + "learning_rate": 7.299756473132944e-05, + "loss": 1.5148, + "num_input_tokens_seen": 2315649024, + "step": 1963 + }, + { + "epoch": 0.042939719240297276, + "grad_norm": 0.6640625, + "learning_rate": 7.2988743269116e-05, + "loss": 1.541, + "num_input_tokens_seen": 2316828672, + "step": 1964 + }, + { + "epoch": 0.04305768550194644, + "grad_norm": 0.6875, + "learning_rate": 7.297991678761289e-05, + "loss": 1.4709, + "num_input_tokens_seen": 2318008320, + "step": 1965 + }, + { + "epoch": 0.04317565176359561, + "grad_norm": 0.7421875, + "learning_rate": 7.297108528816308e-05, + "loss": 1.4995, + "num_input_tokens_seen": 2319187968, + "step": 1966 + }, + { + "epoch": 0.04329361802524478, + "grad_norm": 0.67578125, + "learning_rate": 7.296224877211029e-05, + "loss": 1.4244, + "num_input_tokens_seen": 2320367616, + "step": 1967 + }, + { + "epoch": 0.04341158428689395, + "grad_norm": 0.9296875, + "learning_rate": 7.295340724079899e-05, + "loss": 1.5493, + "num_input_tokens_seen": 2321547264, + "step": 1968 + }, + { + "epoch": 0.043529550548543114, + "grad_norm": 0.74609375, + "learning_rate": 7.294456069557445e-05, + "loss": 1.5873, + "num_input_tokens_seen": 2322726912, + "step": 1969 + }, + { + "epoch": 0.043647516810192286, + "grad_norm": 0.75, + "learning_rate": 7.293570913778268e-05, + "loss": 1.3775, + "num_input_tokens_seen": 2323906560, + "step": 1970 + }, + { + "epoch": 0.04376548307184145, + "grad_norm": 0.75, + "learning_rate": 7.292685256877049e-05, + "loss": 1.443, + "num_input_tokens_seen": 2325086208, + "step": 1971 + }, + { + "epoch": 0.04388344933349062, + "grad_norm": 0.703125, + "learning_rate": 7.291799098988539e-05, + "loss": 1.495, + "num_input_tokens_seen": 2326265856, + "step": 1972 + }, + { + "epoch": 0.04400141559513979, + "grad_norm": 0.65625, + "learning_rate": 7.290912440247567e-05, + "loss": 1.5695, + "num_input_tokens_seen": 2327445504, + "step": 1973 + }, + { + "epoch": 0.04411938185678896, + "grad_norm": 0.70703125, + "learning_rate": 7.290025280789042e-05, + "loss": 1.5184, + "num_input_tokens_seen": 2328625152, + "step": 1974 + }, + { + "epoch": 0.04423734811843813, + "grad_norm": 0.6953125, + "learning_rate": 7.289137620747947e-05, + "loss": 1.4567, + "num_input_tokens_seen": 2329804800, + "step": 1975 + }, + { + "epoch": 0.044355314380087295, + "grad_norm": 0.625, + "learning_rate": 7.288249460259338e-05, + "loss": 1.5241, + "num_input_tokens_seen": 2330984448, + "step": 1976 + }, + { + "epoch": 0.044473280641736467, + "grad_norm": 0.73828125, + "learning_rate": 7.287360799458354e-05, + "loss": 1.4828, + "num_input_tokens_seen": 2332164096, + "step": 1977 + }, + { + "epoch": 0.04459124690338563, + "grad_norm": 0.6640625, + "learning_rate": 7.286471638480204e-05, + "loss": 1.6184, + "num_input_tokens_seen": 2333343744, + "step": 1978 + }, + { + "epoch": 0.0447092131650348, + "grad_norm": 0.73828125, + "learning_rate": 7.285581977460174e-05, + "loss": 1.3618, + "num_input_tokens_seen": 2334523392, + "step": 1979 + }, + { + "epoch": 0.04482717942668397, + "grad_norm": 0.73046875, + "learning_rate": 7.28469181653363e-05, + "loss": 1.3807, + "num_input_tokens_seen": 2335703040, + "step": 1980 + }, + { + "epoch": 0.04494514568833314, + "grad_norm": 0.72265625, + "learning_rate": 7.283801155836009e-05, + "loss": 1.5257, + "num_input_tokens_seen": 2336882688, + "step": 1981 + }, + { + "epoch": 0.045063111949982304, + "grad_norm": 0.69140625, + "learning_rate": 7.282909995502828e-05, + "loss": 1.5478, + "num_input_tokens_seen": 2338062336, + "step": 1982 + }, + { + "epoch": 0.045181078211631476, + "grad_norm": 0.703125, + "learning_rate": 7.282018335669678e-05, + "loss": 1.4212, + "num_input_tokens_seen": 2339241984, + "step": 1983 + }, + { + "epoch": 0.04529904447328064, + "grad_norm": 0.82421875, + "learning_rate": 7.281126176472226e-05, + "loss": 1.4443, + "num_input_tokens_seen": 2340421632, + "step": 1984 + }, + { + "epoch": 0.04541701073492981, + "grad_norm": 0.640625, + "learning_rate": 7.280233518046217e-05, + "loss": 1.544, + "num_input_tokens_seen": 2341601280, + "step": 1985 + }, + { + "epoch": 0.04553497699657898, + "grad_norm": 0.796875, + "learning_rate": 7.27934036052747e-05, + "loss": 1.498, + "num_input_tokens_seen": 2342780928, + "step": 1986 + }, + { + "epoch": 0.04565294325822815, + "grad_norm": 0.69140625, + "learning_rate": 7.278446704051878e-05, + "loss": 1.6416, + "num_input_tokens_seen": 2343960576, + "step": 1987 + }, + { + "epoch": 0.04577090951987731, + "grad_norm": 0.76171875, + "learning_rate": 7.277552548755414e-05, + "loss": 1.4801, + "num_input_tokens_seen": 2345140224, + "step": 1988 + }, + { + "epoch": 0.045888875781526485, + "grad_norm": 0.703125, + "learning_rate": 7.276657894774126e-05, + "loss": 1.4833, + "num_input_tokens_seen": 2346319872, + "step": 1989 + }, + { + "epoch": 0.04600684204317565, + "grad_norm": 0.73046875, + "learning_rate": 7.275762742244135e-05, + "loss": 1.5809, + "num_input_tokens_seen": 2347499520, + "step": 1990 + }, + { + "epoch": 0.04612480830482482, + "grad_norm": 0.73828125, + "learning_rate": 7.274867091301642e-05, + "loss": 1.553, + "num_input_tokens_seen": 2348679168, + "step": 1991 + }, + { + "epoch": 0.046242774566473986, + "grad_norm": 0.890625, + "learning_rate": 7.27397094208292e-05, + "loss": 1.3766, + "num_input_tokens_seen": 2349858816, + "step": 1992 + }, + { + "epoch": 0.04636074082812316, + "grad_norm": 0.75390625, + "learning_rate": 7.27307429472432e-05, + "loss": 1.5416, + "num_input_tokens_seen": 2351038464, + "step": 1993 + }, + { + "epoch": 0.04647870708977232, + "grad_norm": 0.8828125, + "learning_rate": 7.27217714936227e-05, + "loss": 1.4611, + "num_input_tokens_seen": 2352218112, + "step": 1994 + }, + { + "epoch": 0.046596673351421494, + "grad_norm": 0.71875, + "learning_rate": 7.271279506133269e-05, + "loss": 1.4342, + "num_input_tokens_seen": 2353397760, + "step": 1995 + }, + { + "epoch": 0.04671463961307066, + "grad_norm": 0.84765625, + "learning_rate": 7.270381365173897e-05, + "loss": 1.5395, + "num_input_tokens_seen": 2354577408, + "step": 1996 + }, + { + "epoch": 0.04683260587471983, + "grad_norm": 0.7265625, + "learning_rate": 7.269482726620807e-05, + "loss": 1.5751, + "num_input_tokens_seen": 2355757056, + "step": 1997 + }, + { + "epoch": 0.046950572136368995, + "grad_norm": 0.75, + "learning_rate": 7.268583590610729e-05, + "loss": 1.5388, + "num_input_tokens_seen": 2356936704, + "step": 1998 + }, + { + "epoch": 0.04706853839801817, + "grad_norm": 0.7421875, + "learning_rate": 7.267683957280466e-05, + "loss": 1.4628, + "num_input_tokens_seen": 2358116352, + "step": 1999 + }, + { + "epoch": 0.04718650465966733, + "grad_norm": 0.6875, + "learning_rate": 7.266783826766901e-05, + "loss": 1.6311, + "num_input_tokens_seen": 2359296000, + "step": 2000 + }, + { + "epoch": 0.04718650465966733, + "eval_wikipedia_loss": 2.2608883380889893, + "eval_wikipedia_runtime": 163.5557, + "eval_wikipedia_samples_per_second": 4.292, + "eval_wikipedia_steps_per_second": 0.183, + "num_input_tokens_seen": 2359296000, + "step": 2000 + }, + { + "epoch": 0.04718650465966733, + "eval_toxicity_loss": 4.0160722732543945, + "eval_toxicity_runtime": 1.1558, + "eval_toxicity_samples_per_second": 1.73, + "eval_toxicity_steps_per_second": 0.865, + "num_input_tokens_seen": 2359296000, + "step": 2000 + }, + { + "epoch": 0.047304470921316503, + "grad_norm": 0.74609375, + "learning_rate": 7.265883199206989e-05, + "loss": 1.4713, + "num_input_tokens_seen": 2360475648, + "step": 2001 + }, + { + "epoch": 0.047422437182965675, + "grad_norm": 0.6953125, + "learning_rate": 7.264982074737762e-05, + "loss": 1.4955, + "num_input_tokens_seen": 2361655296, + "step": 2002 + }, + { + "epoch": 0.04754040344461484, + "grad_norm": 0.67578125, + "learning_rate": 7.264080453496328e-05, + "loss": 1.5753, + "num_input_tokens_seen": 2362834944, + "step": 2003 + }, + { + "epoch": 0.04765836970626401, + "grad_norm": 0.6328125, + "learning_rate": 7.263178335619868e-05, + "loss": 1.6932, + "num_input_tokens_seen": 2364014592, + "step": 2004 + }, + { + "epoch": 0.047776335967913176, + "grad_norm": 0.7109375, + "learning_rate": 7.262275721245644e-05, + "loss": 1.6407, + "num_input_tokens_seen": 2365194240, + "step": 2005 + }, + { + "epoch": 0.04789430222956235, + "grad_norm": 0.59765625, + "learning_rate": 7.26137261051099e-05, + "loss": 1.698, + "num_input_tokens_seen": 2366373888, + "step": 2006 + }, + { + "epoch": 0.04801226849121151, + "grad_norm": 0.76953125, + "learning_rate": 7.260469003553312e-05, + "loss": 1.4249, + "num_input_tokens_seen": 2367553536, + "step": 2007 + }, + { + "epoch": 0.048130234752860684, + "grad_norm": 0.61328125, + "learning_rate": 7.259564900510098e-05, + "loss": 1.7002, + "num_input_tokens_seen": 2368733184, + "step": 2008 + }, + { + "epoch": 0.04824820101450985, + "grad_norm": 0.6796875, + "learning_rate": 7.258660301518909e-05, + "loss": 1.7053, + "num_input_tokens_seen": 2369912832, + "step": 2009 + }, + { + "epoch": 0.04836616727615902, + "grad_norm": 0.6875, + "learning_rate": 7.257755206717379e-05, + "loss": 1.4856, + "num_input_tokens_seen": 2371092480, + "step": 2010 + }, + { + "epoch": 0.048484133537808186, + "grad_norm": 0.72265625, + "learning_rate": 7.256849616243223e-05, + "loss": 1.4462, + "num_input_tokens_seen": 2372272128, + "step": 2011 + }, + { + "epoch": 0.04860209979945736, + "grad_norm": 0.70703125, + "learning_rate": 7.255943530234225e-05, + "loss": 1.4224, + "num_input_tokens_seen": 2373451776, + "step": 2012 + }, + { + "epoch": 0.04872006606110652, + "grad_norm": 0.63671875, + "learning_rate": 7.255036948828249e-05, + "loss": 1.6058, + "num_input_tokens_seen": 2374631424, + "step": 2013 + }, + { + "epoch": 0.048838032322755694, + "grad_norm": 0.69921875, + "learning_rate": 7.254129872163232e-05, + "loss": 1.4915, + "num_input_tokens_seen": 2375811072, + "step": 2014 + }, + { + "epoch": 0.04895599858440486, + "grad_norm": 0.73828125, + "learning_rate": 7.253222300377188e-05, + "loss": 1.5527, + "num_input_tokens_seen": 2376990720, + "step": 2015 + }, + { + "epoch": 0.04907396484605403, + "grad_norm": 0.640625, + "learning_rate": 7.252314233608204e-05, + "loss": 1.5796, + "num_input_tokens_seen": 2378170368, + "step": 2016 + }, + { + "epoch": 0.049191931107703195, + "grad_norm": 0.6875, + "learning_rate": 7.251405671994446e-05, + "loss": 1.6129, + "num_input_tokens_seen": 2379350016, + "step": 2017 + }, + { + "epoch": 0.049309897369352367, + "grad_norm": 0.6796875, + "learning_rate": 7.250496615674152e-05, + "loss": 1.5305, + "num_input_tokens_seen": 2380529664, + "step": 2018 + }, + { + "epoch": 0.04942786363100153, + "grad_norm": 0.6328125, + "learning_rate": 7.249587064785636e-05, + "loss": 1.5511, + "num_input_tokens_seen": 2381709312, + "step": 2019 + }, + { + "epoch": 0.0495458298926507, + "grad_norm": 0.69140625, + "learning_rate": 7.248677019467286e-05, + "loss": 1.5779, + "num_input_tokens_seen": 2382888960, + "step": 2020 + }, + { + "epoch": 0.04966379615429987, + "grad_norm": 0.65625, + "learning_rate": 7.24776647985757e-05, + "loss": 1.5857, + "num_input_tokens_seen": 2384068608, + "step": 2021 + }, + { + "epoch": 0.04978176241594904, + "grad_norm": 0.66796875, + "learning_rate": 7.246855446095028e-05, + "loss": 1.496, + "num_input_tokens_seen": 2385248256, + "step": 2022 + }, + { + "epoch": 0.049899728677598204, + "grad_norm": 0.6171875, + "learning_rate": 7.245943918318272e-05, + "loss": 1.49, + "num_input_tokens_seen": 2386427904, + "step": 2023 + }, + { + "epoch": 0.050017694939247376, + "grad_norm": 0.65625, + "learning_rate": 7.245031896665995e-05, + "loss": 1.6214, + "num_input_tokens_seen": 2387607552, + "step": 2024 + }, + { + "epoch": 0.05013566120089654, + "grad_norm": 0.6015625, + "learning_rate": 7.244119381276963e-05, + "loss": 1.6914, + "num_input_tokens_seen": 2388787200, + "step": 2025 + }, + { + "epoch": 0.05025362746254571, + "grad_norm": 0.66796875, + "learning_rate": 7.243206372290014e-05, + "loss": 1.5241, + "num_input_tokens_seen": 2389966848, + "step": 2026 + }, + { + "epoch": 0.05037159372419488, + "grad_norm": 0.6796875, + "learning_rate": 7.242292869844067e-05, + "loss": 1.6441, + "num_input_tokens_seen": 2391146496, + "step": 2027 + }, + { + "epoch": 0.05048955998584405, + "grad_norm": 0.6875, + "learning_rate": 7.24137887407811e-05, + "loss": 1.6763, + "num_input_tokens_seen": 2392326144, + "step": 2028 + }, + { + "epoch": 0.05060752624749322, + "grad_norm": 0.66796875, + "learning_rate": 7.24046438513121e-05, + "loss": 1.4772, + "num_input_tokens_seen": 2393505792, + "step": 2029 + }, + { + "epoch": 0.050725492509142385, + "grad_norm": 0.66796875, + "learning_rate": 7.239549403142509e-05, + "loss": 1.5272, + "num_input_tokens_seen": 2394685440, + "step": 2030 + }, + { + "epoch": 0.05084345877079156, + "grad_norm": 0.7265625, + "learning_rate": 7.238633928251221e-05, + "loss": 1.3924, + "num_input_tokens_seen": 2395865088, + "step": 2031 + }, + { + "epoch": 0.05096142503244072, + "grad_norm": 0.6015625, + "learning_rate": 7.237717960596639e-05, + "loss": 1.6087, + "num_input_tokens_seen": 2397044736, + "step": 2032 + }, + { + "epoch": 0.05107939129408989, + "grad_norm": 0.71875, + "learning_rate": 7.236801500318127e-05, + "loss": 1.5425, + "num_input_tokens_seen": 2398224384, + "step": 2033 + }, + { + "epoch": 0.05119735755573906, + "grad_norm": 0.68359375, + "learning_rate": 7.235884547555127e-05, + "loss": 1.5686, + "num_input_tokens_seen": 2399404032, + "step": 2034 + }, + { + "epoch": 0.05131532381738823, + "grad_norm": 0.6328125, + "learning_rate": 7.234967102447155e-05, + "loss": 1.6643, + "num_input_tokens_seen": 2400583680, + "step": 2035 + }, + { + "epoch": 0.051433290079037394, + "grad_norm": 0.6875, + "learning_rate": 7.234049165133801e-05, + "loss": 1.5434, + "num_input_tokens_seen": 2401763328, + "step": 2036 + }, + { + "epoch": 0.051551256340686566, + "grad_norm": 0.70703125, + "learning_rate": 7.23313073575473e-05, + "loss": 1.5236, + "num_input_tokens_seen": 2402942976, + "step": 2037 + }, + { + "epoch": 0.05166922260233573, + "grad_norm": 0.625, + "learning_rate": 7.232211814449686e-05, + "loss": 1.59, + "num_input_tokens_seen": 2404122624, + "step": 2038 + }, + { + "epoch": 0.0517871888639849, + "grad_norm": 0.625, + "learning_rate": 7.23129240135848e-05, + "loss": 1.5156, + "num_input_tokens_seen": 2405302272, + "step": 2039 + }, + { + "epoch": 0.05190515512563407, + "grad_norm": 0.62109375, + "learning_rate": 7.230372496621003e-05, + "loss": 1.6894, + "num_input_tokens_seen": 2406481920, + "step": 2040 + }, + { + "epoch": 0.05202312138728324, + "grad_norm": 0.66015625, + "learning_rate": 7.229452100377223e-05, + "loss": 1.5584, + "num_input_tokens_seen": 2407661568, + "step": 2041 + }, + { + "epoch": 0.052141087648932403, + "grad_norm": 0.70703125, + "learning_rate": 7.228531212767178e-05, + "loss": 1.5068, + "num_input_tokens_seen": 2408841216, + "step": 2042 + }, + { + "epoch": 0.052259053910581575, + "grad_norm": 0.67578125, + "learning_rate": 7.22760983393098e-05, + "loss": 1.595, + "num_input_tokens_seen": 2410020864, + "step": 2043 + }, + { + "epoch": 0.05237702017223074, + "grad_norm": 0.64453125, + "learning_rate": 7.226687964008822e-05, + "loss": 1.4293, + "num_input_tokens_seen": 2411200512, + "step": 2044 + }, + { + "epoch": 0.05249498643387991, + "grad_norm": 0.65625, + "learning_rate": 7.225765603140964e-05, + "loss": 1.4935, + "num_input_tokens_seen": 2412380160, + "step": 2045 + }, + { + "epoch": 0.052612952695529076, + "grad_norm": 0.640625, + "learning_rate": 7.22484275146775e-05, + "loss": 1.453, + "num_input_tokens_seen": 2413559808, + "step": 2046 + }, + { + "epoch": 0.05273091895717825, + "grad_norm": 0.625, + "learning_rate": 7.223919409129589e-05, + "loss": 1.5559, + "num_input_tokens_seen": 2414739456, + "step": 2047 + }, + { + "epoch": 0.05284888521882741, + "grad_norm": 0.640625, + "learning_rate": 7.222995576266971e-05, + "loss": 1.4826, + "num_input_tokens_seen": 2415919104, + "step": 2048 + }, + { + "epoch": 0.052966851480476584, + "grad_norm": 0.70703125, + "learning_rate": 7.222071253020457e-05, + "loss": 1.5566, + "num_input_tokens_seen": 2417098752, + "step": 2049 + }, + { + "epoch": 0.05308481774212575, + "grad_norm": 0.609375, + "learning_rate": 7.221146439530687e-05, + "loss": 1.6199, + "num_input_tokens_seen": 2418278400, + "step": 2050 + }, + { + "epoch": 0.05320278400377492, + "grad_norm": 0.625, + "learning_rate": 7.220221135938369e-05, + "loss": 1.5922, + "num_input_tokens_seen": 2419458048, + "step": 2051 + }, + { + "epoch": 0.053320750265424086, + "grad_norm": 0.59375, + "learning_rate": 7.219295342384293e-05, + "loss": 1.5259, + "num_input_tokens_seen": 2420637696, + "step": 2052 + }, + { + "epoch": 0.05343871652707326, + "grad_norm": 0.58984375, + "learning_rate": 7.218369059009319e-05, + "loss": 1.6072, + "num_input_tokens_seen": 2421817344, + "step": 2053 + }, + { + "epoch": 0.05355668278872242, + "grad_norm": 0.671875, + "learning_rate": 7.217442285954381e-05, + "loss": 1.6169, + "num_input_tokens_seen": 2422996992, + "step": 2054 + }, + { + "epoch": 0.053674649050371594, + "grad_norm": 0.62109375, + "learning_rate": 7.21651502336049e-05, + "loss": 1.5768, + "num_input_tokens_seen": 2424176640, + "step": 2055 + }, + { + "epoch": 0.053792615312020765, + "grad_norm": 0.609375, + "learning_rate": 7.21558727136873e-05, + "loss": 1.5223, + "num_input_tokens_seen": 2425356288, + "step": 2056 + }, + { + "epoch": 0.05391058157366993, + "grad_norm": 0.59765625, + "learning_rate": 7.214659030120262e-05, + "loss": 1.5783, + "num_input_tokens_seen": 2426535936, + "step": 2057 + }, + { + "epoch": 0.0540285478353191, + "grad_norm": 0.71875, + "learning_rate": 7.213730299756317e-05, + "loss": 1.4265, + "num_input_tokens_seen": 2427715584, + "step": 2058 + }, + { + "epoch": 0.054146514096968267, + "grad_norm": 0.6484375, + "learning_rate": 7.212801080418204e-05, + "loss": 1.5677, + "num_input_tokens_seen": 2428895232, + "step": 2059 + }, + { + "epoch": 0.05426448035861744, + "grad_norm": 0.6171875, + "learning_rate": 7.211871372247304e-05, + "loss": 1.5219, + "num_input_tokens_seen": 2430074880, + "step": 2060 + }, + { + "epoch": 0.0543824466202666, + "grad_norm": 0.66796875, + "learning_rate": 7.210941175385075e-05, + "loss": 1.5097, + "num_input_tokens_seen": 2431254528, + "step": 2061 + }, + { + "epoch": 0.054500412881915775, + "grad_norm": 0.66015625, + "learning_rate": 7.210010489973047e-05, + "loss": 1.7001, + "num_input_tokens_seen": 2432434176, + "step": 2062 + }, + { + "epoch": 0.05461837914356494, + "grad_norm": 0.640625, + "learning_rate": 7.209079316152826e-05, + "loss": 1.5308, + "num_input_tokens_seen": 2433613824, + "step": 2063 + }, + { + "epoch": 0.05473634540521411, + "grad_norm": 0.63671875, + "learning_rate": 7.208147654066091e-05, + "loss": 1.5989, + "num_input_tokens_seen": 2434793472, + "step": 2064 + }, + { + "epoch": 0.054854311666863276, + "grad_norm": 0.6328125, + "learning_rate": 7.207215503854596e-05, + "loss": 1.7482, + "num_input_tokens_seen": 2435973120, + "step": 2065 + }, + { + "epoch": 0.05497227792851245, + "grad_norm": 0.6875, + "learning_rate": 7.206282865660169e-05, + "loss": 1.6143, + "num_input_tokens_seen": 2437152768, + "step": 2066 + }, + { + "epoch": 0.05509024419016161, + "grad_norm": 0.6484375, + "learning_rate": 7.205349739624711e-05, + "loss": 1.6127, + "num_input_tokens_seen": 2438332416, + "step": 2067 + }, + { + "epoch": 0.055208210451810784, + "grad_norm": 0.64453125, + "learning_rate": 7.204416125890203e-05, + "loss": 1.4988, + "num_input_tokens_seen": 2439512064, + "step": 2068 + }, + { + "epoch": 0.05532617671345995, + "grad_norm": 0.61328125, + "learning_rate": 7.20348202459869e-05, + "loss": 1.5739, + "num_input_tokens_seen": 2440691712, + "step": 2069 + }, + { + "epoch": 0.05544414297510912, + "grad_norm": 0.65234375, + "learning_rate": 7.202547435892302e-05, + "loss": 1.5308, + "num_input_tokens_seen": 2441871360, + "step": 2070 + }, + { + "epoch": 0.055562109236758285, + "grad_norm": 0.625, + "learning_rate": 7.201612359913235e-05, + "loss": 1.5948, + "num_input_tokens_seen": 2443051008, + "step": 2071 + }, + { + "epoch": 0.05568007549840746, + "grad_norm": 0.67578125, + "learning_rate": 7.200676796803765e-05, + "loss": 1.5753, + "num_input_tokens_seen": 2444230656, + "step": 2072 + }, + { + "epoch": 0.05579804176005662, + "grad_norm": 0.6484375, + "learning_rate": 7.199740746706235e-05, + "loss": 1.5194, + "num_input_tokens_seen": 2445410304, + "step": 2073 + }, + { + "epoch": 0.05591600802170579, + "grad_norm": 0.68359375, + "learning_rate": 7.198804209763071e-05, + "loss": 1.4933, + "num_input_tokens_seen": 2446589952, + "step": 2074 + }, + { + "epoch": 0.05603397428335496, + "grad_norm": 0.65234375, + "learning_rate": 7.197867186116767e-05, + "loss": 1.5015, + "num_input_tokens_seen": 2447769600, + "step": 2075 + }, + { + "epoch": 0.05615194054500413, + "grad_norm": 0.63671875, + "learning_rate": 7.196929675909893e-05, + "loss": 1.6335, + "num_input_tokens_seen": 2448949248, + "step": 2076 + }, + { + "epoch": 0.056269906806653294, + "grad_norm": 0.6171875, + "learning_rate": 7.195991679285091e-05, + "loss": 1.5782, + "num_input_tokens_seen": 2450128896, + "step": 2077 + }, + { + "epoch": 0.056387873068302466, + "grad_norm": 0.703125, + "learning_rate": 7.19505319638508e-05, + "loss": 1.4924, + "num_input_tokens_seen": 2451308544, + "step": 2078 + }, + { + "epoch": 0.05650583932995163, + "grad_norm": 0.69921875, + "learning_rate": 7.194114227352653e-05, + "loss": 1.5547, + "num_input_tokens_seen": 2452488192, + "step": 2079 + }, + { + "epoch": 0.0566238055916008, + "grad_norm": 0.61328125, + "learning_rate": 7.193174772330673e-05, + "loss": 1.4579, + "num_input_tokens_seen": 2453667840, + "step": 2080 + }, + { + "epoch": 0.05674177185324997, + "grad_norm": 0.7421875, + "learning_rate": 7.19223483146208e-05, + "loss": 1.4006, + "num_input_tokens_seen": 2454847488, + "step": 2081 + }, + { + "epoch": 0.05685973811489914, + "grad_norm": 0.6875, + "learning_rate": 7.191294404889891e-05, + "loss": 1.4052, + "num_input_tokens_seen": 2456027136, + "step": 2082 + }, + { + "epoch": 0.05697770437654831, + "grad_norm": 0.6640625, + "learning_rate": 7.190353492757186e-05, + "loss": 1.5845, + "num_input_tokens_seen": 2457206784, + "step": 2083 + }, + { + "epoch": 0.057095670638197475, + "grad_norm": 0.66796875, + "learning_rate": 7.189412095207136e-05, + "loss": 1.5037, + "num_input_tokens_seen": 2458386432, + "step": 2084 + }, + { + "epoch": 0.05721363689984665, + "grad_norm": 0.76953125, + "learning_rate": 7.188470212382968e-05, + "loss": 1.5359, + "num_input_tokens_seen": 2459566080, + "step": 2085 + }, + { + "epoch": 0.05733160316149581, + "grad_norm": 0.84375, + "learning_rate": 7.187527844427995e-05, + "loss": 1.665, + "num_input_tokens_seen": 2460745728, + "step": 2086 + }, + { + "epoch": 0.05744956942314498, + "grad_norm": 0.6875, + "learning_rate": 7.186584991485599e-05, + "loss": 1.6403, + "num_input_tokens_seen": 2461925376, + "step": 2087 + }, + { + "epoch": 0.05756753568479415, + "grad_norm": 0.7265625, + "learning_rate": 7.185641653699234e-05, + "loss": 1.3332, + "num_input_tokens_seen": 2463105024, + "step": 2088 + }, + { + "epoch": 0.05768550194644332, + "grad_norm": 0.609375, + "learning_rate": 7.184697831212435e-05, + "loss": 1.6377, + "num_input_tokens_seen": 2464284672, + "step": 2089 + }, + { + "epoch": 0.057803468208092484, + "grad_norm": 0.73828125, + "learning_rate": 7.183753524168803e-05, + "loss": 1.4486, + "num_input_tokens_seen": 2465464320, + "step": 2090 + }, + { + "epoch": 0.057921434469741656, + "grad_norm": 0.66015625, + "learning_rate": 7.182808732712018e-05, + "loss": 1.4627, + "num_input_tokens_seen": 2466643968, + "step": 2091 + }, + { + "epoch": 0.05803940073139082, + "grad_norm": 0.65234375, + "learning_rate": 7.181863456985827e-05, + "loss": 1.6655, + "num_input_tokens_seen": 2467823616, + "step": 2092 + }, + { + "epoch": 0.05815736699303999, + "grad_norm": 0.63671875, + "learning_rate": 7.180917697134061e-05, + "loss": 1.499, + "num_input_tokens_seen": 2469003264, + "step": 2093 + }, + { + "epoch": 0.05827533325468916, + "grad_norm": 0.79296875, + "learning_rate": 7.179971453300615e-05, + "loss": 1.5666, + "num_input_tokens_seen": 2470182912, + "step": 2094 + }, + { + "epoch": 0.05839329951633833, + "grad_norm": 0.62109375, + "learning_rate": 7.179024725629464e-05, + "loss": 1.754, + "num_input_tokens_seen": 2471362560, + "step": 2095 + }, + { + "epoch": 0.058511265777987494, + "grad_norm": 0.6484375, + "learning_rate": 7.178077514264652e-05, + "loss": 1.6832, + "num_input_tokens_seen": 2472542208, + "step": 2096 + }, + { + "epoch": 0.058629232039636665, + "grad_norm": 0.7109375, + "learning_rate": 7.177129819350299e-05, + "loss": 1.4757, + "num_input_tokens_seen": 2473721856, + "step": 2097 + }, + { + "epoch": 0.05874719830128583, + "grad_norm": 0.59765625, + "learning_rate": 7.1761816410306e-05, + "loss": 1.4754, + "num_input_tokens_seen": 2474901504, + "step": 2098 + }, + { + "epoch": 0.058865164562935, + "grad_norm": 0.59375, + "learning_rate": 7.175232979449819e-05, + "loss": 1.677, + "num_input_tokens_seen": 2476081152, + "step": 2099 + }, + { + "epoch": 0.058983130824584167, + "grad_norm": 0.63671875, + "learning_rate": 7.174283834752299e-05, + "loss": 1.588, + "num_input_tokens_seen": 2477260800, + "step": 2100 + }, + { + "epoch": 0.05910109708623334, + "grad_norm": 0.72265625, + "learning_rate": 7.173334207082453e-05, + "loss": 1.5977, + "num_input_tokens_seen": 2478440448, + "step": 2101 + }, + { + "epoch": 0.0592190633478825, + "grad_norm": 0.68359375, + "learning_rate": 7.172384096584769e-05, + "loss": 1.4083, + "num_input_tokens_seen": 2479620096, + "step": 2102 + }, + { + "epoch": 0.059337029609531675, + "grad_norm": 0.63671875, + "learning_rate": 7.171433503403805e-05, + "loss": 1.7229, + "num_input_tokens_seen": 2480799744, + "step": 2103 + }, + { + "epoch": 0.05945499587118084, + "grad_norm": 0.62890625, + "learning_rate": 7.170482427684199e-05, + "loss": 1.5378, + "num_input_tokens_seen": 2481979392, + "step": 2104 + }, + { + "epoch": 0.05957296213283001, + "grad_norm": 0.6796875, + "learning_rate": 7.169530869570655e-05, + "loss": 1.6472, + "num_input_tokens_seen": 2483159040, + "step": 2105 + }, + { + "epoch": 0.059690928394479176, + "grad_norm": 0.71484375, + "learning_rate": 7.168578829207958e-05, + "loss": 1.5256, + "num_input_tokens_seen": 2484338688, + "step": 2106 + }, + { + "epoch": 0.05980889465612835, + "grad_norm": 0.6171875, + "learning_rate": 7.167626306740961e-05, + "loss": 1.6106, + "num_input_tokens_seen": 2485518336, + "step": 2107 + }, + { + "epoch": 0.05992686091777751, + "grad_norm": 0.61328125, + "learning_rate": 7.16667330231459e-05, + "loss": 1.6116, + "num_input_tokens_seen": 2486697984, + "step": 2108 + }, + { + "epoch": 0.060044827179426684, + "grad_norm": 0.67578125, + "learning_rate": 7.165719816073848e-05, + "loss": 1.553, + "num_input_tokens_seen": 2487877632, + "step": 2109 + }, + { + "epoch": 0.060162793441075856, + "grad_norm": 0.7109375, + "learning_rate": 7.164765848163809e-05, + "loss": 1.5079, + "num_input_tokens_seen": 2489057280, + "step": 2110 + }, + { + "epoch": 0.06028075970272502, + "grad_norm": 0.6171875, + "learning_rate": 7.16381139872962e-05, + "loss": 1.7166, + "num_input_tokens_seen": 2490236928, + "step": 2111 + }, + { + "epoch": 0.06039872596437419, + "grad_norm": 0.69140625, + "learning_rate": 7.162856467916504e-05, + "loss": 1.5981, + "num_input_tokens_seen": 2491416576, + "step": 2112 + }, + { + "epoch": 0.06051669222602336, + "grad_norm": 0.67578125, + "learning_rate": 7.161901055869752e-05, + "loss": 1.4822, + "num_input_tokens_seen": 2492596224, + "step": 2113 + }, + { + "epoch": 0.06063465848767253, + "grad_norm": 0.65234375, + "learning_rate": 7.160945162734735e-05, + "loss": 1.6089, + "num_input_tokens_seen": 2493775872, + "step": 2114 + }, + { + "epoch": 0.06075262474932169, + "grad_norm": 0.67578125, + "learning_rate": 7.159988788656892e-05, + "loss": 1.5562, + "num_input_tokens_seen": 2494955520, + "step": 2115 + }, + { + "epoch": 0.060870591010970865, + "grad_norm": 0.65234375, + "learning_rate": 7.159031933781736e-05, + "loss": 1.5186, + "num_input_tokens_seen": 2496135168, + "step": 2116 + }, + { + "epoch": 0.06098855727262003, + "grad_norm": 0.6953125, + "learning_rate": 7.158074598254855e-05, + "loss": 1.3422, + "num_input_tokens_seen": 2497314816, + "step": 2117 + }, + { + "epoch": 0.0611065235342692, + "grad_norm": 0.59765625, + "learning_rate": 7.157116782221909e-05, + "loss": 1.7471, + "num_input_tokens_seen": 2498494464, + "step": 2118 + }, + { + "epoch": 0.061224489795918366, + "grad_norm": 0.6328125, + "learning_rate": 7.156158485828631e-05, + "loss": 1.6097, + "num_input_tokens_seen": 2499674112, + "step": 2119 + }, + { + "epoch": 0.06134245605756754, + "grad_norm": 0.63671875, + "learning_rate": 7.155199709220828e-05, + "loss": 1.4928, + "num_input_tokens_seen": 2500853760, + "step": 2120 + }, + { + "epoch": 0.0614604223192167, + "grad_norm": 0.62890625, + "learning_rate": 7.154240452544378e-05, + "loss": 1.5735, + "num_input_tokens_seen": 2502033408, + "step": 2121 + }, + { + "epoch": 0.061578388580865874, + "grad_norm": 0.72265625, + "learning_rate": 7.153280715945235e-05, + "loss": 1.5354, + "num_input_tokens_seen": 2503213056, + "step": 2122 + }, + { + "epoch": 0.06169635484251504, + "grad_norm": 0.6953125, + "learning_rate": 7.152320499569425e-05, + "loss": 1.5084, + "num_input_tokens_seen": 2504392704, + "step": 2123 + }, + { + "epoch": 0.06181432110416421, + "grad_norm": 0.6328125, + "learning_rate": 7.151359803563042e-05, + "loss": 1.4934, + "num_input_tokens_seen": 2505572352, + "step": 2124 + }, + { + "epoch": 0.061932287365813375, + "grad_norm": 0.671875, + "learning_rate": 7.150398628072263e-05, + "loss": 1.4994, + "num_input_tokens_seen": 2506752000, + "step": 2125 + }, + { + "epoch": 0.06205025362746255, + "grad_norm": 0.65234375, + "learning_rate": 7.14943697324333e-05, + "loss": 1.4492, + "num_input_tokens_seen": 2507931648, + "step": 2126 + }, + { + "epoch": 0.06216821988911171, + "grad_norm": 0.6328125, + "learning_rate": 7.148474839222559e-05, + "loss": 1.7602, + "num_input_tokens_seen": 2509111296, + "step": 2127 + }, + { + "epoch": 0.06228618615076088, + "grad_norm": 0.6484375, + "learning_rate": 7.147512226156342e-05, + "loss": 1.5096, + "num_input_tokens_seen": 2510290944, + "step": 2128 + }, + { + "epoch": 0.06240415241241005, + "grad_norm": 0.65234375, + "learning_rate": 7.146549134191142e-05, + "loss": 1.6076, + "num_input_tokens_seen": 2511470592, + "step": 2129 + }, + { + "epoch": 0.06252211867405921, + "grad_norm": 0.64453125, + "learning_rate": 7.145585563473495e-05, + "loss": 1.449, + "num_input_tokens_seen": 2512650240, + "step": 2130 + }, + { + "epoch": 0.06264008493570838, + "grad_norm": 0.6484375, + "learning_rate": 7.14462151415001e-05, + "loss": 1.5525, + "num_input_tokens_seen": 2513829888, + "step": 2131 + }, + { + "epoch": 0.06275805119735756, + "grad_norm": 0.62890625, + "learning_rate": 7.143656986367368e-05, + "loss": 1.4403, + "num_input_tokens_seen": 2515009536, + "step": 2132 + }, + { + "epoch": 0.06287601745900673, + "grad_norm": 0.62890625, + "learning_rate": 7.142691980272322e-05, + "loss": 1.5139, + "num_input_tokens_seen": 2516189184, + "step": 2133 + }, + { + "epoch": 0.06299398372065589, + "grad_norm": 0.67578125, + "learning_rate": 7.141726496011701e-05, + "loss": 1.4524, + "num_input_tokens_seen": 2517368832, + "step": 2134 + }, + { + "epoch": 0.06311194998230506, + "grad_norm": 0.68359375, + "learning_rate": 7.140760533732405e-05, + "loss": 1.4579, + "num_input_tokens_seen": 2518548480, + "step": 2135 + }, + { + "epoch": 0.06322991624395423, + "grad_norm": 0.61328125, + "learning_rate": 7.139794093581407e-05, + "loss": 1.6199, + "num_input_tokens_seen": 2519728128, + "step": 2136 + }, + { + "epoch": 0.0633478825056034, + "grad_norm": 0.83203125, + "learning_rate": 7.13882717570575e-05, + "loss": 1.6241, + "num_input_tokens_seen": 2520907776, + "step": 2137 + }, + { + "epoch": 0.06346584876725257, + "grad_norm": 0.72265625, + "learning_rate": 7.137859780252555e-05, + "loss": 1.6226, + "num_input_tokens_seen": 2522087424, + "step": 2138 + }, + { + "epoch": 0.06358381502890173, + "grad_norm": 0.78515625, + "learning_rate": 7.136891907369012e-05, + "loss": 1.507, + "num_input_tokens_seen": 2523267072, + "step": 2139 + }, + { + "epoch": 0.0637017812905509, + "grad_norm": 0.734375, + "learning_rate": 7.135923557202383e-05, + "loss": 1.3917, + "num_input_tokens_seen": 2524446720, + "step": 2140 + }, + { + "epoch": 0.06381974755220007, + "grad_norm": 0.66796875, + "learning_rate": 7.134954729900006e-05, + "loss": 1.4087, + "num_input_tokens_seen": 2525626368, + "step": 2141 + }, + { + "epoch": 0.06393771381384925, + "grad_norm": 0.7109375, + "learning_rate": 7.133985425609288e-05, + "loss": 1.5533, + "num_input_tokens_seen": 2526806016, + "step": 2142 + }, + { + "epoch": 0.0640556800754984, + "grad_norm": 0.73046875, + "learning_rate": 7.13301564447771e-05, + "loss": 1.4244, + "num_input_tokens_seen": 2527985664, + "step": 2143 + }, + { + "epoch": 0.06417364633714757, + "grad_norm": 0.62109375, + "learning_rate": 7.132045386652829e-05, + "loss": 1.5217, + "num_input_tokens_seen": 2529165312, + "step": 2144 + }, + { + "epoch": 0.06429161259879675, + "grad_norm": 0.7265625, + "learning_rate": 7.131074652282268e-05, + "loss": 1.4388, + "num_input_tokens_seen": 2530344960, + "step": 2145 + }, + { + "epoch": 0.06440957886044592, + "grad_norm": 0.6328125, + "learning_rate": 7.130103441513726e-05, + "loss": 1.6643, + "num_input_tokens_seen": 2531524608, + "step": 2146 + }, + { + "epoch": 0.06452754512209508, + "grad_norm": 0.64453125, + "learning_rate": 7.129131754494975e-05, + "loss": 1.6273, + "num_input_tokens_seen": 2532704256, + "step": 2147 + }, + { + "epoch": 0.06464551138374425, + "grad_norm": 0.6796875, + "learning_rate": 7.128159591373859e-05, + "loss": 1.5731, + "num_input_tokens_seen": 2533883904, + "step": 2148 + }, + { + "epoch": 0.06476347764539342, + "grad_norm": 0.6640625, + "learning_rate": 7.127186952298293e-05, + "loss": 1.5956, + "num_input_tokens_seen": 2535063552, + "step": 2149 + }, + { + "epoch": 0.06488144390704259, + "grad_norm": 0.703125, + "learning_rate": 7.126213837416267e-05, + "loss": 1.5309, + "num_input_tokens_seen": 2536243200, + "step": 2150 + }, + { + "epoch": 0.06499941016869175, + "grad_norm": 0.6328125, + "learning_rate": 7.125240246875841e-05, + "loss": 1.5006, + "num_input_tokens_seen": 2537422848, + "step": 2151 + }, + { + "epoch": 0.06511737643034092, + "grad_norm": 0.71484375, + "learning_rate": 7.124266180825148e-05, + "loss": 1.6113, + "num_input_tokens_seen": 2538602496, + "step": 2152 + }, + { + "epoch": 0.06523534269199009, + "grad_norm": 0.62109375, + "learning_rate": 7.123291639412395e-05, + "loss": 1.4539, + "num_input_tokens_seen": 2539782144, + "step": 2153 + }, + { + "epoch": 0.06535330895363926, + "grad_norm": 0.7109375, + "learning_rate": 7.12231662278586e-05, + "loss": 1.4511, + "num_input_tokens_seen": 2540961792, + "step": 2154 + }, + { + "epoch": 0.06547127521528842, + "grad_norm": 0.5625, + "learning_rate": 7.121341131093892e-05, + "loss": 1.5934, + "num_input_tokens_seen": 2542141440, + "step": 2155 + }, + { + "epoch": 0.0655892414769376, + "grad_norm": 0.625, + "learning_rate": 7.120365164484915e-05, + "loss": 1.5815, + "num_input_tokens_seen": 2543321088, + "step": 2156 + }, + { + "epoch": 0.06570720773858676, + "grad_norm": 0.73828125, + "learning_rate": 7.119388723107422e-05, + "loss": 1.5041, + "num_input_tokens_seen": 2544500736, + "step": 2157 + }, + { + "epoch": 0.06582517400023594, + "grad_norm": 0.62109375, + "learning_rate": 7.118411807109983e-05, + "loss": 1.5656, + "num_input_tokens_seen": 2545680384, + "step": 2158 + }, + { + "epoch": 0.0659431402618851, + "grad_norm": 0.68359375, + "learning_rate": 7.117434416641234e-05, + "loss": 1.5776, + "num_input_tokens_seen": 2546860032, + "step": 2159 + }, + { + "epoch": 0.06606110652353427, + "grad_norm": 0.64453125, + "learning_rate": 7.116456551849889e-05, + "loss": 1.4988, + "num_input_tokens_seen": 2548039680, + "step": 2160 + }, + { + "epoch": 0.06617907278518344, + "grad_norm": 0.6484375, + "learning_rate": 7.11547821288473e-05, + "loss": 1.374, + "num_input_tokens_seen": 2549219328, + "step": 2161 + }, + { + "epoch": 0.06629703904683261, + "grad_norm": 0.59375, + "learning_rate": 7.114499399894614e-05, + "loss": 1.4518, + "num_input_tokens_seen": 2550398976, + "step": 2162 + }, + { + "epoch": 0.06641500530848178, + "grad_norm": 0.6171875, + "learning_rate": 7.113520113028468e-05, + "loss": 1.6117, + "num_input_tokens_seen": 2551578624, + "step": 2163 + }, + { + "epoch": 0.06653297157013094, + "grad_norm": 0.6171875, + "learning_rate": 7.112540352435294e-05, + "loss": 1.4237, + "num_input_tokens_seen": 2552758272, + "step": 2164 + }, + { + "epoch": 0.06665093783178011, + "grad_norm": 0.66015625, + "learning_rate": 7.111560118264162e-05, + "loss": 1.4234, + "num_input_tokens_seen": 2553937920, + "step": 2165 + }, + { + "epoch": 0.06676890409342928, + "grad_norm": 0.60546875, + "learning_rate": 7.110579410664217e-05, + "loss": 1.5433, + "num_input_tokens_seen": 2555117568, + "step": 2166 + }, + { + "epoch": 0.06688687035507845, + "grad_norm": 0.6328125, + "learning_rate": 7.109598229784675e-05, + "loss": 1.6265, + "num_input_tokens_seen": 2556297216, + "step": 2167 + }, + { + "epoch": 0.06700483661672761, + "grad_norm": 0.59765625, + "learning_rate": 7.108616575774824e-05, + "loss": 1.599, + "num_input_tokens_seen": 2557476864, + "step": 2168 + }, + { + "epoch": 0.06712280287837678, + "grad_norm": 0.6796875, + "learning_rate": 7.107634448784025e-05, + "loss": 1.7717, + "num_input_tokens_seen": 2558656512, + "step": 2169 + }, + { + "epoch": 0.06724076914002595, + "grad_norm": 0.71484375, + "learning_rate": 7.106651848961711e-05, + "loss": 1.4123, + "num_input_tokens_seen": 2559836160, + "step": 2170 + }, + { + "epoch": 0.06735873540167513, + "grad_norm": 0.734375, + "learning_rate": 7.105668776457384e-05, + "loss": 1.4984, + "num_input_tokens_seen": 2561015808, + "step": 2171 + }, + { + "epoch": 0.06747670166332428, + "grad_norm": 0.7734375, + "learning_rate": 7.10468523142062e-05, + "loss": 1.5935, + "num_input_tokens_seen": 2562195456, + "step": 2172 + }, + { + "epoch": 0.06759466792497346, + "grad_norm": 0.76171875, + "learning_rate": 7.10370121400107e-05, + "loss": 1.4926, + "num_input_tokens_seen": 2563375104, + "step": 2173 + }, + { + "epoch": 0.06771263418662263, + "grad_norm": 0.8671875, + "learning_rate": 7.102716724348449e-05, + "loss": 1.45, + "num_input_tokens_seen": 2564554752, + "step": 2174 + }, + { + "epoch": 0.0678306004482718, + "grad_norm": 0.6484375, + "learning_rate": 7.101731762612554e-05, + "loss": 1.6375, + "num_input_tokens_seen": 2565734400, + "step": 2175 + }, + { + "epoch": 0.06794856670992096, + "grad_norm": 0.88671875, + "learning_rate": 7.100746328943245e-05, + "loss": 1.3712, + "num_input_tokens_seen": 2566914048, + "step": 2176 + }, + { + "epoch": 0.06806653297157013, + "grad_norm": 0.65625, + "learning_rate": 7.099760423490457e-05, + "loss": 1.5398, + "num_input_tokens_seen": 2568093696, + "step": 2177 + }, + { + "epoch": 0.0681844992332193, + "grad_norm": 0.83203125, + "learning_rate": 7.098774046404199e-05, + "loss": 1.5061, + "num_input_tokens_seen": 2569273344, + "step": 2178 + }, + { + "epoch": 0.06830246549486847, + "grad_norm": 0.69140625, + "learning_rate": 7.09778719783455e-05, + "loss": 1.4503, + "num_input_tokens_seen": 2570452992, + "step": 2179 + }, + { + "epoch": 0.06842043175651763, + "grad_norm": 0.8515625, + "learning_rate": 7.096799877931659e-05, + "loss": 1.6409, + "num_input_tokens_seen": 2571632640, + "step": 2180 + }, + { + "epoch": 0.0685383980181668, + "grad_norm": 0.671875, + "learning_rate": 7.095812086845749e-05, + "loss": 1.6075, + "num_input_tokens_seen": 2572812288, + "step": 2181 + }, + { + "epoch": 0.06865636427981597, + "grad_norm": 0.78515625, + "learning_rate": 7.094823824727114e-05, + "loss": 1.4472, + "num_input_tokens_seen": 2573991936, + "step": 2182 + }, + { + "epoch": 0.06877433054146515, + "grad_norm": 0.6796875, + "learning_rate": 7.09383509172612e-05, + "loss": 1.4211, + "num_input_tokens_seen": 2575171584, + "step": 2183 + }, + { + "epoch": 0.0688922968031143, + "grad_norm": 0.70703125, + "learning_rate": 7.092845887993201e-05, + "loss": 1.5067, + "num_input_tokens_seen": 2576351232, + "step": 2184 + }, + { + "epoch": 0.06901026306476347, + "grad_norm": 0.68359375, + "learning_rate": 7.091856213678872e-05, + "loss": 1.6708, + "num_input_tokens_seen": 2577530880, + "step": 2185 + }, + { + "epoch": 0.06912822932641265, + "grad_norm": 0.69921875, + "learning_rate": 7.09086606893371e-05, + "loss": 1.5261, + "num_input_tokens_seen": 2578710528, + "step": 2186 + }, + { + "epoch": 0.06924619558806182, + "grad_norm": 0.7109375, + "learning_rate": 7.089875453908366e-05, + "loss": 1.476, + "num_input_tokens_seen": 2579890176, + "step": 2187 + }, + { + "epoch": 0.06936416184971098, + "grad_norm": 0.8046875, + "learning_rate": 7.088884368753566e-05, + "loss": 1.587, + "num_input_tokens_seen": 2581069824, + "step": 2188 + }, + { + "epoch": 0.06948212811136015, + "grad_norm": 0.68359375, + "learning_rate": 7.087892813620104e-05, + "loss": 1.52, + "num_input_tokens_seen": 2582249472, + "step": 2189 + }, + { + "epoch": 0.06960009437300932, + "grad_norm": 0.8515625, + "learning_rate": 7.086900788658848e-05, + "loss": 1.4511, + "num_input_tokens_seen": 2583429120, + "step": 2190 + }, + { + "epoch": 0.06971806063465849, + "grad_norm": 0.61328125, + "learning_rate": 7.085908294020734e-05, + "loss": 1.5586, + "num_input_tokens_seen": 2584608768, + "step": 2191 + }, + { + "epoch": 0.06983602689630766, + "grad_norm": 0.71484375, + "learning_rate": 7.084915329856773e-05, + "loss": 1.3686, + "num_input_tokens_seen": 2585788416, + "step": 2192 + }, + { + "epoch": 0.06995399315795682, + "grad_norm": 0.65625, + "learning_rate": 7.083921896318045e-05, + "loss": 1.5593, + "num_input_tokens_seen": 2586968064, + "step": 2193 + }, + { + "epoch": 0.07007195941960599, + "grad_norm": 0.65234375, + "learning_rate": 7.082927993555704e-05, + "loss": 1.4251, + "num_input_tokens_seen": 2588147712, + "step": 2194 + }, + { + "epoch": 0.07018992568125516, + "grad_norm": 0.703125, + "learning_rate": 7.081933621720973e-05, + "loss": 1.5143, + "num_input_tokens_seen": 2589327360, + "step": 2195 + }, + { + "epoch": 0.07030789194290434, + "grad_norm": 0.6953125, + "learning_rate": 7.080938780965148e-05, + "loss": 1.6101, + "num_input_tokens_seen": 2590507008, + "step": 2196 + }, + { + "epoch": 0.0704258582045535, + "grad_norm": 0.6640625, + "learning_rate": 7.079943471439593e-05, + "loss": 1.5199, + "num_input_tokens_seen": 2591686656, + "step": 2197 + }, + { + "epoch": 0.07054382446620266, + "grad_norm": 0.67578125, + "learning_rate": 7.078947693295751e-05, + "loss": 1.6252, + "num_input_tokens_seen": 2592866304, + "step": 2198 + }, + { + "epoch": 0.07066179072785184, + "grad_norm": 0.73046875, + "learning_rate": 7.077951446685128e-05, + "loss": 1.5631, + "num_input_tokens_seen": 2594045952, + "step": 2199 + }, + { + "epoch": 0.07077975698950101, + "grad_norm": 0.58984375, + "learning_rate": 7.076954731759302e-05, + "loss": 1.6381, + "num_input_tokens_seen": 2595225600, + "step": 2200 + }, + { + "epoch": 0.07077975698950101, + "eval_wikipedia_loss": 2.2444777488708496, + "eval_wikipedia_runtime": 162.8868, + "eval_wikipedia_samples_per_second": 4.31, + "eval_wikipedia_steps_per_second": 0.184, + "num_input_tokens_seen": 2595225600, + "step": 2200 + }, + { + "epoch": 0.07077975698950101, + "eval_toxicity_loss": 4.000207901000977, + "eval_toxicity_runtime": 1.0596, + "eval_toxicity_samples_per_second": 1.887, + "eval_toxicity_steps_per_second": 0.944, + "num_input_tokens_seen": 2595225600, + "step": 2200 + }, + { + "epoch": 0.00011796626164916834, + "grad_norm": 0.93359375, + "learning_rate": 7.07595754866993e-05, + "loss": 1.4676, + "num_input_tokens_seen": 2596405248, + "step": 2201 + }, + { + "epoch": 0.0002359325232983367, + "grad_norm": 0.6796875, + "learning_rate": 7.074959897568731e-05, + "loss": 1.3075, + "num_input_tokens_seen": 2597584896, + "step": 2202 + }, + { + "epoch": 0.000353898784947505, + "grad_norm": 1.0078125, + "learning_rate": 7.073961778607501e-05, + "loss": 1.2706, + "num_input_tokens_seen": 2598764544, + "step": 2203 + }, + { + "epoch": 0.0004718650465966734, + "grad_norm": 0.71875, + "learning_rate": 7.072963191938106e-05, + "loss": 1.3502, + "num_input_tokens_seen": 2599944192, + "step": 2204 + }, + { + "epoch": 0.0005898313082458417, + "grad_norm": 0.75, + "learning_rate": 7.07196413771248e-05, + "loss": 1.4489, + "num_input_tokens_seen": 2601123840, + "step": 2205 + }, + { + "epoch": 0.00070779756989501, + "grad_norm": 0.8359375, + "learning_rate": 7.070964616082633e-05, + "loss": 1.2868, + "num_input_tokens_seen": 2602303488, + "step": 2206 + }, + { + "epoch": 0.0008257638315441783, + "grad_norm": 0.84375, + "learning_rate": 7.069964627200643e-05, + "loss": 1.3287, + "num_input_tokens_seen": 2603483136, + "step": 2207 + }, + { + "epoch": 0.0009437300931933467, + "grad_norm": 0.91796875, + "learning_rate": 7.06896417121866e-05, + "loss": 1.3439, + "num_input_tokens_seen": 2604662784, + "step": 2208 + }, + { + "epoch": 0.001061696354842515, + "grad_norm": 0.76171875, + "learning_rate": 7.067963248288905e-05, + "loss": 1.3833, + "num_input_tokens_seen": 2605842432, + "step": 2209 + }, + { + "epoch": 0.0011796626164916834, + "grad_norm": 0.83203125, + "learning_rate": 7.066961858563669e-05, + "loss": 1.2962, + "num_input_tokens_seen": 2607022080, + "step": 2210 + }, + { + "epoch": 0.0012976288781408518, + "grad_norm": 0.72265625, + "learning_rate": 7.065960002195319e-05, + "loss": 1.4127, + "num_input_tokens_seen": 2608201728, + "step": 2211 + }, + { + "epoch": 0.00141559513979002, + "grad_norm": 0.84765625, + "learning_rate": 7.064957679336284e-05, + "loss": 1.4477, + "num_input_tokens_seen": 2609381376, + "step": 2212 + }, + { + "epoch": 0.0015335614014391884, + "grad_norm": 0.8203125, + "learning_rate": 7.06395489013907e-05, + "loss": 1.2573, + "num_input_tokens_seen": 2610561024, + "step": 2213 + }, + { + "epoch": 0.0016515276630883566, + "grad_norm": 0.71875, + "learning_rate": 7.062951634756256e-05, + "loss": 1.4503, + "num_input_tokens_seen": 2611740672, + "step": 2214 + }, + { + "epoch": 0.001769493924737525, + "grad_norm": 0.8359375, + "learning_rate": 7.061947913340485e-05, + "loss": 1.2692, + "num_input_tokens_seen": 2612920320, + "step": 2215 + }, + { + "epoch": 0.0018874601863866935, + "grad_norm": 0.8203125, + "learning_rate": 7.060943726044477e-05, + "loss": 1.3646, + "num_input_tokens_seen": 2614099968, + "step": 2216 + }, + { + "epoch": 0.0020054264480358617, + "grad_norm": 0.79296875, + "learning_rate": 7.059939073021022e-05, + "loss": 1.4899, + "num_input_tokens_seen": 2615279616, + "step": 2217 + }, + { + "epoch": 0.00212339270968503, + "grad_norm": 0.89453125, + "learning_rate": 7.058933954422977e-05, + "loss": 1.432, + "num_input_tokens_seen": 2616459264, + "step": 2218 + }, + { + "epoch": 0.0022413589713341986, + "grad_norm": 0.84375, + "learning_rate": 7.057928370403272e-05, + "loss": 1.2619, + "num_input_tokens_seen": 2617638912, + "step": 2219 + }, + { + "epoch": 0.0023593252329833668, + "grad_norm": 0.91796875, + "learning_rate": 7.056922321114912e-05, + "loss": 1.3937, + "num_input_tokens_seen": 2618818560, + "step": 2220 + }, + { + "epoch": 0.002477291494632535, + "grad_norm": 0.84765625, + "learning_rate": 7.055915806710965e-05, + "loss": 1.3529, + "num_input_tokens_seen": 2619998208, + "step": 2221 + }, + { + "epoch": 0.0025952577562817036, + "grad_norm": 0.77734375, + "learning_rate": 7.054908827344575e-05, + "loss": 1.3654, + "num_input_tokens_seen": 2621177856, + "step": 2222 + }, + { + "epoch": 0.002713224017930872, + "grad_norm": 0.8203125, + "learning_rate": 7.053901383168957e-05, + "loss": 1.3837, + "num_input_tokens_seen": 2622357504, + "step": 2223 + }, + { + "epoch": 0.00283119027958004, + "grad_norm": 0.8203125, + "learning_rate": 7.052893474337394e-05, + "loss": 1.3472, + "num_input_tokens_seen": 2623537152, + "step": 2224 + }, + { + "epoch": 0.0029491565412292082, + "grad_norm": 0.8359375, + "learning_rate": 7.05188510100324e-05, + "loss": 1.4993, + "num_input_tokens_seen": 2624716800, + "step": 2225 + }, + { + "epoch": 0.003067122802878377, + "grad_norm": 0.84375, + "learning_rate": 7.050876263319922e-05, + "loss": 1.3806, + "num_input_tokens_seen": 2625896448, + "step": 2226 + }, + { + "epoch": 0.003185089064527545, + "grad_norm": 0.828125, + "learning_rate": 7.049866961440936e-05, + "loss": 1.3197, + "num_input_tokens_seen": 2627076096, + "step": 2227 + }, + { + "epoch": 0.0033030553261767133, + "grad_norm": 0.86328125, + "learning_rate": 7.04885719551985e-05, + "loss": 1.2964, + "num_input_tokens_seen": 2628255744, + "step": 2228 + }, + { + "epoch": 0.003421021587825882, + "grad_norm": 0.86328125, + "learning_rate": 7.047846965710297e-05, + "loss": 1.252, + "num_input_tokens_seen": 2629435392, + "step": 2229 + }, + { + "epoch": 0.00353898784947505, + "grad_norm": 0.9140625, + "learning_rate": 7.046836272165992e-05, + "loss": 1.3258, + "num_input_tokens_seen": 2630615040, + "step": 2230 + }, + { + "epoch": 0.0036569541111242184, + "grad_norm": 0.8515625, + "learning_rate": 7.045825115040707e-05, + "loss": 1.3816, + "num_input_tokens_seen": 2631794688, + "step": 2231 + }, + { + "epoch": 0.003774920372773387, + "grad_norm": 0.8203125, + "learning_rate": 7.044813494488296e-05, + "loss": 1.3189, + "num_input_tokens_seen": 2632974336, + "step": 2232 + }, + { + "epoch": 0.003892886634422555, + "grad_norm": 0.9140625, + "learning_rate": 7.043801410662676e-05, + "loss": 1.3323, + "num_input_tokens_seen": 2634153984, + "step": 2233 + }, + { + "epoch": 0.004010852896071723, + "grad_norm": 0.75390625, + "learning_rate": 7.042788863717838e-05, + "loss": 1.5163, + "num_input_tokens_seen": 2635333632, + "step": 2234 + }, + { + "epoch": 0.004128819157720892, + "grad_norm": 0.890625, + "learning_rate": 7.041775853807842e-05, + "loss": 1.3673, + "num_input_tokens_seen": 2636513280, + "step": 2235 + }, + { + "epoch": 0.00424678541937006, + "grad_norm": 0.75, + "learning_rate": 7.04076238108682e-05, + "loss": 1.3011, + "num_input_tokens_seen": 2637692928, + "step": 2236 + }, + { + "epoch": 0.004364751681019229, + "grad_norm": 0.83203125, + "learning_rate": 7.039748445708974e-05, + "loss": 1.3664, + "num_input_tokens_seen": 2638872576, + "step": 2237 + }, + { + "epoch": 0.004482717942668397, + "grad_norm": 0.75, + "learning_rate": 7.038734047828573e-05, + "loss": 1.477, + "num_input_tokens_seen": 2640052224, + "step": 2238 + }, + { + "epoch": 0.004600684204317565, + "grad_norm": 0.78125, + "learning_rate": 7.037719187599963e-05, + "loss": 1.4931, + "num_input_tokens_seen": 2641231872, + "step": 2239 + }, + { + "epoch": 0.0047186504659667335, + "grad_norm": 0.68359375, + "learning_rate": 7.036703865177555e-05, + "loss": 1.5855, + "num_input_tokens_seen": 2642411520, + "step": 2240 + }, + { + "epoch": 0.004836616727615902, + "grad_norm": 0.81640625, + "learning_rate": 7.035688080715829e-05, + "loss": 1.5497, + "num_input_tokens_seen": 2643591168, + "step": 2241 + }, + { + "epoch": 0.00495458298926507, + "grad_norm": 0.73046875, + "learning_rate": 7.034671834369343e-05, + "loss": 1.4416, + "num_input_tokens_seen": 2644770816, + "step": 2242 + }, + { + "epoch": 0.005072549250914238, + "grad_norm": 0.7421875, + "learning_rate": 7.033655126292719e-05, + "loss": 1.435, + "num_input_tokens_seen": 2645950464, + "step": 2243 + }, + { + "epoch": 0.005190515512563407, + "grad_norm": 0.6875, + "learning_rate": 7.03263795664065e-05, + "loss": 1.4204, + "num_input_tokens_seen": 2647130112, + "step": 2244 + }, + { + "epoch": 0.0053084817742125754, + "grad_norm": 0.6875, + "learning_rate": 7.0316203255679e-05, + "loss": 1.3734, + "num_input_tokens_seen": 2648309760, + "step": 2245 + }, + { + "epoch": 0.005426448035861744, + "grad_norm": 0.82421875, + "learning_rate": 7.030602233229301e-05, + "loss": 1.1893, + "num_input_tokens_seen": 2649489408, + "step": 2246 + }, + { + "epoch": 0.005544414297510912, + "grad_norm": 0.76171875, + "learning_rate": 7.029583679779763e-05, + "loss": 1.2596, + "num_input_tokens_seen": 2650669056, + "step": 2247 + }, + { + "epoch": 0.00566238055916008, + "grad_norm": 0.66015625, + "learning_rate": 7.028564665374255e-05, + "loss": 1.5416, + "num_input_tokens_seen": 2651848704, + "step": 2248 + }, + { + "epoch": 0.005780346820809248, + "grad_norm": 0.7421875, + "learning_rate": 7.027545190167826e-05, + "loss": 1.3309, + "num_input_tokens_seen": 2653028352, + "step": 2249 + }, + { + "epoch": 0.0058983130824584165, + "grad_norm": 0.734375, + "learning_rate": 7.026525254315585e-05, + "loss": 1.4329, + "num_input_tokens_seen": 2654208000, + "step": 2250 + }, + { + "epoch": 0.0060162793441075856, + "grad_norm": 0.79296875, + "learning_rate": 7.025504857972725e-05, + "loss": 1.2763, + "num_input_tokens_seen": 2655387648, + "step": 2251 + }, + { + "epoch": 0.006134245605756754, + "grad_norm": 0.6953125, + "learning_rate": 7.024484001294493e-05, + "loss": 1.3951, + "num_input_tokens_seen": 2656567296, + "step": 2252 + }, + { + "epoch": 0.006252211867405922, + "grad_norm": 0.7265625, + "learning_rate": 7.023462684436219e-05, + "loss": 1.3499, + "num_input_tokens_seen": 2657746944, + "step": 2253 + }, + { + "epoch": 0.00637017812905509, + "grad_norm": 0.73046875, + "learning_rate": 7.022440907553297e-05, + "loss": 1.4669, + "num_input_tokens_seen": 2658926592, + "step": 2254 + }, + { + "epoch": 0.006488144390704258, + "grad_norm": 0.7265625, + "learning_rate": 7.02141867080119e-05, + "loss": 1.4329, + "num_input_tokens_seen": 2660106240, + "step": 2255 + }, + { + "epoch": 0.006606110652353427, + "grad_norm": 0.69921875, + "learning_rate": 7.020395974335435e-05, + "loss": 1.3008, + "num_input_tokens_seen": 2661285888, + "step": 2256 + }, + { + "epoch": 0.006724076914002596, + "grad_norm": 0.6796875, + "learning_rate": 7.019372818311637e-05, + "loss": 1.5163, + "num_input_tokens_seen": 2662465536, + "step": 2257 + }, + { + "epoch": 0.006842043175651764, + "grad_norm": 0.734375, + "learning_rate": 7.018349202885469e-05, + "loss": 1.3197, + "num_input_tokens_seen": 2663645184, + "step": 2258 + }, + { + "epoch": 0.006960009437300932, + "grad_norm": 0.7265625, + "learning_rate": 7.017325128212676e-05, + "loss": 1.2889, + "num_input_tokens_seen": 2664824832, + "step": 2259 + }, + { + "epoch": 0.0070779756989501, + "grad_norm": 0.86328125, + "learning_rate": 7.016300594449075e-05, + "loss": 1.4083, + "num_input_tokens_seen": 2666004480, + "step": 2260 + }, + { + "epoch": 0.0071959419605992685, + "grad_norm": 0.72265625, + "learning_rate": 7.015275601750548e-05, + "loss": 1.3621, + "num_input_tokens_seen": 2667184128, + "step": 2261 + }, + { + "epoch": 0.007313908222248437, + "grad_norm": 0.7421875, + "learning_rate": 7.01425015027305e-05, + "loss": 1.4324, + "num_input_tokens_seen": 2668363776, + "step": 2262 + }, + { + "epoch": 0.007431874483897605, + "grad_norm": 0.7109375, + "learning_rate": 7.013224240172605e-05, + "loss": 1.3694, + "num_input_tokens_seen": 2669543424, + "step": 2263 } ], "logging_steps": 1, "max_steps": 8477, - "num_input_tokens_seen": 2114224128, + "num_input_tokens_seen": 2670723072, "num_train_epochs": 9223372036854775807, "save_steps": 200, "stateful_callbacks": { @@ -14506,7 +18328,7 @@ "attributes": {} } }, - "total_flos": 2.5678125671842316e+19, + "total_flos": 3.2427231247421407e+19, "train_batch_size": 6, "trial_name": null, "trial_params": null