{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9957173447537473, "eval_steps": 500, "global_step": 155, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 0.15481117367744446, "learning_rate": 2e-05, "loss": 1.1213, "step": 1 }, { "epoch": 0.01, "grad_norm": 0.16627414524555206, "learning_rate": 4e-05, "loss": 1.1341, "step": 2 }, { "epoch": 0.02, "grad_norm": 0.14720524847507477, "learning_rate": 6e-05, "loss": 1.148, "step": 3 }, { "epoch": 0.03, "grad_norm": 0.15325099229812622, "learning_rate": 8e-05, "loss": 1.1435, "step": 4 }, { "epoch": 0.03, "grad_norm": 0.16704852879047394, "learning_rate": 0.0001, "loss": 1.0895, "step": 5 }, { "epoch": 0.04, "grad_norm": 0.11686641722917557, "learning_rate": 0.00012, "loss": 1.0784, "step": 6 }, { "epoch": 0.04, "grad_norm": 0.09641632437705994, "learning_rate": 0.00014, "loss": 1.0612, "step": 7 }, { "epoch": 0.05, "grad_norm": 0.12384118884801865, "learning_rate": 0.00016, "loss": 1.0566, "step": 8 }, { "epoch": 0.06, "grad_norm": 0.07287071645259857, "learning_rate": 0.00018, "loss": 1.0442, "step": 9 }, { "epoch": 0.06, "grad_norm": 0.07469318807125092, "learning_rate": 0.0002, "loss": 1.0083, "step": 10 }, { "epoch": 0.07, "grad_norm": 0.08364757895469666, "learning_rate": 0.00019999761633493753, "loss": 1.0169, "step": 11 }, { "epoch": 0.08, "grad_norm": 0.07763934135437012, "learning_rate": 0.0001999904654533872, "loss": 1.0348, "step": 12 }, { "epoch": 0.08, "grad_norm": 0.06744416803121567, "learning_rate": 0.0001999785476962552, "loss": 1.0123, "step": 13 }, { "epoch": 0.09, "grad_norm": 0.07724820077419281, "learning_rate": 0.00019996186363170035, "loss": 1.0188, "step": 14 }, { "epoch": 0.1, "grad_norm": 0.0746370479464531, "learning_rate": 0.00019994041405510705, "loss": 1.0164, "step": 15 }, { "epoch": 0.1, "grad_norm": 0.055804282426834106, "learning_rate": 0.00019991419998904747, "loss": 1.0587, "step": 16 }, { "epoch": 0.11, "grad_norm": 0.054163169115781784, "learning_rate": 0.00019988322268323268, "loss": 1.0149, "step": 17 }, { "epoch": 0.12, "grad_norm": 0.05897677689790726, "learning_rate": 0.00019984748361445308, "loss": 1.0136, "step": 18 }, { "epoch": 0.12, "grad_norm": 0.05603804066777229, "learning_rate": 0.00019980698448650804, "loss": 0.9996, "step": 19 }, { "epoch": 0.13, "grad_norm": 0.06250110268592834, "learning_rate": 0.0001997617272301248, "loss": 1.0145, "step": 20 }, { "epoch": 0.13, "grad_norm": 0.05678323656320572, "learning_rate": 0.000199711714002866, "loss": 1.0005, "step": 21 }, { "epoch": 0.14, "grad_norm": 0.05278163403272629, "learning_rate": 0.00019965694718902745, "loss": 1.0034, "step": 22 }, { "epoch": 0.15, "grad_norm": 0.05601625144481659, "learning_rate": 0.00019959742939952392, "loss": 0.9915, "step": 23 }, { "epoch": 0.15, "grad_norm": 0.054547134786844254, "learning_rate": 0.00019953316347176488, "loss": 1.0115, "step": 24 }, { "epoch": 0.16, "grad_norm": 0.06417939066886902, "learning_rate": 0.0001994641524695193, "loss": 0.9862, "step": 25 }, { "epoch": 0.17, "grad_norm": 0.061326365917921066, "learning_rate": 0.0001993903996827694, "loss": 0.9889, "step": 26 }, { "epoch": 0.17, "grad_norm": 0.05376205965876579, "learning_rate": 0.00019931190862755417, "loss": 0.9604, "step": 27 }, { "epoch": 0.18, "grad_norm": 0.0678999274969101, "learning_rate": 0.00019922868304580118, "loss": 1.0492, "step": 28 }, { "epoch": 0.19, "grad_norm": 0.053755760192871094, "learning_rate": 0.0001991407269051487, "loss": 0.9985, "step": 29 }, { "epoch": 0.19, "grad_norm": 0.05401955544948578, "learning_rate": 0.00019904804439875633, "loss": 0.9787, "step": 30 }, { "epoch": 0.2, "grad_norm": 0.05666874349117279, "learning_rate": 0.0001989506399451051, "loss": 0.9886, "step": 31 }, { "epoch": 0.21, "grad_norm": 0.05425805598497391, "learning_rate": 0.00019884851818778693, "loss": 1.0197, "step": 32 }, { "epoch": 0.21, "grad_norm": 0.06086369976401329, "learning_rate": 0.00019874168399528305, "loss": 0.9879, "step": 33 }, { "epoch": 0.22, "grad_norm": 0.05577366426587105, "learning_rate": 0.00019863014246073214, "loss": 0.9808, "step": 34 }, { "epoch": 0.22, "grad_norm": 0.05853752791881561, "learning_rate": 0.0001985138989016874, "loss": 0.957, "step": 35 }, { "epoch": 0.23, "grad_norm": 0.0582735501229763, "learning_rate": 0.00019839295885986296, "loss": 0.9732, "step": 36 }, { "epoch": 0.24, "grad_norm": 0.07255622744560242, "learning_rate": 0.00019826732810086998, "loss": 1.0199, "step": 37 }, { "epoch": 0.24, "grad_norm": 0.06085599586367607, "learning_rate": 0.00019813701261394136, "loss": 0.9946, "step": 38 }, { "epoch": 0.25, "grad_norm": 0.06600817292928696, "learning_rate": 0.00019800201861164664, "loss": 0.9646, "step": 39 }, { "epoch": 0.26, "grad_norm": 0.0634196326136589, "learning_rate": 0.00019786235252959553, "loss": 1.0092, "step": 40 }, { "epoch": 0.26, "grad_norm": 0.05254920944571495, "learning_rate": 0.00019771802102613127, "loss": 0.9535, "step": 41 }, { "epoch": 0.27, "grad_norm": 0.06008182466030121, "learning_rate": 0.00019756903098201308, "loss": 0.9897, "step": 42 }, { "epoch": 0.28, "grad_norm": 0.07715290039777756, "learning_rate": 0.00019741538950008818, "loss": 1.0132, "step": 43 }, { "epoch": 0.28, "grad_norm": 0.06104297190904617, "learning_rate": 0.0001972571039049533, "loss": 0.9938, "step": 44 }, { "epoch": 0.29, "grad_norm": 0.06008582562208176, "learning_rate": 0.0001970941817426052, "loss": 0.9889, "step": 45 }, { "epoch": 0.3, "grad_norm": 0.05699775367975235, "learning_rate": 0.00019692663078008132, "loss": 0.9843, "step": 46 }, { "epoch": 0.3, "grad_norm": 0.05760645866394043, "learning_rate": 0.00019675445900508909, "loss": 0.9677, "step": 47 }, { "epoch": 0.31, "grad_norm": 0.06075143814086914, "learning_rate": 0.00019657767462562544, "loss": 0.9929, "step": 48 }, { "epoch": 0.31, "grad_norm": 0.059820640832185745, "learning_rate": 0.00019639628606958533, "loss": 0.9889, "step": 49 }, { "epoch": 0.32, "grad_norm": 0.059315863996744156, "learning_rate": 0.00019621030198436006, "loss": 0.9994, "step": 50 }, { "epoch": 0.33, "grad_norm": 0.05949851870536804, "learning_rate": 0.00019601973123642492, "loss": 0.9593, "step": 51 }, { "epoch": 0.33, "grad_norm": 0.06036762520670891, "learning_rate": 0.00019582458291091663, "loss": 0.9669, "step": 52 }, { "epoch": 0.34, "grad_norm": 0.05799931660294533, "learning_rate": 0.00019562486631120006, "loss": 0.9731, "step": 53 }, { "epoch": 0.35, "grad_norm": 0.05733400583267212, "learning_rate": 0.00019542059095842485, "loss": 0.9676, "step": 54 }, { "epoch": 0.35, "grad_norm": 0.06894834339618683, "learning_rate": 0.00019521176659107142, "loss": 1.0142, "step": 55 }, { "epoch": 0.36, "grad_norm": 0.0657154992222786, "learning_rate": 0.00019499840316448673, "loss": 0.9598, "step": 56 }, { "epoch": 0.37, "grad_norm": 0.0570225715637207, "learning_rate": 0.00019478051085040975, "loss": 0.9979, "step": 57 }, { "epoch": 0.37, "grad_norm": 0.06271913647651672, "learning_rate": 0.00019455810003648637, "loss": 0.9694, "step": 58 }, { "epoch": 0.38, "grad_norm": 0.05831892415881157, "learning_rate": 0.0001943311813257743, "loss": 0.9934, "step": 59 }, { "epoch": 0.39, "grad_norm": 0.061948519200086594, "learning_rate": 0.00019409976553623766, "loss": 0.9812, "step": 60 }, { "epoch": 0.39, "grad_norm": 0.06726415455341339, "learning_rate": 0.00019386386370023103, "loss": 0.9837, "step": 61 }, { "epoch": 0.4, "grad_norm": 0.057408351451158524, "learning_rate": 0.00019362348706397373, "loss": 0.9512, "step": 62 }, { "epoch": 0.4, "grad_norm": 0.057638928294181824, "learning_rate": 0.00019337864708701357, "loss": 0.9622, "step": 63 }, { "epoch": 0.41, "grad_norm": 0.06090644374489784, "learning_rate": 0.00019312935544168048, "loss": 0.9927, "step": 64 }, { "epoch": 0.42, "grad_norm": 0.05811937153339386, "learning_rate": 0.00019287562401253022, "loss": 0.9905, "step": 65 }, { "epoch": 0.42, "grad_norm": 0.05927939713001251, "learning_rate": 0.00019261746489577765, "loss": 0.9604, "step": 66 }, { "epoch": 0.43, "grad_norm": 0.06021604314446449, "learning_rate": 0.0001923548903987201, "loss": 0.9535, "step": 67 }, { "epoch": 0.44, "grad_norm": 0.07004135102033615, "learning_rate": 0.00019208791303915063, "loss": 1.0032, "step": 68 }, { "epoch": 0.44, "grad_norm": 0.05626143515110016, "learning_rate": 0.0001918165455447614, "loss": 0.9726, "step": 69 }, { "epoch": 0.45, "grad_norm": 0.06130916625261307, "learning_rate": 0.00019154080085253666, "loss": 0.9646, "step": 70 }, { "epoch": 0.46, "grad_norm": 0.06467108428478241, "learning_rate": 0.0001912606921081362, "loss": 0.9516, "step": 71 }, { "epoch": 0.46, "grad_norm": 0.06047213450074196, "learning_rate": 0.0001909762326652686, "loss": 0.9664, "step": 72 }, { "epoch": 0.47, "grad_norm": 0.05907664820551872, "learning_rate": 0.00019068743608505455, "loss": 0.9796, "step": 73 }, { "epoch": 0.48, "grad_norm": 0.06679921597242355, "learning_rate": 0.00019039431613538047, "loss": 0.9678, "step": 74 }, { "epoch": 0.48, "grad_norm": 0.06133173033595085, "learning_rate": 0.0001900968867902419, "loss": 0.9875, "step": 75 }, { "epoch": 0.49, "grad_norm": 0.05841493234038353, "learning_rate": 0.00018979516222907775, "loss": 0.9686, "step": 76 }, { "epoch": 0.49, "grad_norm": 0.0660431906580925, "learning_rate": 0.00018948915683609388, "loss": 0.9863, "step": 77 }, { "epoch": 0.5, "grad_norm": 0.05776617303490639, "learning_rate": 0.00018917888519957754, "loss": 0.9417, "step": 78 }, { "epoch": 0.51, "grad_norm": 0.05937017872929573, "learning_rate": 0.00018886436211120193, "loss": 0.9995, "step": 79 }, { "epoch": 0.51, "grad_norm": 0.06314114481210709, "learning_rate": 0.000188545602565321, "loss": 0.9806, "step": 80 }, { "epoch": 0.52, "grad_norm": 0.060519032180309296, "learning_rate": 0.00018822262175825462, "loss": 0.9741, "step": 81 }, { "epoch": 0.53, "grad_norm": 0.06154269725084305, "learning_rate": 0.00018789543508756408, "loss": 0.9793, "step": 82 }, { "epoch": 0.53, "grad_norm": 0.06176121160387993, "learning_rate": 0.00018756405815131813, "loss": 0.9453, "step": 83 }, { "epoch": 0.54, "grad_norm": 0.06044905260205269, "learning_rate": 0.00018722850674734927, "loss": 0.9462, "step": 84 }, { "epoch": 0.55, "grad_norm": 0.05896229296922684, "learning_rate": 0.00018688879687250067, "loss": 0.9963, "step": 85 }, { "epoch": 0.55, "grad_norm": 0.06071419641375542, "learning_rate": 0.0001865449447218635, "loss": 0.9914, "step": 86 }, { "epoch": 0.56, "grad_norm": 0.0697932317852974, "learning_rate": 0.00018619696668800492, "loss": 0.9726, "step": 87 }, { "epoch": 0.57, "grad_norm": 0.062443289905786514, "learning_rate": 0.00018584487936018661, "loss": 1.0084, "step": 88 }, { "epoch": 0.57, "grad_norm": 0.059460923075675964, "learning_rate": 0.0001854886995235738, "loss": 0.9404, "step": 89 }, { "epoch": 0.58, "grad_norm": 0.058260347694158554, "learning_rate": 0.00018512844415843514, "loss": 0.9796, "step": 90 }, { "epoch": 0.58, "grad_norm": 0.05946533381938934, "learning_rate": 0.00018476413043933313, "loss": 0.9418, "step": 91 }, { "epoch": 0.59, "grad_norm": 0.06572849303483963, "learning_rate": 0.00018439577573430555, "loss": 0.9785, "step": 92 }, { "epoch": 0.6, "grad_norm": 0.06783867627382278, "learning_rate": 0.00018402339760403713, "loss": 0.9747, "step": 93 }, { "epoch": 0.6, "grad_norm": 0.06454402953386307, "learning_rate": 0.00018364701380102266, "loss": 0.9779, "step": 94 }, { "epoch": 0.61, "grad_norm": 0.06309663504362106, "learning_rate": 0.00018326664226872065, "loss": 0.9643, "step": 95 }, { "epoch": 0.62, "grad_norm": 0.05967305600643158, "learning_rate": 0.00018288230114069765, "loss": 0.9752, "step": 96 }, { "epoch": 0.62, "grad_norm": 0.05811592936515808, "learning_rate": 0.0001824940087397641, "loss": 0.9551, "step": 97 }, { "epoch": 0.63, "grad_norm": 0.0642295628786087, "learning_rate": 0.00018210178357710058, "loss": 0.9522, "step": 98 }, { "epoch": 0.64, "grad_norm": 0.05724099278450012, "learning_rate": 0.0001817056443513754, "loss": 1.0051, "step": 99 }, { "epoch": 0.64, "grad_norm": 0.056155964732170105, "learning_rate": 0.00018130560994785325, "loss": 0.9778, "step": 100 }, { "epoch": 0.65, "grad_norm": 0.058100346475839615, "learning_rate": 0.00018090169943749476, "loss": 0.9825, "step": 101 }, { "epoch": 0.66, "grad_norm": 0.06120794638991356, "learning_rate": 0.00018049393207604733, "loss": 0.9839, "step": 102 }, { "epoch": 0.66, "grad_norm": 0.056975312530994415, "learning_rate": 0.00018008232730312723, "loss": 0.9968, "step": 103 }, { "epoch": 0.67, "grad_norm": 0.06239038705825806, "learning_rate": 0.00017966690474129285, "loss": 0.9906, "step": 104 }, { "epoch": 0.67, "grad_norm": 0.5958348512649536, "learning_rate": 0.00017924768419510904, "loss": 2.6531, "step": 105 }, { "epoch": 0.68, "grad_norm": 0.06554935872554779, "learning_rate": 0.00017882468565020326, "loss": 1.0164, "step": 106 }, { "epoch": 0.69, "grad_norm": 0.05698655918240547, "learning_rate": 0.00017839792927231254, "loss": 0.9516, "step": 107 }, { "epoch": 0.69, "grad_norm": 0.06186239421367645, "learning_rate": 0.00017796743540632223, "loss": 0.9933, "step": 108 }, { "epoch": 0.7, "grad_norm": 0.05811876431107521, "learning_rate": 0.00017753322457529614, "loss": 0.9552, "step": 109 }, { "epoch": 0.71, "grad_norm": 0.06247268617153168, "learning_rate": 0.00017709531747949796, "loss": 0.9316, "step": 110 }, { "epoch": 0.71, "grad_norm": 0.06278502196073532, "learning_rate": 0.00017665373499540463, "loss": 0.9867, "step": 111 }, { "epoch": 0.72, "grad_norm": 0.06079186499118805, "learning_rate": 0.00017620849817471092, "loss": 1.0233, "step": 112 }, { "epoch": 0.73, "grad_norm": 0.05586745962500572, "learning_rate": 0.00017575962824332596, "loss": 0.9454, "step": 113 }, { "epoch": 0.73, "grad_norm": 0.059647805988788605, "learning_rate": 0.00017530714660036112, "loss": 0.9718, "step": 114 }, { "epoch": 0.74, "grad_norm": 0.060143355280160904, "learning_rate": 0.00017485107481711012, "loss": 0.9927, "step": 115 }, { "epoch": 0.75, "grad_norm": 0.4768543541431427, "learning_rate": 0.0001743914346360205, "loss": 2.4526, "step": 116 }, { "epoch": 0.75, "grad_norm": 0.059104178100824356, "learning_rate": 0.00017392824796965702, "loss": 0.9366, "step": 117 }, { "epoch": 0.76, "grad_norm": 0.06858639419078827, "learning_rate": 0.00017346153689965727, "loss": 0.9783, "step": 118 }, { "epoch": 0.76, "grad_norm": 0.06308155506849289, "learning_rate": 0.00017299132367567857, "loss": 0.9688, "step": 119 }, { "epoch": 0.77, "grad_norm": 0.0601269006729126, "learning_rate": 0.00017251763071433765, "loss": 0.9937, "step": 120 }, { "epoch": 0.78, "grad_norm": 0.06544536352157593, "learning_rate": 0.00017204048059814175, "loss": 0.9351, "step": 121 }, { "epoch": 0.78, "grad_norm": 0.06467759609222412, "learning_rate": 0.00017155989607441213, "loss": 0.9918, "step": 122 }, { "epoch": 0.79, "grad_norm": 0.061619073152542114, "learning_rate": 0.0001710759000541995, "loss": 0.9872, "step": 123 }, { "epoch": 0.8, "grad_norm": 0.06122846156358719, "learning_rate": 0.00017058851561119198, "loss": 0.968, "step": 124 }, { "epoch": 0.8, "grad_norm": 0.08277314156293869, "learning_rate": 0.00017009776598061495, "loss": 0.9869, "step": 125 }, { "epoch": 0.81, "grad_norm": 0.07559008151292801, "learning_rate": 0.00016960367455812336, "loss": 0.9804, "step": 126 }, { "epoch": 0.82, "grad_norm": 0.06251110136508942, "learning_rate": 0.00016910626489868649, "loss": 0.978, "step": 127 }, { "epoch": 0.82, "grad_norm": 0.06253345310688019, "learning_rate": 0.0001686055607154648, "loss": 0.9524, "step": 128 }, { "epoch": 0.83, "grad_norm": 0.05948334559798241, "learning_rate": 0.00016810158587867973, "loss": 0.9826, "step": 129 }, { "epoch": 0.84, "grad_norm": 0.06356865167617798, "learning_rate": 0.00016759436441447545, "loss": 0.9805, "step": 130 }, { "epoch": 0.84, "grad_norm": 0.06536010652780533, "learning_rate": 0.00016708392050377363, "loss": 1.0146, "step": 131 }, { "epoch": 0.85, "grad_norm": 0.06137322261929512, "learning_rate": 0.00016657027848112062, "loss": 0.9457, "step": 132 }, { "epoch": 0.85, "grad_norm": 0.05955340713262558, "learning_rate": 0.00016605346283352727, "loss": 0.9823, "step": 133 }, { "epoch": 0.86, "grad_norm": 0.0602475143969059, "learning_rate": 0.00016553349819930165, "loss": 1.0077, "step": 134 }, { "epoch": 0.87, "grad_norm": 0.05905039981007576, "learning_rate": 0.00016501040936687443, "loss": 0.9313, "step": 135 }, { "epoch": 0.87, "grad_norm": 0.06087419390678406, "learning_rate": 0.00016448422127361706, "loss": 0.9725, "step": 136 }, { "epoch": 0.88, "grad_norm": 0.059714119881391525, "learning_rate": 0.00016395495900465304, "loss": 0.9963, "step": 137 }, { "epoch": 0.89, "grad_norm": 0.06461544334888458, "learning_rate": 0.000163422647791662, "loss": 0.957, "step": 138 }, { "epoch": 0.89, "grad_norm": 0.06247726082801819, "learning_rate": 0.00016288731301167668, "loss": 0.9742, "step": 139 }, { "epoch": 0.9, "grad_norm": 0.06527213007211685, "learning_rate": 0.00016234898018587337, "loss": 0.9789, "step": 140 }, { "epoch": 0.91, "grad_norm": 0.059757690876722336, "learning_rate": 0.00016180767497835503, "loss": 0.9309, "step": 141 }, { "epoch": 0.91, "grad_norm": 0.059084221720695496, "learning_rate": 0.00016126342319492784, "loss": 0.9546, "step": 142 }, { "epoch": 0.92, "grad_norm": 1.778961181640625, "learning_rate": 0.00016071625078187114, "loss": 2.6066, "step": 143 }, { "epoch": 0.93, "grad_norm": 0.06468740105628967, "learning_rate": 0.00016016618382470012, "loss": 0.9472, "step": 144 }, { "epoch": 0.93, "grad_norm": 0.061647091060876846, "learning_rate": 0.00015961324854692254, "loss": 0.9836, "step": 145 }, { "epoch": 0.94, "grad_norm": 0.06061193719506264, "learning_rate": 0.0001590574713087885, "loss": 0.982, "step": 146 }, { "epoch": 0.94, "grad_norm": 0.06635820865631104, "learning_rate": 0.00015849887860603374, "loss": 0.9873, "step": 147 }, { "epoch": 0.95, "grad_norm": 0.06260058283805847, "learning_rate": 0.00015793749706861636, "loss": 0.9827, "step": 148 }, { "epoch": 0.96, "grad_norm": 0.06037148833274841, "learning_rate": 0.00015737335345944757, "loss": 1.0072, "step": 149 }, { "epoch": 0.96, "grad_norm": 0.06277037411928177, "learning_rate": 0.00015680647467311557, "loss": 0.9498, "step": 150 }, { "epoch": 0.97, "grad_norm": 0.06306284666061401, "learning_rate": 0.00015623688773460357, "loss": 0.9866, "step": 151 }, { "epoch": 0.98, "grad_norm": 0.07311715185642242, "learning_rate": 0.00015566461979800122, "loss": 0.9722, "step": 152 }, { "epoch": 0.98, "grad_norm": 0.06143077090382576, "learning_rate": 0.00015508969814521025, "loss": 0.9442, "step": 153 }, { "epoch": 0.99, "grad_norm": 0.067879818379879, "learning_rate": 0.00015451215018464387, "loss": 0.9416, "step": 154 }, { "epoch": 1.0, "grad_norm": 0.06907378137111664, "learning_rate": 0.00015393200344991995, "loss": 0.9813, "step": 155 } ], "logging_steps": 1, "max_steps": 465, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 155, "total_flos": 3.361969685082931e+18, "train_batch_size": 5, "trial_name": null, "trial_params": null }