{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4991875203119922, "eval_steps": 128, "global_step": 256, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.5149741172790527, "learning_rate": 2.0000000000000003e-06, "loss": 1.8709, "step": 1 }, { "epoch": 0.0, "eval_loss": 1.8383064270019531, "eval_runtime": 707.8127, "eval_samples_per_second": 7.169, "eval_steps_per_second": 1.793, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.48140937089920044, "learning_rate": 4.000000000000001e-06, "loss": 1.7751, "step": 2 }, { "epoch": 0.01, "grad_norm": 0.4886001944541931, "learning_rate": 6e-06, "loss": 1.795, "step": 3 }, { "epoch": 0.01, "grad_norm": 0.46349120140075684, "learning_rate": 8.000000000000001e-06, "loss": 1.7569, "step": 4 }, { "epoch": 0.01, "grad_norm": 0.5320057272911072, "learning_rate": 1e-05, "loss": 1.9278, "step": 5 }, { "epoch": 0.01, "grad_norm": 0.48083460330963135, "learning_rate": 1.2e-05, "loss": 1.778, "step": 6 }, { "epoch": 0.01, "grad_norm": 0.503804624080658, "learning_rate": 1.4e-05, "loss": 1.8358, "step": 7 }, { "epoch": 0.02, "grad_norm": 0.5177507400512695, "learning_rate": 1.6000000000000003e-05, "loss": 1.8655, "step": 8 }, { "epoch": 0.02, "grad_norm": 0.5006410479545593, "learning_rate": 1.8e-05, "loss": 1.8087, "step": 9 }, { "epoch": 0.02, "grad_norm": 0.500285804271698, "learning_rate": 2e-05, "loss": 1.8254, "step": 10 }, { "epoch": 0.02, "grad_norm": 0.4819566607475281, "learning_rate": 1.9999804178263253e-05, "loss": 1.7627, "step": 11 }, { "epoch": 0.02, "grad_norm": 0.4860954284667969, "learning_rate": 1.999921672072223e-05, "loss": 1.7034, "step": 12 }, { "epoch": 0.03, "grad_norm": 0.5111412405967712, "learning_rate": 1.9998237650384324e-05, "loss": 1.7203, "step": 13 }, { "epoch": 0.03, "grad_norm": 0.500988245010376, "learning_rate": 1.9996867005594193e-05, "loss": 1.6721, "step": 14 }, { "epoch": 0.03, "grad_norm": 0.4903103709220886, "learning_rate": 1.999510484003224e-05, "loss": 1.6167, "step": 15 }, { "epoch": 0.03, "grad_norm": 0.4756762683391571, "learning_rate": 1.999295122271253e-05, "loss": 1.57, "step": 16 }, { "epoch": 0.03, "grad_norm": 0.4689522385597229, "learning_rate": 1.999040623798008e-05, "loss": 1.5461, "step": 17 }, { "epoch": 0.04, "grad_norm": 0.5094612836837769, "learning_rate": 1.9987469985507553e-05, "loss": 1.5526, "step": 18 }, { "epoch": 0.04, "grad_norm": 0.49769631028175354, "learning_rate": 1.9984142580291368e-05, "loss": 1.5115, "step": 19 }, { "epoch": 0.04, "grad_norm": 0.46388670802116394, "learning_rate": 1.9980424152647174e-05, "loss": 1.467, "step": 20 }, { "epoch": 0.04, "grad_norm": 0.4357146918773651, "learning_rate": 1.9976314848204762e-05, "loss": 1.3887, "step": 21 }, { "epoch": 0.04, "grad_norm": 0.440377414226532, "learning_rate": 1.997181482790236e-05, "loss": 1.3845, "step": 22 }, { "epoch": 0.04, "grad_norm": 0.4116402566432953, "learning_rate": 1.9966924267980326e-05, "loss": 1.4091, "step": 23 }, { "epoch": 0.05, "grad_norm": 0.3181552588939667, "learning_rate": 1.996164335997425e-05, "loss": 1.3324, "step": 24 }, { "epoch": 0.05, "grad_norm": 0.2932267189025879, "learning_rate": 1.995597231070744e-05, "loss": 1.315, "step": 25 }, { "epoch": 0.05, "grad_norm": 0.328800231218338, "learning_rate": 1.994991134228285e-05, "loss": 1.3334, "step": 26 }, { "epoch": 0.05, "grad_norm": 0.32027724385261536, "learning_rate": 1.9943460692074345e-05, "loss": 1.3161, "step": 27 }, { "epoch": 0.05, "grad_norm": 0.3247709274291992, "learning_rate": 1.993662061271743e-05, "loss": 1.2601, "step": 28 }, { "epoch": 0.06, "grad_norm": 0.33424896001815796, "learning_rate": 1.9929391372099352e-05, "loss": 1.2807, "step": 29 }, { "epoch": 0.06, "grad_norm": 0.28847330808639526, "learning_rate": 1.9921773253348604e-05, "loss": 1.2427, "step": 30 }, { "epoch": 0.06, "grad_norm": 0.2601753771305084, "learning_rate": 1.991376655482383e-05, "loss": 1.2602, "step": 31 }, { "epoch": 0.06, "grad_norm": 0.25505828857421875, "learning_rate": 1.9905371590102157e-05, "loss": 1.2539, "step": 32 }, { "epoch": 0.06, "grad_norm": 0.25789541006088257, "learning_rate": 1.989658868796689e-05, "loss": 1.2796, "step": 33 }, { "epoch": 0.07, "grad_norm": 0.1963696926832199, "learning_rate": 1.988741819239467e-05, "loss": 1.2533, "step": 34 }, { "epoch": 0.07, "grad_norm": 0.1652669906616211, "learning_rate": 1.9877860462541964e-05, "loss": 1.27, "step": 35 }, { "epoch": 0.07, "grad_norm": 0.15272551774978638, "learning_rate": 1.986791587273103e-05, "loss": 1.2092, "step": 36 }, { "epoch": 0.07, "grad_norm": 0.14809414744377136, "learning_rate": 1.985758481243523e-05, "loss": 1.2028, "step": 37 }, { "epoch": 0.07, "grad_norm": 0.14091093838214874, "learning_rate": 1.98468676862638e-05, "loss": 1.1737, "step": 38 }, { "epoch": 0.08, "grad_norm": 0.13234961032867432, "learning_rate": 1.9835764913945998e-05, "loss": 1.2242, "step": 39 }, { "epoch": 0.08, "grad_norm": 0.12562313675880432, "learning_rate": 1.982427693031465e-05, "loss": 1.1846, "step": 40 }, { "epoch": 0.08, "grad_norm": 0.12460777163505554, "learning_rate": 1.981240418528914e-05, "loss": 1.1954, "step": 41 }, { "epoch": 0.08, "grad_norm": 0.1261477917432785, "learning_rate": 1.9800147143857774e-05, "loss": 1.1944, "step": 42 }, { "epoch": 0.08, "grad_norm": 0.12070100754499435, "learning_rate": 1.9787506286059584e-05, "loss": 1.1814, "step": 43 }, { "epoch": 0.09, "grad_norm": 0.1318473368883133, "learning_rate": 1.9774482106965512e-05, "loss": 1.2289, "step": 44 }, { "epoch": 0.09, "grad_norm": 0.11869361251592636, "learning_rate": 1.9761075116659037e-05, "loss": 1.1507, "step": 45 }, { "epoch": 0.09, "grad_norm": 0.11668427288532257, "learning_rate": 1.974728584021618e-05, "loss": 1.1693, "step": 46 }, { "epoch": 0.09, "grad_norm": 0.12271205335855484, "learning_rate": 1.9733114817684957e-05, "loss": 1.219, "step": 47 }, { "epoch": 0.09, "grad_norm": 0.12055838108062744, "learning_rate": 1.9718562604064213e-05, "loss": 1.2424, "step": 48 }, { "epoch": 0.1, "grad_norm": 0.1191168949007988, "learning_rate": 1.97036297692819e-05, "loss": 1.2206, "step": 49 }, { "epoch": 0.1, "grad_norm": 0.11361384391784668, "learning_rate": 1.9688316898172744e-05, "loss": 1.1927, "step": 50 }, { "epoch": 0.1, "grad_norm": 0.109556645154953, "learning_rate": 1.967262459045535e-05, "loss": 1.2013, "step": 51 }, { "epoch": 0.1, "grad_norm": 0.11278169602155685, "learning_rate": 1.9656553460708707e-05, "loss": 1.2379, "step": 52 }, { "epoch": 0.1, "grad_norm": 0.11011548340320587, "learning_rate": 1.9640104138348124e-05, "loss": 1.1808, "step": 53 }, { "epoch": 0.11, "grad_norm": 0.09818632155656815, "learning_rate": 1.9623277267600574e-05, "loss": 1.1731, "step": 54 }, { "epoch": 0.11, "grad_norm": 0.1045491099357605, "learning_rate": 1.9606073507479466e-05, "loss": 1.1729, "step": 55 }, { "epoch": 0.11, "grad_norm": 0.0985143780708313, "learning_rate": 1.9588493531758843e-05, "loss": 1.165, "step": 56 }, { "epoch": 0.11, "grad_norm": 0.09513280540704727, "learning_rate": 1.9570538028946974e-05, "loss": 1.1765, "step": 57 }, { "epoch": 0.11, "grad_norm": 0.09834066778421402, "learning_rate": 1.9552207702259412e-05, "loss": 1.1411, "step": 58 }, { "epoch": 0.12, "grad_norm": 0.09748240560293198, "learning_rate": 1.9533503269591438e-05, "loss": 1.1995, "step": 59 }, { "epoch": 0.12, "grad_norm": 0.09501401335000992, "learning_rate": 1.9514425463489946e-05, "loss": 1.1414, "step": 60 }, { "epoch": 0.12, "grad_norm": 0.09078366309404373, "learning_rate": 1.9494975031124768e-05, "loss": 1.1132, "step": 61 }, { "epoch": 0.12, "grad_norm": 0.09064218401908875, "learning_rate": 1.947515273425939e-05, "loss": 1.1498, "step": 62 }, { "epoch": 0.12, "grad_norm": 0.09029112011194229, "learning_rate": 1.945495934922113e-05, "loss": 1.158, "step": 63 }, { "epoch": 0.12, "grad_norm": 0.09335145354270935, "learning_rate": 1.9434395666870735e-05, "loss": 1.181, "step": 64 }, { "epoch": 0.13, "grad_norm": 0.08959628641605377, "learning_rate": 1.9413462492571403e-05, "loss": 1.1353, "step": 65 }, { "epoch": 0.13, "grad_norm": 0.09235028922557831, "learning_rate": 1.9392160646157242e-05, "loss": 1.1566, "step": 66 }, { "epoch": 0.13, "grad_norm": 0.08852320164442062, "learning_rate": 1.937049096190117e-05, "loss": 1.1015, "step": 67 }, { "epoch": 0.13, "grad_norm": 0.09060905128717422, "learning_rate": 1.934845428848222e-05, "loss": 1.1312, "step": 68 }, { "epoch": 0.13, "grad_norm": 0.09065355360507965, "learning_rate": 1.9326051488952334e-05, "loss": 1.1456, "step": 69 }, { "epoch": 0.14, "grad_norm": 0.09140690416097641, "learning_rate": 1.9303283440702524e-05, "loss": 1.1661, "step": 70 }, { "epoch": 0.14, "grad_norm": 0.08641023188829422, "learning_rate": 1.9280151035428544e-05, "loss": 1.1153, "step": 71 }, { "epoch": 0.14, "grad_norm": 0.08729224652051926, "learning_rate": 1.9256655179095954e-05, "loss": 1.1956, "step": 72 }, { "epoch": 0.14, "grad_norm": 0.08514908701181412, "learning_rate": 1.9232796791904627e-05, "loss": 1.0969, "step": 73 }, { "epoch": 0.14, "grad_norm": 0.08789129555225372, "learning_rate": 1.9208576808252725e-05, "loss": 1.1669, "step": 74 }, { "epoch": 0.15, "grad_norm": 0.0829731896519661, "learning_rate": 1.918399617670011e-05, "loss": 1.101, "step": 75 }, { "epoch": 0.15, "grad_norm": 0.08415351063013077, "learning_rate": 1.9159055859931163e-05, "loss": 1.122, "step": 76 }, { "epoch": 0.15, "grad_norm": 0.07933653146028519, "learning_rate": 1.9133756834717118e-05, "loss": 1.1175, "step": 77 }, { "epoch": 0.15, "grad_norm": 0.0849999189376831, "learning_rate": 1.9108100091877787e-05, "loss": 1.1577, "step": 78 }, { "epoch": 0.15, "grad_norm": 0.0835108831524849, "learning_rate": 1.9082086636242757e-05, "loss": 1.1253, "step": 79 }, { "epoch": 0.16, "grad_norm": 0.07834841310977936, "learning_rate": 1.905571748661204e-05, "loss": 1.0963, "step": 80 }, { "epoch": 0.16, "grad_norm": 0.07953493297100067, "learning_rate": 1.902899367571617e-05, "loss": 1.1102, "step": 81 }, { "epoch": 0.16, "grad_norm": 0.07989759743213654, "learning_rate": 1.9001916250175764e-05, "loss": 1.1576, "step": 82 }, { "epoch": 0.16, "grad_norm": 0.07849448174238205, "learning_rate": 1.8974486270460518e-05, "loss": 1.0963, "step": 83 }, { "epoch": 0.16, "grad_norm": 0.07805287837982178, "learning_rate": 1.894670481084769e-05, "loss": 1.1364, "step": 84 }, { "epoch": 0.17, "grad_norm": 0.07698098570108414, "learning_rate": 1.8918572959380005e-05, "loss": 1.1407, "step": 85 }, { "epoch": 0.17, "grad_norm": 0.0766262486577034, "learning_rate": 1.8890091817823073e-05, "loss": 1.1225, "step": 86 }, { "epoch": 0.17, "grad_norm": 0.0798678770661354, "learning_rate": 1.8861262501622213e-05, "loss": 1.137, "step": 87 }, { "epoch": 0.17, "grad_norm": 0.07717825472354889, "learning_rate": 1.8832086139858777e-05, "loss": 1.1311, "step": 88 }, { "epoch": 0.17, "grad_norm": 0.07542562484741211, "learning_rate": 1.880256387520593e-05, "loss": 1.1066, "step": 89 }, { "epoch": 0.18, "grad_norm": 0.07316063344478607, "learning_rate": 1.8772696863883905e-05, "loss": 1.0976, "step": 90 }, { "epoch": 0.18, "grad_norm": 0.0738874301314354, "learning_rate": 1.8742486275614706e-05, "loss": 1.0901, "step": 91 }, { "epoch": 0.18, "grad_norm": 0.07698226720094681, "learning_rate": 1.8711933293576303e-05, "loss": 1.1224, "step": 92 }, { "epoch": 0.18, "grad_norm": 0.07452582567930222, "learning_rate": 1.8681039114356298e-05, "loss": 1.1399, "step": 93 }, { "epoch": 0.18, "grad_norm": 0.07452700287103653, "learning_rate": 1.8649804947905057e-05, "loss": 1.1639, "step": 94 }, { "epoch": 0.19, "grad_norm": 0.07358838617801666, "learning_rate": 1.861823201748833e-05, "loss": 1.1139, "step": 95 }, { "epoch": 0.19, "grad_norm": 0.07469804584980011, "learning_rate": 1.8586321559639316e-05, "loss": 1.1103, "step": 96 }, { "epoch": 0.19, "grad_norm": 0.07484911382198334, "learning_rate": 1.8554074824110285e-05, "loss": 1.1231, "step": 97 }, { "epoch": 0.19, "grad_norm": 0.07320189476013184, "learning_rate": 1.8521493073823583e-05, "loss": 1.1405, "step": 98 }, { "epoch": 0.19, "grad_norm": 0.07219311594963074, "learning_rate": 1.8488577584822197e-05, "loss": 1.1084, "step": 99 }, { "epoch": 0.19, "grad_norm": 0.07267658412456512, "learning_rate": 1.8455329646219767e-05, "loss": 1.109, "step": 100 }, { "epoch": 0.2, "grad_norm": 0.07124843448400497, "learning_rate": 1.8421750560150112e-05, "loss": 1.0997, "step": 101 }, { "epoch": 0.2, "grad_norm": 0.06921572983264923, "learning_rate": 1.8387841641716226e-05, "loss": 1.1095, "step": 102 }, { "epoch": 0.2, "grad_norm": 0.07149618864059448, "learning_rate": 1.835360421893876e-05, "loss": 1.1078, "step": 103 }, { "epoch": 0.2, "grad_norm": 0.07851895689964294, "learning_rate": 1.8319039632704042e-05, "loss": 1.1195, "step": 104 }, { "epoch": 0.2, "grad_norm": 0.07615454494953156, "learning_rate": 1.8284149236711527e-05, "loss": 1.0754, "step": 105 }, { "epoch": 0.21, "grad_norm": 0.07054944336414337, "learning_rate": 1.8248934397420802e-05, "loss": 1.0943, "step": 106 }, { "epoch": 0.21, "grad_norm": 0.07253159582614899, "learning_rate": 1.821339649399807e-05, "loss": 1.1263, "step": 107 }, { "epoch": 0.21, "grad_norm": 0.0729857012629509, "learning_rate": 1.817753691826212e-05, "loss": 1.0977, "step": 108 }, { "epoch": 0.21, "grad_norm": 0.07234011590480804, "learning_rate": 1.8141357074629838e-05, "loss": 1.1334, "step": 109 }, { "epoch": 0.21, "grad_norm": 0.07030120491981506, "learning_rate": 1.8104858380061178e-05, "loss": 1.0767, "step": 110 }, { "epoch": 0.22, "grad_norm": 0.07036615908145905, "learning_rate": 1.80680422640037e-05, "loss": 1.0796, "step": 111 }, { "epoch": 0.22, "grad_norm": 0.0742933601140976, "learning_rate": 1.8030910168336558e-05, "loss": 1.0671, "step": 112 }, { "epoch": 0.22, "grad_norm": 0.07065165787935257, "learning_rate": 1.7993463547314044e-05, "loss": 1.1594, "step": 113 }, { "epoch": 0.22, "grad_norm": 0.07182008028030396, "learning_rate": 1.7955703867508634e-05, "loss": 1.0936, "step": 114 }, { "epoch": 0.22, "grad_norm": 0.06882106512784958, "learning_rate": 1.791763260775354e-05, "loss": 1.1017, "step": 115 }, { "epoch": 0.23, "grad_norm": 0.07001936435699463, "learning_rate": 1.7879251259084803e-05, "loss": 1.1267, "step": 116 }, { "epoch": 0.23, "grad_norm": 0.06916490197181702, "learning_rate": 1.78405613246829e-05, "loss": 1.0787, "step": 117 }, { "epoch": 0.23, "grad_norm": 0.07149837166070938, "learning_rate": 1.7801564319813854e-05, "loss": 1.1302, "step": 118 }, { "epoch": 0.23, "grad_norm": 0.06783504039049149, "learning_rate": 1.776226177176991e-05, "loss": 1.1159, "step": 119 }, { "epoch": 0.23, "grad_norm": 0.07285293936729431, "learning_rate": 1.7722655219809718e-05, "loss": 1.0758, "step": 120 }, { "epoch": 0.24, "grad_norm": 0.07273004204034805, "learning_rate": 1.768274621509803e-05, "loss": 1.1019, "step": 121 }, { "epoch": 0.24, "grad_norm": 0.07392899692058563, "learning_rate": 1.7642536320644964e-05, "loss": 1.1111, "step": 122 }, { "epoch": 0.24, "grad_norm": 0.0693732351064682, "learning_rate": 1.7602027111244807e-05, "loss": 1.1109, "step": 123 }, { "epoch": 0.24, "grad_norm": 0.0721542090177536, "learning_rate": 1.7561220173414297e-05, "loss": 1.1246, "step": 124 }, { "epoch": 0.24, "grad_norm": 0.07002190500497818, "learning_rate": 1.7520117105330524e-05, "loss": 1.073, "step": 125 }, { "epoch": 0.25, "grad_norm": 0.0697953850030899, "learning_rate": 1.7478719516768324e-05, "loss": 1.0913, "step": 126 }, { "epoch": 0.25, "grad_norm": 0.07040461152791977, "learning_rate": 1.7437029029037233e-05, "loss": 1.1445, "step": 127 }, { "epoch": 0.25, "grad_norm": 0.07231634110212326, "learning_rate": 1.7395047274917994e-05, "loss": 1.1106, "step": 128 }, { "epoch": 0.25, "eval_loss": 1.0988876819610596, "eval_runtime": 708.4228, "eval_samples_per_second": 7.162, "eval_steps_per_second": 1.791, "step": 128 }, { "epoch": 0.25, "grad_norm": 0.0713375061750412, "learning_rate": 1.7352775898598615e-05, "loss": 1.0982, "step": 129 }, { "epoch": 0.25, "grad_norm": 0.06747942417860031, "learning_rate": 1.731021655560995e-05, "loss": 1.1017, "step": 130 }, { "epoch": 0.26, "grad_norm": 0.071540467441082, "learning_rate": 1.72673709127609e-05, "loss": 1.0859, "step": 131 }, { "epoch": 0.26, "grad_norm": 0.06861750036478043, "learning_rate": 1.7224240648073097e-05, "loss": 1.0728, "step": 132 }, { "epoch": 0.26, "grad_norm": 0.06919445842504501, "learning_rate": 1.718082745071521e-05, "loss": 1.1218, "step": 133 }, { "epoch": 0.26, "grad_norm": 0.07422851771116257, "learning_rate": 1.7137133020936783e-05, "loss": 1.0881, "step": 134 }, { "epoch": 0.26, "grad_norm": 0.07452652603387833, "learning_rate": 1.7093159070001637e-05, "loss": 1.1073, "step": 135 }, { "epoch": 0.27, "grad_norm": 0.07337850332260132, "learning_rate": 1.7048907320120867e-05, "loss": 1.1065, "step": 136 }, { "epoch": 0.27, "grad_norm": 0.07020066678524017, "learning_rate": 1.700437950438537e-05, "loss": 1.0742, "step": 137 }, { "epoch": 0.27, "grad_norm": 0.07053718715906143, "learning_rate": 1.695957736669799e-05, "loss": 1.0627, "step": 138 }, { "epoch": 0.27, "grad_norm": 0.07288292795419693, "learning_rate": 1.6914502661705216e-05, "loss": 1.0842, "step": 139 }, { "epoch": 0.27, "grad_norm": 0.07197044044733047, "learning_rate": 1.6869157154728437e-05, "loss": 1.065, "step": 140 }, { "epoch": 0.27, "grad_norm": 0.07109569013118744, "learning_rate": 1.6823542621694852e-05, "loss": 1.0996, "step": 141 }, { "epoch": 0.28, "grad_norm": 0.07084467262029648, "learning_rate": 1.677766084906787e-05, "loss": 1.0862, "step": 142 }, { "epoch": 0.28, "grad_norm": 0.07195379585027695, "learning_rate": 1.6731513633777173e-05, "loss": 1.1184, "step": 143 }, { "epoch": 0.28, "grad_norm": 0.07326792180538177, "learning_rate": 1.668510278314833e-05, "loss": 1.0867, "step": 144 }, { "epoch": 0.28, "grad_norm": 0.07582233846187592, "learning_rate": 1.6638430114832015e-05, "loss": 1.0721, "step": 145 }, { "epoch": 0.28, "grad_norm": 0.07204006612300873, "learning_rate": 1.6591497456732827e-05, "loss": 1.0565, "step": 146 }, { "epoch": 0.29, "grad_norm": 0.07225130498409271, "learning_rate": 1.6544306646937683e-05, "loss": 1.1036, "step": 147 }, { "epoch": 0.29, "grad_norm": 0.07662148773670197, "learning_rate": 1.649685953364385e-05, "loss": 1.0289, "step": 148 }, { "epoch": 0.29, "grad_norm": 0.07611638307571411, "learning_rate": 1.644915797508656e-05, "loss": 1.1068, "step": 149 }, { "epoch": 0.29, "grad_norm": 0.07609565556049347, "learning_rate": 1.6401203839466212e-05, "loss": 1.0816, "step": 150 }, { "epoch": 0.29, "grad_norm": 0.0737641304731369, "learning_rate": 1.6352999004875242e-05, "loss": 1.1016, "step": 151 }, { "epoch": 0.3, "grad_norm": 0.07359515875577927, "learning_rate": 1.630454535922452e-05, "loss": 1.0787, "step": 152 }, { "epoch": 0.3, "grad_norm": 0.07506351917982101, "learning_rate": 1.6255844800169472e-05, "loss": 1.0789, "step": 153 }, { "epoch": 0.3, "grad_norm": 0.07777760922908783, "learning_rate": 1.62068992350357e-05, "loss": 1.096, "step": 154 }, { "epoch": 0.3, "grad_norm": 0.07574637979269028, "learning_rate": 1.6157710580744322e-05, "loss": 1.1007, "step": 155 }, { "epoch": 0.3, "grad_norm": 0.07857154309749603, "learning_rate": 1.610828076373687e-05, "loss": 1.0735, "step": 156 }, { "epoch": 0.31, "grad_norm": 0.07402702420949936, "learning_rate": 1.605861171989988e-05, "loss": 1.1003, "step": 157 }, { "epoch": 0.31, "grad_norm": 0.07439373433589935, "learning_rate": 1.6008705394489032e-05, "loss": 1.0662, "step": 158 }, { "epoch": 0.31, "grad_norm": 0.07392847537994385, "learning_rate": 1.5958563742052987e-05, "loss": 1.0487, "step": 159 }, { "epoch": 0.31, "grad_norm": 0.07773245126008987, "learning_rate": 1.5908188726356843e-05, "loss": 1.1107, "step": 160 }, { "epoch": 0.31, "grad_norm": 0.07752656936645508, "learning_rate": 1.5857582320305207e-05, "loss": 1.0426, "step": 161 }, { "epoch": 0.32, "grad_norm": 0.07541097700595856, "learning_rate": 1.5806746505864947e-05, "loss": 1.081, "step": 162 }, { "epoch": 0.32, "grad_norm": 0.07938623428344727, "learning_rate": 1.5755683273987554e-05, "loss": 1.0969, "step": 163 }, { "epoch": 0.32, "grad_norm": 0.07379717379808426, "learning_rate": 1.5704394624531184e-05, "loss": 1.0763, "step": 164 }, { "epoch": 0.32, "grad_norm": 0.07850446552038193, "learning_rate": 1.5652882566182316e-05, "loss": 1.1029, "step": 165 }, { "epoch": 0.32, "grad_norm": 0.07627106457948685, "learning_rate": 1.5601149116377095e-05, "loss": 1.0611, "step": 166 }, { "epoch": 0.33, "grad_norm": 0.07577154785394669, "learning_rate": 1.554919630122232e-05, "loss": 1.0973, "step": 167 }, { "epoch": 0.33, "grad_norm": 0.07844171673059464, "learning_rate": 1.5497026155416087e-05, "loss": 1.1006, "step": 168 }, { "epoch": 0.33, "grad_norm": 0.08061926811933517, "learning_rate": 1.5444640722168114e-05, "loss": 1.0879, "step": 169 }, { "epoch": 0.33, "grad_norm": 0.07918211817741394, "learning_rate": 1.53920420531197e-05, "loss": 1.0602, "step": 170 }, { "epoch": 0.33, "grad_norm": 0.08213488012552261, "learning_rate": 1.5339232208263394e-05, "loss": 1.0798, "step": 171 }, { "epoch": 0.34, "grad_norm": 0.07898285239934921, "learning_rate": 1.5286213255862295e-05, "loss": 1.0969, "step": 172 }, { "epoch": 0.34, "grad_norm": 0.08233582973480225, "learning_rate": 1.5232987272369076e-05, "loss": 1.0699, "step": 173 }, { "epoch": 0.34, "grad_norm": 0.08074311912059784, "learning_rate": 1.5179556342344643e-05, "loss": 1.0851, "step": 174 }, { "epoch": 0.34, "grad_norm": 0.08196305483579636, "learning_rate": 1.51259225583765e-05, "loss": 1.076, "step": 175 }, { "epoch": 0.34, "grad_norm": 0.08637065440416336, "learning_rate": 1.5072088020996791e-05, "loss": 1.0989, "step": 176 }, { "epoch": 0.35, "grad_norm": 0.08313170820474625, "learning_rate": 1.5018054838600033e-05, "loss": 1.09, "step": 177 }, { "epoch": 0.35, "grad_norm": 0.08245568722486496, "learning_rate": 1.496382512736056e-05, "loss": 1.0572, "step": 178 }, { "epoch": 0.35, "grad_norm": 0.08442118763923645, "learning_rate": 1.490940101114961e-05, "loss": 1.0669, "step": 179 }, { "epoch": 0.35, "grad_norm": 0.08224523812532425, "learning_rate": 1.4854784621452176e-05, "loss": 1.0842, "step": 180 }, { "epoch": 0.35, "grad_norm": 0.08642537891864777, "learning_rate": 1.479997809728352e-05, "loss": 1.123, "step": 181 }, { "epoch": 0.35, "grad_norm": 0.08723440766334534, "learning_rate": 1.4744983585105388e-05, "loss": 1.0649, "step": 182 }, { "epoch": 0.36, "grad_norm": 0.08666212856769562, "learning_rate": 1.4689803238741955e-05, "loss": 1.0938, "step": 183 }, { "epoch": 0.36, "grad_norm": 0.09213647246360779, "learning_rate": 1.463443921929548e-05, "loss": 1.0903, "step": 184 }, { "epoch": 0.36, "grad_norm": 0.08998877555131912, "learning_rate": 1.4578893695061644e-05, "loss": 1.0778, "step": 185 }, { "epoch": 0.36, "grad_norm": 0.09158129245042801, "learning_rate": 1.4523168841444657e-05, "loss": 1.0932, "step": 186 }, { "epoch": 0.36, "grad_norm": 0.09460633993148804, "learning_rate": 1.4467266840872041e-05, "loss": 1.0691, "step": 187 }, { "epoch": 0.37, "grad_norm": 0.09502755105495453, "learning_rate": 1.441118988270916e-05, "loss": 1.0684, "step": 188 }, { "epoch": 0.37, "grad_norm": 0.09307122975587845, "learning_rate": 1.4354940163173486e-05, "loss": 1.0776, "step": 189 }, { "epoch": 0.37, "grad_norm": 0.09580650180578232, "learning_rate": 1.4298519885248574e-05, "loss": 1.0882, "step": 190 }, { "epoch": 0.37, "grad_norm": 0.09251459687948227, "learning_rate": 1.4241931258597781e-05, "loss": 1.077, "step": 191 }, { "epoch": 0.37, "grad_norm": 0.09432998299598694, "learning_rate": 1.4185176499477742e-05, "loss": 1.0012, "step": 192 }, { "epoch": 0.38, "grad_norm": 0.09586652368307114, "learning_rate": 1.4128257830651554e-05, "loss": 1.0334, "step": 193 }, { "epoch": 0.38, "grad_norm": 0.09538242220878601, "learning_rate": 1.407117748130174e-05, "loss": 1.0731, "step": 194 }, { "epoch": 0.38, "grad_norm": 0.09691152721643448, "learning_rate": 1.401393768694292e-05, "loss": 1.0412, "step": 195 }, { "epoch": 0.38, "grad_norm": 0.09779084473848343, "learning_rate": 1.3956540689334286e-05, "loss": 1.0602, "step": 196 }, { "epoch": 0.38, "grad_norm": 0.0998532623052597, "learning_rate": 1.3898988736391792e-05, "loss": 1.0261, "step": 197 }, { "epoch": 0.39, "grad_norm": 0.10739872604608536, "learning_rate": 1.384128408210011e-05, "loss": 1.0502, "step": 198 }, { "epoch": 0.39, "grad_norm": 0.11806387454271317, "learning_rate": 1.3783428986424366e-05, "loss": 1.1188, "step": 199 }, { "epoch": 0.39, "grad_norm": 0.10208501666784286, "learning_rate": 1.3725425715221625e-05, "loss": 1.0465, "step": 200 }, { "epoch": 0.39, "grad_norm": 0.1044783741235733, "learning_rate": 1.3667276540152143e-05, "loss": 1.0561, "step": 201 }, { "epoch": 0.39, "grad_norm": 0.1070132926106453, "learning_rate": 1.3608983738590414e-05, "loss": 1.0429, "step": 202 }, { "epoch": 0.4, "grad_norm": 0.11181865632534027, "learning_rate": 1.3550549593535965e-05, "loss": 1.0564, "step": 203 }, { "epoch": 0.4, "grad_norm": 0.11098324507474899, "learning_rate": 1.3491976393523952e-05, "loss": 1.0632, "step": 204 }, { "epoch": 0.4, "grad_norm": 0.10281454026699066, "learning_rate": 1.343326643253552e-05, "loss": 1.0637, "step": 205 }, { "epoch": 0.4, "grad_norm": 0.10408665239810944, "learning_rate": 1.3374422009907984e-05, "loss": 1.0701, "step": 206 }, { "epoch": 0.4, "grad_norm": 0.10533872246742249, "learning_rate": 1.3315445430244744e-05, "loss": 1.0654, "step": 207 }, { "epoch": 0.41, "grad_norm": 0.10545054078102112, "learning_rate": 1.3256339003325054e-05, "loss": 1.0518, "step": 208 }, { "epoch": 0.41, "grad_norm": 0.09894714504480362, "learning_rate": 1.3197105044013544e-05, "loss": 1.0671, "step": 209 }, { "epoch": 0.41, "grad_norm": 0.08720172196626663, "learning_rate": 1.3137745872169578e-05, "loss": 1.0127, "step": 210 }, { "epoch": 0.41, "grad_norm": 0.08827454596757889, "learning_rate": 1.3078263812556377e-05, "loss": 1.0154, "step": 211 }, { "epoch": 0.41, "grad_norm": 0.0914626345038414, "learning_rate": 1.3018661194749986e-05, "loss": 1.0201, "step": 212 }, { "epoch": 0.42, "grad_norm": 0.08843535929918289, "learning_rate": 1.295894035304803e-05, "loss": 1.0516, "step": 213 }, { "epoch": 0.42, "grad_norm": 0.08639541268348694, "learning_rate": 1.28991036263783e-05, "loss": 1.0165, "step": 214 }, { "epoch": 0.42, "grad_norm": 0.07750130444765091, "learning_rate": 1.2839153358207142e-05, "loss": 1.0223, "step": 215 }, { "epoch": 0.42, "grad_norm": 0.0824190005660057, "learning_rate": 1.2779091896447682e-05, "loss": 1.0337, "step": 216 }, { "epoch": 0.42, "grad_norm": 0.08451572805643082, "learning_rate": 1.2718921593367874e-05, "loss": 1.0542, "step": 217 }, { "epoch": 0.43, "grad_norm": 0.0857366994023323, "learning_rate": 1.2658644805498361e-05, "loss": 1.0759, "step": 218 }, { "epoch": 0.43, "grad_norm": 0.07681415975093842, "learning_rate": 1.2598263893540207e-05, "loss": 1.0506, "step": 219 }, { "epoch": 0.43, "grad_norm": 0.07856535911560059, "learning_rate": 1.2537781222272423e-05, "loss": 1.0974, "step": 220 }, { "epoch": 0.43, "grad_norm": 0.08015410602092743, "learning_rate": 1.2477199160459345e-05, "loss": 1.0604, "step": 221 }, { "epoch": 0.43, "grad_norm": 0.08314133435487747, "learning_rate": 1.2416520080757892e-05, "loss": 1.0889, "step": 222 }, { "epoch": 0.43, "grad_norm": 0.08028203994035721, "learning_rate": 1.2355746359624621e-05, "loss": 1.0281, "step": 223 }, { "epoch": 0.44, "grad_norm": 0.0775797963142395, "learning_rate": 1.2294880377222649e-05, "loss": 1.078, "step": 224 }, { "epoch": 0.44, "grad_norm": 0.08315123617649078, "learning_rate": 1.2233924517328456e-05, "loss": 1.0356, "step": 225 }, { "epoch": 0.44, "grad_norm": 0.0795183852314949, "learning_rate": 1.2172881167238515e-05, "loss": 1.0332, "step": 226 }, { "epoch": 0.44, "grad_norm": 0.0779062882065773, "learning_rate": 1.2111752717675788e-05, "loss": 0.9954, "step": 227 }, { "epoch": 0.44, "grad_norm": 0.07758854329586029, "learning_rate": 1.205054156269611e-05, "loss": 1.0242, "step": 228 }, { "epoch": 0.45, "grad_norm": 0.07713694125413895, "learning_rate": 1.1989250099594412e-05, "loss": 1.0686, "step": 229 }, { "epoch": 0.45, "grad_norm": 0.07772821933031082, "learning_rate": 1.192788072881085e-05, "loss": 1.0338, "step": 230 }, { "epoch": 0.45, "grad_norm": 0.08006665855646133, "learning_rate": 1.1866435853836773e-05, "loss": 1.0946, "step": 231 }, { "epoch": 0.45, "grad_norm": 0.0821637436747551, "learning_rate": 1.1804917881120608e-05, "loss": 1.0525, "step": 232 }, { "epoch": 0.45, "grad_norm": 0.07892850786447525, "learning_rate": 1.1743329219973609e-05, "loss": 1.0127, "step": 233 }, { "epoch": 0.46, "grad_norm": 0.07800798863172531, "learning_rate": 1.1681672282475495e-05, "loss": 1.0254, "step": 234 }, { "epoch": 0.46, "grad_norm": 0.07875402271747589, "learning_rate": 1.161994948337998e-05, "loss": 1.0319, "step": 235 }, { "epoch": 0.46, "grad_norm": 0.08178096264600754, "learning_rate": 1.1558163240020209e-05, "loss": 1.0541, "step": 236 }, { "epoch": 0.46, "grad_norm": 0.08126726001501083, "learning_rate": 1.1496315972214076e-05, "loss": 1.0681, "step": 237 }, { "epoch": 0.46, "grad_norm": 0.08104463666677475, "learning_rate": 1.1434410102169462e-05, "loss": 0.9767, "step": 238 }, { "epoch": 0.47, "grad_norm": 0.0746295303106308, "learning_rate": 1.1372448054389364e-05, "loss": 1.0586, "step": 239 }, { "epoch": 0.47, "grad_norm": 0.08171354979276657, "learning_rate": 1.1310432255576944e-05, "loss": 1.0655, "step": 240 }, { "epoch": 0.47, "grad_norm": 0.08069796115159988, "learning_rate": 1.1248365134540489e-05, "loss": 1.079, "step": 241 }, { "epoch": 0.47, "grad_norm": 0.07922904193401337, "learning_rate": 1.1186249122098282e-05, "loss": 1.0371, "step": 242 }, { "epoch": 0.47, "grad_norm": 0.07877922058105469, "learning_rate": 1.1124086650983415e-05, "loss": 1.0236, "step": 243 }, { "epoch": 0.48, "grad_norm": 0.07606945931911469, "learning_rate": 1.1061880155748497e-05, "loss": 1.0255, "step": 244 }, { "epoch": 0.48, "grad_norm": 0.08225277811288834, "learning_rate": 1.0999632072670314e-05, "loss": 1.0571, "step": 245 }, { "epoch": 0.48, "grad_norm": 0.07907744497060776, "learning_rate": 1.0937344839654416e-05, "loss": 1.0745, "step": 246 }, { "epoch": 0.48, "grad_norm": 0.07885382324457169, "learning_rate": 1.087502089613963e-05, "loss": 0.9899, "step": 247 }, { "epoch": 0.48, "grad_norm": 0.08236192911863327, "learning_rate": 1.0812662683002528e-05, "loss": 1.046, "step": 248 }, { "epoch": 0.49, "grad_norm": 0.08153583109378815, "learning_rate": 1.075027264246183e-05, "loss": 1.0769, "step": 249 }, { "epoch": 0.49, "grad_norm": 0.0847182348370552, "learning_rate": 1.068785321798276e-05, "loss": 1.0695, "step": 250 }, { "epoch": 0.49, "grad_norm": 0.07414229959249496, "learning_rate": 1.062540685418133e-05, "loss": 1.0555, "step": 251 }, { "epoch": 0.49, "grad_norm": 0.07932449132204056, "learning_rate": 1.0562935996728629e-05, "loss": 1.0644, "step": 252 }, { "epoch": 0.49, "grad_norm": 0.08247576653957367, "learning_rate": 1.0500443092255017e-05, "loss": 1.064, "step": 253 }, { "epoch": 0.5, "grad_norm": 0.07860003411769867, "learning_rate": 1.043793058825431e-05, "loss": 1.0579, "step": 254 }, { "epoch": 0.5, "grad_norm": 0.08330255001783371, "learning_rate": 1.0375400932987932e-05, "loss": 1.0218, "step": 255 }, { "epoch": 0.5, "grad_norm": 0.08150562644004822, "learning_rate": 1.0312856575389016e-05, "loss": 1.0379, "step": 256 }, { "epoch": 0.5, "eval_loss": 1.0509783029556274, "eval_runtime": 708.357, "eval_samples_per_second": 7.163, "eval_steps_per_second": 1.791, "step": 256 } ], "logging_steps": 1, "max_steps": 512, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 256, "total_flos": 2.262770368118784e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }