{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9730941704035874, "eval_steps": 500, "global_step": 220, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017937219730941704, "grad_norm": 1.1350524425506592, "learning_rate": 4e-05, "loss": 2.626, "step": 1 }, { "epoch": 0.03587443946188341, "grad_norm": 1.1098030805587769, "learning_rate": 8e-05, "loss": 2.564, "step": 2 }, { "epoch": 0.053811659192825115, "grad_norm": 1.004073143005371, "learning_rate": 0.00012, "loss": 2.5371, "step": 3 }, { "epoch": 0.07174887892376682, "grad_norm": 1.0235692262649536, "learning_rate": 0.00016, "loss": 2.3217, "step": 4 }, { "epoch": 0.08968609865470852, "grad_norm": 0.9896207451820374, "learning_rate": 0.0002, "loss": 2.1635, "step": 5 }, { "epoch": 0.10762331838565023, "grad_norm": 1.0160284042358398, "learning_rate": 0.00019906976744186048, "loss": 1.9974, "step": 6 }, { "epoch": 0.12556053811659193, "grad_norm": 1.6710572242736816, "learning_rate": 0.00019813953488372096, "loss": 1.721, "step": 7 }, { "epoch": 0.14349775784753363, "grad_norm": 1.414752721786499, "learning_rate": 0.0001972093023255814, "loss": 1.5117, "step": 8 }, { "epoch": 0.16143497757847533, "grad_norm": 1.0044751167297363, "learning_rate": 0.00019627906976744185, "loss": 1.3689, "step": 9 }, { "epoch": 0.17937219730941703, "grad_norm": 0.8621335029602051, "learning_rate": 0.00019534883720930232, "loss": 1.3251, "step": 10 }, { "epoch": 0.19730941704035873, "grad_norm": 0.7370575666427612, "learning_rate": 0.0001944186046511628, "loss": 1.2459, "step": 11 }, { "epoch": 0.21524663677130046, "grad_norm": 0.7463206648826599, "learning_rate": 0.00019348837209302326, "loss": 1.1113, "step": 12 }, { "epoch": 0.23318385650224216, "grad_norm": 0.9221929907798767, "learning_rate": 0.00019255813953488374, "loss": 1.1779, "step": 13 }, { "epoch": 0.25112107623318386, "grad_norm": 0.8653731346130371, "learning_rate": 0.0001916279069767442, "loss": 1.056, "step": 14 }, { "epoch": 0.26905829596412556, "grad_norm": 0.8368218541145325, "learning_rate": 0.00019069767441860466, "loss": 1.0355, "step": 15 }, { "epoch": 0.28699551569506726, "grad_norm": 0.9069833755493164, "learning_rate": 0.00018976744186046513, "loss": 1.076, "step": 16 }, { "epoch": 0.30493273542600896, "grad_norm": 1.010295033454895, "learning_rate": 0.00018883720930232557, "loss": 1.037, "step": 17 }, { "epoch": 0.32286995515695066, "grad_norm": 0.9165616631507874, "learning_rate": 0.00018790697674418605, "loss": 1.0841, "step": 18 }, { "epoch": 0.34080717488789236, "grad_norm": 1.2438362836837769, "learning_rate": 0.00018697674418604652, "loss": 0.928, "step": 19 }, { "epoch": 0.35874439461883406, "grad_norm": 1.1386940479278564, "learning_rate": 0.000186046511627907, "loss": 0.9742, "step": 20 }, { "epoch": 0.37668161434977576, "grad_norm": 1.0614705085754395, "learning_rate": 0.00018511627906976744, "loss": 0.986, "step": 21 }, { "epoch": 0.39461883408071746, "grad_norm": 1.1421048641204834, "learning_rate": 0.0001841860465116279, "loss": 0.9306, "step": 22 }, { "epoch": 0.4125560538116592, "grad_norm": 0.9451465606689453, "learning_rate": 0.00018325581395348838, "loss": 0.9888, "step": 23 }, { "epoch": 0.4304932735426009, "grad_norm": 0.753145158290863, "learning_rate": 0.00018232558139534886, "loss": 0.9598, "step": 24 }, { "epoch": 0.4484304932735426, "grad_norm": 0.6006896495819092, "learning_rate": 0.0001813953488372093, "loss": 0.8188, "step": 25 }, { "epoch": 0.4663677130044843, "grad_norm": 0.6499263048171997, "learning_rate": 0.00018046511627906977, "loss": 0.9449, "step": 26 }, { "epoch": 0.484304932735426, "grad_norm": 0.6340591907501221, "learning_rate": 0.00017953488372093025, "loss": 1.0643, "step": 27 }, { "epoch": 0.5022421524663677, "grad_norm": 0.7478179335594177, "learning_rate": 0.0001786046511627907, "loss": 0.9822, "step": 28 }, { "epoch": 0.5201793721973094, "grad_norm": 0.6700637936592102, "learning_rate": 0.00017767441860465117, "loss": 0.9793, "step": 29 }, { "epoch": 0.5381165919282511, "grad_norm": 0.6026176810264587, "learning_rate": 0.00017674418604651164, "loss": 1.0611, "step": 30 }, { "epoch": 0.5560538116591929, "grad_norm": 0.5661296248435974, "learning_rate": 0.0001758139534883721, "loss": 0.9952, "step": 31 }, { "epoch": 0.5739910313901345, "grad_norm": 0.6180285811424255, "learning_rate": 0.00017488372093023258, "loss": 0.9497, "step": 32 }, { "epoch": 0.5919282511210763, "grad_norm": 0.6067416667938232, "learning_rate": 0.00017395348837209303, "loss": 0.9015, "step": 33 }, { "epoch": 0.6098654708520179, "grad_norm": 0.6353489756584167, "learning_rate": 0.00017302325581395348, "loss": 0.9297, "step": 34 }, { "epoch": 0.6278026905829597, "grad_norm": 0.6017511487007141, "learning_rate": 0.00017209302325581395, "loss": 0.8634, "step": 35 }, { "epoch": 0.6457399103139013, "grad_norm": 0.701750636100769, "learning_rate": 0.00017116279069767442, "loss": 0.9088, "step": 36 }, { "epoch": 0.6636771300448431, "grad_norm": 0.6852689385414124, "learning_rate": 0.0001702325581395349, "loss": 0.9195, "step": 37 }, { "epoch": 0.6816143497757847, "grad_norm": 0.6971113681793213, "learning_rate": 0.00016930232558139537, "loss": 0.8599, "step": 38 }, { "epoch": 0.6995515695067265, "grad_norm": 0.6576591730117798, "learning_rate": 0.00016837209302325584, "loss": 0.764, "step": 39 }, { "epoch": 0.7174887892376681, "grad_norm": 0.8312844038009644, "learning_rate": 0.00016744186046511629, "loss": 0.8577, "step": 40 }, { "epoch": 0.7354260089686099, "grad_norm": 0.7586076259613037, "learning_rate": 0.00016651162790697673, "loss": 0.8069, "step": 41 }, { "epoch": 0.7533632286995515, "grad_norm": 0.6356410384178162, "learning_rate": 0.0001655813953488372, "loss": 0.7753, "step": 42 }, { "epoch": 0.7713004484304933, "grad_norm": 0.6421555280685425, "learning_rate": 0.00016465116279069768, "loss": 0.8733, "step": 43 }, { "epoch": 0.7892376681614349, "grad_norm": 0.8002834916114807, "learning_rate": 0.00016372093023255815, "loss": 1.0058, "step": 44 }, { "epoch": 0.8071748878923767, "grad_norm": 0.6567667126655579, "learning_rate": 0.00016279069767441862, "loss": 0.8771, "step": 45 }, { "epoch": 0.8251121076233184, "grad_norm": 0.5926035642623901, "learning_rate": 0.00016186046511627907, "loss": 0.8556, "step": 46 }, { "epoch": 0.8430493273542601, "grad_norm": 0.613197922706604, "learning_rate": 0.00016093023255813954, "loss": 0.85, "step": 47 }, { "epoch": 0.8609865470852018, "grad_norm": 0.7108270525932312, "learning_rate": 0.00016, "loss": 0.8383, "step": 48 }, { "epoch": 0.8789237668161435, "grad_norm": 0.6039162874221802, "learning_rate": 0.00015906976744186046, "loss": 0.898, "step": 49 }, { "epoch": 0.8968609865470852, "grad_norm": 0.6543579697608948, "learning_rate": 0.00015813953488372093, "loss": 0.8442, "step": 50 }, { "epoch": 0.9147982062780269, "grad_norm": 0.6246331334114075, "learning_rate": 0.0001572093023255814, "loss": 0.7756, "step": 51 }, { "epoch": 0.9327354260089686, "grad_norm": 0.6133050322532654, "learning_rate": 0.00015627906976744188, "loss": 0.8701, "step": 52 }, { "epoch": 0.9506726457399103, "grad_norm": 0.6625930070877075, "learning_rate": 0.00015534883720930232, "loss": 0.9818, "step": 53 }, { "epoch": 0.968609865470852, "grad_norm": 0.6724585294723511, "learning_rate": 0.0001544186046511628, "loss": 0.957, "step": 54 }, { "epoch": 0.9865470852017937, "grad_norm": 0.5864427089691162, "learning_rate": 0.00015348837209302327, "loss": 0.7588, "step": 55 }, { "epoch": 1.0134529147982063, "grad_norm": 1.4326505661010742, "learning_rate": 0.00015255813953488374, "loss": 1.2876, "step": 56 }, { "epoch": 1.031390134529148, "grad_norm": 0.7354760766029358, "learning_rate": 0.0001516279069767442, "loss": 0.6682, "step": 57 }, { "epoch": 1.0493273542600896, "grad_norm": 0.7175352573394775, "learning_rate": 0.00015069767441860466, "loss": 0.6862, "step": 58 }, { "epoch": 1.0672645739910314, "grad_norm": 0.7436112761497498, "learning_rate": 0.0001497674418604651, "loss": 0.7319, "step": 59 }, { "epoch": 1.0852017937219731, "grad_norm": 0.7228017449378967, "learning_rate": 0.00014883720930232558, "loss": 0.842, "step": 60 }, { "epoch": 1.1031390134529149, "grad_norm": 0.7761241793632507, "learning_rate": 0.00014790697674418605, "loss": 0.6666, "step": 61 }, { "epoch": 1.1210762331838564, "grad_norm": 0.8203029632568359, "learning_rate": 0.00014697674418604652, "loss": 0.7403, "step": 62 }, { "epoch": 1.1390134529147982, "grad_norm": 0.7372130751609802, "learning_rate": 0.000146046511627907, "loss": 0.7426, "step": 63 }, { "epoch": 1.15695067264574, "grad_norm": 0.7927672863006592, "learning_rate": 0.00014511627906976747, "loss": 0.7717, "step": 64 }, { "epoch": 1.1748878923766817, "grad_norm": 0.7056854367256165, "learning_rate": 0.00014418604651162791, "loss": 0.6855, "step": 65 }, { "epoch": 1.1928251121076232, "grad_norm": 0.8383380174636841, "learning_rate": 0.00014325581395348836, "loss": 0.7222, "step": 66 }, { "epoch": 1.210762331838565, "grad_norm": 0.6172988414764404, "learning_rate": 0.00014232558139534883, "loss": 0.6318, "step": 67 }, { "epoch": 1.2286995515695067, "grad_norm": 0.7639912962913513, "learning_rate": 0.0001413953488372093, "loss": 0.6523, "step": 68 }, { "epoch": 1.2466367713004485, "grad_norm": 0.7420268654823303, "learning_rate": 0.00014046511627906978, "loss": 0.7358, "step": 69 }, { "epoch": 1.2645739910313902, "grad_norm": 0.781150758266449, "learning_rate": 0.00013953488372093025, "loss": 0.6927, "step": 70 }, { "epoch": 1.2825112107623318, "grad_norm": 0.7435458302497864, "learning_rate": 0.00013860465116279072, "loss": 0.7508, "step": 71 }, { "epoch": 1.3004484304932735, "grad_norm": 0.7637338042259216, "learning_rate": 0.00013767441860465117, "loss": 0.5798, "step": 72 }, { "epoch": 1.3183856502242153, "grad_norm": 0.9575199484825134, "learning_rate": 0.00013674418604651162, "loss": 0.8045, "step": 73 }, { "epoch": 1.336322869955157, "grad_norm": 0.836318850517273, "learning_rate": 0.0001358139534883721, "loss": 0.7916, "step": 74 }, { "epoch": 1.3542600896860986, "grad_norm": 0.7818664908409119, "learning_rate": 0.00013488372093023256, "loss": 0.7037, "step": 75 }, { "epoch": 1.3721973094170403, "grad_norm": 0.7612494826316833, "learning_rate": 0.00013395348837209303, "loss": 0.6847, "step": 76 }, { "epoch": 1.390134529147982, "grad_norm": 0.6829874515533447, "learning_rate": 0.0001330232558139535, "loss": 0.6848, "step": 77 }, { "epoch": 1.4080717488789238, "grad_norm": 0.6923062801361084, "learning_rate": 0.00013209302325581395, "loss": 0.6928, "step": 78 }, { "epoch": 1.4260089686098656, "grad_norm": 0.6936827898025513, "learning_rate": 0.00013116279069767442, "loss": 0.8813, "step": 79 }, { "epoch": 1.4439461883408071, "grad_norm": 0.7367523908615112, "learning_rate": 0.0001302325581395349, "loss": 0.7386, "step": 80 }, { "epoch": 1.4618834080717489, "grad_norm": 0.7084159255027771, "learning_rate": 0.00012930232558139534, "loss": 0.604, "step": 81 }, { "epoch": 1.4798206278026906, "grad_norm": 0.817794144153595, "learning_rate": 0.00012837209302325582, "loss": 0.7644, "step": 82 }, { "epoch": 1.4977578475336322, "grad_norm": 0.7807640433311462, "learning_rate": 0.0001274418604651163, "loss": 0.8061, "step": 83 }, { "epoch": 1.515695067264574, "grad_norm": 0.7616767883300781, "learning_rate": 0.00012651162790697676, "loss": 0.7786, "step": 84 }, { "epoch": 1.5336322869955157, "grad_norm": 0.7925138473510742, "learning_rate": 0.0001255813953488372, "loss": 0.6859, "step": 85 }, { "epoch": 1.5515695067264574, "grad_norm": 0.7205699682235718, "learning_rate": 0.00012465116279069768, "loss": 0.7581, "step": 86 }, { "epoch": 1.5695067264573992, "grad_norm": 0.6984810829162598, "learning_rate": 0.00012372093023255815, "loss": 0.6652, "step": 87 }, { "epoch": 1.587443946188341, "grad_norm": 0.7267066836357117, "learning_rate": 0.00012279069767441863, "loss": 0.6419, "step": 88 }, { "epoch": 1.6053811659192825, "grad_norm": 0.7686505913734436, "learning_rate": 0.00012186046511627907, "loss": 0.7497, "step": 89 }, { "epoch": 1.6233183856502242, "grad_norm": 0.755163311958313, "learning_rate": 0.00012093023255813953, "loss": 0.7757, "step": 90 }, { "epoch": 1.6412556053811658, "grad_norm": 0.7927355766296387, "learning_rate": 0.00012, "loss": 0.7754, "step": 91 }, { "epoch": 1.6591928251121075, "grad_norm": 0.6950364708900452, "learning_rate": 0.00011906976744186048, "loss": 0.6059, "step": 92 }, { "epoch": 1.6771300448430493, "grad_norm": 0.7365448474884033, "learning_rate": 0.00011813953488372094, "loss": 0.673, "step": 93 }, { "epoch": 1.695067264573991, "grad_norm": 0.7940488457679749, "learning_rate": 0.00011720930232558141, "loss": 0.5726, "step": 94 }, { "epoch": 1.7130044843049328, "grad_norm": 0.8307069540023804, "learning_rate": 0.00011627906976744187, "loss": 0.6949, "step": 95 }, { "epoch": 1.7309417040358746, "grad_norm": 0.876649796962738, "learning_rate": 0.00011534883720930234, "loss": 0.6415, "step": 96 }, { "epoch": 1.7488789237668163, "grad_norm": 0.9207577109336853, "learning_rate": 0.00011441860465116279, "loss": 0.674, "step": 97 }, { "epoch": 1.7668161434977578, "grad_norm": 0.8050037026405334, "learning_rate": 0.00011348837209302326, "loss": 0.5656, "step": 98 }, { "epoch": 1.7847533632286996, "grad_norm": 0.878441333770752, "learning_rate": 0.00011255813953488372, "loss": 0.7596, "step": 99 }, { "epoch": 1.8026905829596411, "grad_norm": 0.91168612241745, "learning_rate": 0.00011162790697674419, "loss": 0.8177, "step": 100 }, { "epoch": 1.8206278026905829, "grad_norm": 0.7757384777069092, "learning_rate": 0.00011069767441860466, "loss": 0.652, "step": 101 }, { "epoch": 1.8385650224215246, "grad_norm": 0.8266631960868835, "learning_rate": 0.00010976744186046512, "loss": 0.8207, "step": 102 }, { "epoch": 1.8565022421524664, "grad_norm": 0.7787818312644958, "learning_rate": 0.0001088372093023256, "loss": 0.6802, "step": 103 }, { "epoch": 1.8744394618834082, "grad_norm": 0.8642717003822327, "learning_rate": 0.00010790697674418607, "loss": 0.7011, "step": 104 }, { "epoch": 1.89237668161435, "grad_norm": 0.7537955641746521, "learning_rate": 0.00010697674418604651, "loss": 0.6297, "step": 105 }, { "epoch": 1.9103139013452914, "grad_norm": 0.8498083353042603, "learning_rate": 0.00010604651162790697, "loss": 0.6764, "step": 106 }, { "epoch": 1.9282511210762332, "grad_norm": 0.8197365403175354, "learning_rate": 0.00010511627906976745, "loss": 0.7126, "step": 107 }, { "epoch": 1.9461883408071747, "grad_norm": 0.797406792640686, "learning_rate": 0.0001041860465116279, "loss": 0.7664, "step": 108 }, { "epoch": 1.9641255605381165, "grad_norm": 0.8513347506523132, "learning_rate": 0.00010325581395348838, "loss": 0.7572, "step": 109 }, { "epoch": 1.9820627802690582, "grad_norm": 0.8408546447753906, "learning_rate": 0.00010232558139534885, "loss": 0.5291, "step": 110 }, { "epoch": 2.008968609865471, "grad_norm": 1.9233133792877197, "learning_rate": 0.00010139534883720931, "loss": 1.3532, "step": 111 }, { "epoch": 2.0269058295964126, "grad_norm": 0.782871663570404, "learning_rate": 0.00010046511627906978, "loss": 0.4679, "step": 112 }, { "epoch": 2.0448430493273544, "grad_norm": 0.7588692307472229, "learning_rate": 9.953488372093024e-05, "loss": 0.4962, "step": 113 }, { "epoch": 2.062780269058296, "grad_norm": 0.8635579943656921, "learning_rate": 9.86046511627907e-05, "loss": 0.474, "step": 114 }, { "epoch": 2.0807174887892375, "grad_norm": 0.8822935819625854, "learning_rate": 9.767441860465116e-05, "loss": 0.4129, "step": 115 }, { "epoch": 2.098654708520179, "grad_norm": 0.9706945419311523, "learning_rate": 9.674418604651163e-05, "loss": 0.4961, "step": 116 }, { "epoch": 2.116591928251121, "grad_norm": 1.1239269971847534, "learning_rate": 9.58139534883721e-05, "loss": 0.5805, "step": 117 }, { "epoch": 2.1345291479820627, "grad_norm": 1.1263116598129272, "learning_rate": 9.488372093023256e-05, "loss": 0.4252, "step": 118 }, { "epoch": 2.1524663677130045, "grad_norm": 1.2772897481918335, "learning_rate": 9.395348837209302e-05, "loss": 0.4534, "step": 119 }, { "epoch": 2.1704035874439462, "grad_norm": 1.0017895698547363, "learning_rate": 9.30232558139535e-05, "loss": 0.3835, "step": 120 }, { "epoch": 2.188340807174888, "grad_norm": 1.1340886354446411, "learning_rate": 9.209302325581396e-05, "loss": 0.4089, "step": 121 }, { "epoch": 2.2062780269058297, "grad_norm": 1.011704921722412, "learning_rate": 9.116279069767443e-05, "loss": 0.4298, "step": 122 }, { "epoch": 2.2242152466367715, "grad_norm": 0.9829872250556946, "learning_rate": 9.023255813953489e-05, "loss": 0.4568, "step": 123 }, { "epoch": 2.242152466367713, "grad_norm": 1.0418094396591187, "learning_rate": 8.930232558139535e-05, "loss": 0.5086, "step": 124 }, { "epoch": 2.2600896860986546, "grad_norm": 0.9814426302909851, "learning_rate": 8.837209302325582e-05, "loss": 0.5033, "step": 125 }, { "epoch": 2.2780269058295963, "grad_norm": 1.0506559610366821, "learning_rate": 8.744186046511629e-05, "loss": 0.455, "step": 126 }, { "epoch": 2.295964125560538, "grad_norm": 0.9949473738670349, "learning_rate": 8.651162790697674e-05, "loss": 0.4637, "step": 127 }, { "epoch": 2.31390134529148, "grad_norm": 1.0186420679092407, "learning_rate": 8.558139534883721e-05, "loss": 0.4877, "step": 128 }, { "epoch": 2.3318385650224216, "grad_norm": 1.0602444410324097, "learning_rate": 8.465116279069768e-05, "loss": 0.4941, "step": 129 }, { "epoch": 2.3497757847533634, "grad_norm": 1.0107648372650146, "learning_rate": 8.372093023255814e-05, "loss": 0.4339, "step": 130 }, { "epoch": 2.367713004484305, "grad_norm": 1.1475372314453125, "learning_rate": 8.27906976744186e-05, "loss": 0.5158, "step": 131 }, { "epoch": 2.3856502242152464, "grad_norm": 1.0330064296722412, "learning_rate": 8.186046511627907e-05, "loss": 0.4266, "step": 132 }, { "epoch": 2.403587443946188, "grad_norm": 1.3457512855529785, "learning_rate": 8.093023255813953e-05, "loss": 0.617, "step": 133 }, { "epoch": 2.42152466367713, "grad_norm": 1.1562917232513428, "learning_rate": 8e-05, "loss": 0.4976, "step": 134 }, { "epoch": 2.4394618834080717, "grad_norm": 1.087751030921936, "learning_rate": 7.906976744186047e-05, "loss": 0.4836, "step": 135 }, { "epoch": 2.4573991031390134, "grad_norm": 1.0045045614242554, "learning_rate": 7.813953488372094e-05, "loss": 0.4294, "step": 136 }, { "epoch": 2.475336322869955, "grad_norm": 1.0355446338653564, "learning_rate": 7.72093023255814e-05, "loss": 0.4472, "step": 137 }, { "epoch": 2.493273542600897, "grad_norm": 1.163203239440918, "learning_rate": 7.627906976744187e-05, "loss": 0.4853, "step": 138 }, { "epoch": 2.5112107623318387, "grad_norm": 1.0705980062484741, "learning_rate": 7.534883720930233e-05, "loss": 0.4142, "step": 139 }, { "epoch": 2.5291479820627805, "grad_norm": 1.172975778579712, "learning_rate": 7.441860465116279e-05, "loss": 0.5348, "step": 140 }, { "epoch": 2.547085201793722, "grad_norm": 0.9890033006668091, "learning_rate": 7.348837209302326e-05, "loss": 0.3867, "step": 141 }, { "epoch": 2.5650224215246635, "grad_norm": 1.3716145753860474, "learning_rate": 7.255813953488373e-05, "loss": 0.4829, "step": 142 }, { "epoch": 2.5829596412556053, "grad_norm": 1.1363354921340942, "learning_rate": 7.162790697674418e-05, "loss": 0.4112, "step": 143 }, { "epoch": 2.600896860986547, "grad_norm": 1.180514931678772, "learning_rate": 7.069767441860465e-05, "loss": 0.4212, "step": 144 }, { "epoch": 2.618834080717489, "grad_norm": 1.1589065790176392, "learning_rate": 6.976744186046513e-05, "loss": 0.4382, "step": 145 }, { "epoch": 2.6367713004484306, "grad_norm": 1.1208486557006836, "learning_rate": 6.883720930232558e-05, "loss": 0.4852, "step": 146 }, { "epoch": 2.6547085201793723, "grad_norm": 1.1670925617218018, "learning_rate": 6.790697674418604e-05, "loss": 0.4801, "step": 147 }, { "epoch": 2.672645739910314, "grad_norm": 1.1497890949249268, "learning_rate": 6.697674418604652e-05, "loss": 0.4581, "step": 148 }, { "epoch": 2.6905829596412554, "grad_norm": 1.1380338668823242, "learning_rate": 6.604651162790698e-05, "loss": 0.4974, "step": 149 }, { "epoch": 2.708520179372197, "grad_norm": 1.2095478773117065, "learning_rate": 6.511627906976745e-05, "loss": 0.4958, "step": 150 }, { "epoch": 2.726457399103139, "grad_norm": 1.1369256973266602, "learning_rate": 6.418604651162791e-05, "loss": 0.4518, "step": 151 }, { "epoch": 2.7443946188340806, "grad_norm": 1.1578013896942139, "learning_rate": 6.325581395348838e-05, "loss": 0.5655, "step": 152 }, { "epoch": 2.7623318385650224, "grad_norm": 1.0805268287658691, "learning_rate": 6.232558139534884e-05, "loss": 0.5004, "step": 153 }, { "epoch": 2.780269058295964, "grad_norm": 1.1408129930496216, "learning_rate": 6.139534883720931e-05, "loss": 0.4059, "step": 154 }, { "epoch": 2.798206278026906, "grad_norm": 1.0206074714660645, "learning_rate": 6.0465116279069765e-05, "loss": 0.4119, "step": 155 }, { "epoch": 2.8161434977578477, "grad_norm": 0.9685718417167664, "learning_rate": 5.953488372093024e-05, "loss": 0.4574, "step": 156 }, { "epoch": 2.8340807174887894, "grad_norm": 1.0425866842269897, "learning_rate": 5.8604651162790704e-05, "loss": 0.4774, "step": 157 }, { "epoch": 2.852017937219731, "grad_norm": 1.0325255393981934, "learning_rate": 5.767441860465117e-05, "loss": 0.4317, "step": 158 }, { "epoch": 2.8699551569506725, "grad_norm": 1.0784574747085571, "learning_rate": 5.674418604651163e-05, "loss": 0.3921, "step": 159 }, { "epoch": 2.8878923766816142, "grad_norm": 1.081007957458496, "learning_rate": 5.5813953488372095e-05, "loss": 0.4515, "step": 160 }, { "epoch": 2.905829596412556, "grad_norm": 1.1916303634643555, "learning_rate": 5.488372093023256e-05, "loss": 0.4556, "step": 161 }, { "epoch": 2.9237668161434978, "grad_norm": 1.2342188358306885, "learning_rate": 5.3953488372093034e-05, "loss": 0.4772, "step": 162 }, { "epoch": 2.9417040358744395, "grad_norm": 1.0315567255020142, "learning_rate": 5.3023255813953486e-05, "loss": 0.3233, "step": 163 }, { "epoch": 2.9596412556053813, "grad_norm": 1.3380693197250366, "learning_rate": 5.209302325581395e-05, "loss": 0.5098, "step": 164 }, { "epoch": 2.977578475336323, "grad_norm": 1.2268236875534058, "learning_rate": 5.1162790697674425e-05, "loss": 0.4125, "step": 165 }, { "epoch": 3.004484304932735, "grad_norm": 2.589136838912964, "learning_rate": 5.023255813953489e-05, "loss": 0.7637, "step": 166 }, { "epoch": 3.022421524663677, "grad_norm": 0.9620673060417175, "learning_rate": 4.930232558139535e-05, "loss": 0.2501, "step": 167 }, { "epoch": 3.0403587443946187, "grad_norm": 1.0851328372955322, "learning_rate": 4.8372093023255816e-05, "loss": 0.3299, "step": 168 }, { "epoch": 3.0582959641255605, "grad_norm": 1.081047773361206, "learning_rate": 4.744186046511628e-05, "loss": 0.319, "step": 169 }, { "epoch": 3.0762331838565022, "grad_norm": 0.9016939997673035, "learning_rate": 4.651162790697675e-05, "loss": 0.2505, "step": 170 }, { "epoch": 3.094170403587444, "grad_norm": 1.279685616493225, "learning_rate": 4.5581395348837214e-05, "loss": 0.3574, "step": 171 }, { "epoch": 3.1121076233183858, "grad_norm": 1.1288567781448364, "learning_rate": 4.465116279069767e-05, "loss": 0.2322, "step": 172 }, { "epoch": 3.1300448430493275, "grad_norm": 1.0982707738876343, "learning_rate": 4.3720930232558146e-05, "loss": 0.2557, "step": 173 }, { "epoch": 3.1479820627802693, "grad_norm": 1.2716487646102905, "learning_rate": 4.2790697674418605e-05, "loss": 0.2683, "step": 174 }, { "epoch": 3.1659192825112106, "grad_norm": 1.277907371520996, "learning_rate": 4.186046511627907e-05, "loss": 0.3359, "step": 175 }, { "epoch": 3.1838565022421523, "grad_norm": 1.767809510231018, "learning_rate": 4.093023255813954e-05, "loss": 0.3141, "step": 176 }, { "epoch": 3.201793721973094, "grad_norm": 1.5723196268081665, "learning_rate": 4e-05, "loss": 0.2696, "step": 177 }, { "epoch": 3.219730941704036, "grad_norm": 1.2438582181930542, "learning_rate": 3.906976744186047e-05, "loss": 0.2261, "step": 178 }, { "epoch": 3.2376681614349776, "grad_norm": 1.3772393465042114, "learning_rate": 3.8139534883720935e-05, "loss": 0.2699, "step": 179 }, { "epoch": 3.2556053811659194, "grad_norm": 1.1731289625167847, "learning_rate": 3.7209302325581394e-05, "loss": 0.2809, "step": 180 }, { "epoch": 3.273542600896861, "grad_norm": 1.3203359842300415, "learning_rate": 3.627906976744187e-05, "loss": 0.2798, "step": 181 }, { "epoch": 3.291479820627803, "grad_norm": 1.0982232093811035, "learning_rate": 3.5348837209302326e-05, "loss": 0.2503, "step": 182 }, { "epoch": 3.3094170403587446, "grad_norm": 1.2753369808197021, "learning_rate": 3.441860465116279e-05, "loss": 0.2893, "step": 183 }, { "epoch": 3.327354260089686, "grad_norm": 1.2293989658355713, "learning_rate": 3.348837209302326e-05, "loss": 0.2585, "step": 184 }, { "epoch": 3.3452914798206277, "grad_norm": 1.3043240308761597, "learning_rate": 3.2558139534883724e-05, "loss": 0.2203, "step": 185 }, { "epoch": 3.3632286995515694, "grad_norm": 1.1034027338027954, "learning_rate": 3.162790697674419e-05, "loss": 0.2501, "step": 186 }, { "epoch": 3.381165919282511, "grad_norm": 0.9731037020683289, "learning_rate": 3.0697674418604656e-05, "loss": 0.1944, "step": 187 }, { "epoch": 3.399103139013453, "grad_norm": 1.069287657737732, "learning_rate": 2.976744186046512e-05, "loss": 0.2041, "step": 188 }, { "epoch": 3.4170403587443947, "grad_norm": 1.3233182430267334, "learning_rate": 2.8837209302325585e-05, "loss": 0.2713, "step": 189 }, { "epoch": 3.4349775784753365, "grad_norm": 1.2428154945373535, "learning_rate": 2.7906976744186048e-05, "loss": 0.2002, "step": 190 }, { "epoch": 3.452914798206278, "grad_norm": 1.20328688621521, "learning_rate": 2.6976744186046517e-05, "loss": 0.2646, "step": 191 }, { "epoch": 3.4708520179372195, "grad_norm": 1.3479125499725342, "learning_rate": 2.6046511627906976e-05, "loss": 0.2911, "step": 192 }, { "epoch": 3.4887892376681613, "grad_norm": 1.2266180515289307, "learning_rate": 2.5116279069767445e-05, "loss": 0.2811, "step": 193 }, { "epoch": 3.506726457399103, "grad_norm": 1.2345128059387207, "learning_rate": 2.4186046511627908e-05, "loss": 0.2237, "step": 194 }, { "epoch": 3.524663677130045, "grad_norm": 1.3437424898147583, "learning_rate": 2.3255813953488374e-05, "loss": 0.283, "step": 195 }, { "epoch": 3.5426008968609866, "grad_norm": 1.4216517210006714, "learning_rate": 2.2325581395348837e-05, "loss": 0.2871, "step": 196 }, { "epoch": 3.5605381165919283, "grad_norm": 1.1113003492355347, "learning_rate": 2.1395348837209303e-05, "loss": 0.219, "step": 197 }, { "epoch": 3.57847533632287, "grad_norm": 1.592371940612793, "learning_rate": 2.046511627906977e-05, "loss": 0.3161, "step": 198 }, { "epoch": 3.596412556053812, "grad_norm": 1.2963297367095947, "learning_rate": 1.9534883720930235e-05, "loss": 0.2242, "step": 199 }, { "epoch": 3.6143497757847536, "grad_norm": 1.1588383913040161, "learning_rate": 1.8604651162790697e-05, "loss": 0.2079, "step": 200 }, { "epoch": 3.6322869955156953, "grad_norm": 1.2683604955673218, "learning_rate": 1.7674418604651163e-05, "loss": 0.2501, "step": 201 }, { "epoch": 3.6502242152466366, "grad_norm": 1.3655322790145874, "learning_rate": 1.674418604651163e-05, "loss": 0.2299, "step": 202 }, { "epoch": 3.6681614349775784, "grad_norm": 1.3018665313720703, "learning_rate": 1.5813953488372095e-05, "loss": 0.2148, "step": 203 }, { "epoch": 3.68609865470852, "grad_norm": 1.388330101966858, "learning_rate": 1.488372093023256e-05, "loss": 0.2512, "step": 204 }, { "epoch": 3.704035874439462, "grad_norm": 1.535142421722412, "learning_rate": 1.3953488372093024e-05, "loss": 0.2579, "step": 205 }, { "epoch": 3.7219730941704037, "grad_norm": 1.4287734031677246, "learning_rate": 1.3023255813953488e-05, "loss": 0.2708, "step": 206 }, { "epoch": 3.7399103139013454, "grad_norm": 1.5674840211868286, "learning_rate": 1.2093023255813954e-05, "loss": 0.3017, "step": 207 }, { "epoch": 3.7578475336322867, "grad_norm": 1.261733889579773, "learning_rate": 1.1162790697674418e-05, "loss": 0.2584, "step": 208 }, { "epoch": 3.7757847533632285, "grad_norm": 1.4881441593170166, "learning_rate": 1.0232558139534884e-05, "loss": 0.3086, "step": 209 }, { "epoch": 3.7937219730941703, "grad_norm": 1.1449949741363525, "learning_rate": 9.302325581395349e-06, "loss": 0.2277, "step": 210 }, { "epoch": 3.811659192825112, "grad_norm": 1.3948498964309692, "learning_rate": 8.372093023255815e-06, "loss": 0.2668, "step": 211 }, { "epoch": 3.8295964125560538, "grad_norm": 1.1462297439575195, "learning_rate": 7.44186046511628e-06, "loss": 0.2142, "step": 212 }, { "epoch": 3.8475336322869955, "grad_norm": 1.4967782497406006, "learning_rate": 6.511627906976744e-06, "loss": 0.275, "step": 213 }, { "epoch": 3.8654708520179373, "grad_norm": 1.3958649635314941, "learning_rate": 5.581395348837209e-06, "loss": 0.2833, "step": 214 }, { "epoch": 3.883408071748879, "grad_norm": 1.4644280672073364, "learning_rate": 4.651162790697674e-06, "loss": 0.3475, "step": 215 }, { "epoch": 3.901345291479821, "grad_norm": 1.3760302066802979, "learning_rate": 3.72093023255814e-06, "loss": 0.2892, "step": 216 }, { "epoch": 3.9192825112107625, "grad_norm": 1.320532202720642, "learning_rate": 2.7906976744186046e-06, "loss": 0.2542, "step": 217 }, { "epoch": 3.9372197309417043, "grad_norm": 1.1825841665267944, "learning_rate": 1.86046511627907e-06, "loss": 0.2306, "step": 218 }, { "epoch": 3.9551569506726456, "grad_norm": 1.3918488025665283, "learning_rate": 9.30232558139535e-07, "loss": 0.2958, "step": 219 }, { "epoch": 3.9730941704035874, "grad_norm": 1.3532480001449585, "learning_rate": 0.0, "loss": 0.2748, "step": 220 } ], "logging_steps": 1, "max_steps": 220, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9925730977234944.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }