{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9982788296041308, "eval_steps": 500, "global_step": 290, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.222222222222222e-06, "loss": 1.0458, "step": 1 }, { "epoch": 0.01, "learning_rate": 4.444444444444444e-06, "loss": 1.0521, "step": 2 }, { "epoch": 0.01, "learning_rate": 6.666666666666667e-06, "loss": 0.9549, "step": 3 }, { "epoch": 0.01, "learning_rate": 8.888888888888888e-06, "loss": 0.7315, "step": 4 }, { "epoch": 0.02, "learning_rate": 1.1111111111111113e-05, "loss": 0.6335, "step": 5 }, { "epoch": 0.02, "learning_rate": 1.3333333333333333e-05, "loss": 1.0116, "step": 6 }, { "epoch": 0.02, "learning_rate": 1.555555555555556e-05, "loss": 0.5621, "step": 7 }, { "epoch": 0.03, "learning_rate": 1.7777777777777777e-05, "loss": 0.7761, "step": 8 }, { "epoch": 0.03, "learning_rate": 2e-05, "loss": 0.7048, "step": 9 }, { "epoch": 0.03, "learning_rate": 1.9999375039475278e-05, "loss": 0.6919, "step": 10 }, { "epoch": 0.04, "learning_rate": 1.9997500236016233e-05, "loss": 0.6744, "step": 11 }, { "epoch": 0.04, "learning_rate": 1.9994375823958504e-05, "loss": 0.571, "step": 12 }, { "epoch": 0.04, "learning_rate": 1.9990002193828923e-05, "loss": 0.5849, "step": 13 }, { "epoch": 0.05, "learning_rate": 1.998437989229673e-05, "loss": 0.5264, "step": 14 }, { "epoch": 0.05, "learning_rate": 1.9977509622105233e-05, "loss": 0.5171, "step": 15 }, { "epoch": 0.06, "learning_rate": 1.9969392241983957e-05, "loss": 0.6064, "step": 16 }, { "epoch": 0.06, "learning_rate": 1.9960028766541336e-05, "loss": 1.419, "step": 17 }, { "epoch": 0.06, "learning_rate": 1.994942036613787e-05, "loss": 0.605, "step": 18 }, { "epoch": 0.07, "learning_rate": 1.9937568366739858e-05, "loss": 0.6079, "step": 19 }, { "epoch": 0.07, "learning_rate": 1.9924474249753656e-05, "loss": 0.6545, "step": 20 }, { "epoch": 0.07, "learning_rate": 1.9910139651840497e-05, "loss": 0.5648, "step": 21 }, { "epoch": 0.08, "learning_rate": 1.9894566364711965e-05, "loss": 0.4007, "step": 22 }, { "epoch": 0.08, "learning_rate": 1.9877756334905983e-05, "loss": 0.3545, "step": 23 }, { "epoch": 0.08, "learning_rate": 1.9859711663543573e-05, "loss": 0.3676, "step": 24 }, { "epoch": 0.09, "learning_rate": 1.9840434606066182e-05, "loss": 0.3657, "step": 25 }, { "epoch": 0.09, "learning_rate": 1.9819927571953804e-05, "loss": 0.2507, "step": 26 }, { "epoch": 0.09, "learning_rate": 1.9798193124423804e-05, "loss": 0.1892, "step": 27 }, { "epoch": 0.1, "learning_rate": 1.9775233980110524e-05, "loss": 0.2312, "step": 28 }, { "epoch": 0.1, "learning_rate": 1.9751053008725736e-05, "loss": 0.13, "step": 29 }, { "epoch": 0.1, "learning_rate": 1.9725653232699962e-05, "loss": 0.2087, "step": 30 }, { "epoch": 0.11, "learning_rate": 1.969903782680467e-05, "loss": 0.1707, "step": 31 }, { "epoch": 0.11, "learning_rate": 1.967121011775546e-05, "loss": 0.1929, "step": 32 }, { "epoch": 0.11, "learning_rate": 1.9642173583796265e-05, "loss": 0.7225, "step": 33 }, { "epoch": 0.12, "learning_rate": 1.961193185426459e-05, "loss": 0.1488, "step": 34 }, { "epoch": 0.12, "learning_rate": 1.958048870913786e-05, "loss": 0.2369, "step": 35 }, { "epoch": 0.12, "learning_rate": 1.9547848078560975e-05, "loss": 0.2356, "step": 36 }, { "epoch": 0.13, "learning_rate": 1.9514014042355057e-05, "loss": 0.3123, "step": 37 }, { "epoch": 0.13, "learning_rate": 1.9478990829507507e-05, "loss": 0.2355, "step": 38 }, { "epoch": 0.13, "learning_rate": 1.9442782817643425e-05, "loss": 0.2231, "step": 39 }, { "epoch": 0.14, "learning_rate": 1.9405394532478422e-05, "loss": 0.158, "step": 40 }, { "epoch": 0.14, "learning_rate": 1.9366830647252974e-05, "loss": 0.2163, "step": 41 }, { "epoch": 0.14, "learning_rate": 1.9327095982148258e-05, "loss": 0.2184, "step": 42 }, { "epoch": 0.15, "learning_rate": 1.928619550368371e-05, "loss": 0.1329, "step": 43 }, { "epoch": 0.15, "learning_rate": 1.9244134324096223e-05, "loss": 0.0903, "step": 44 }, { "epoch": 0.15, "learning_rate": 1.9200917700701176e-05, "loss": 0.134, "step": 45 }, { "epoch": 0.16, "learning_rate": 1.915655103523529e-05, "loss": 0.1293, "step": 46 }, { "epoch": 0.16, "learning_rate": 1.9111039873181478e-05, "loss": 0.0764, "step": 47 }, { "epoch": 0.17, "learning_rate": 1.9064389903075676e-05, "loss": 0.0527, "step": 48 }, { "epoch": 0.17, "learning_rate": 1.901660695579585e-05, "loss": 0.0677, "step": 49 }, { "epoch": 0.17, "learning_rate": 1.8967697003833156e-05, "loss": 0.0862, "step": 50 }, { "epoch": 0.18, "learning_rate": 1.8917666160545446e-05, "loss": 0.0737, "step": 51 }, { "epoch": 0.18, "learning_rate": 1.8866520679393127e-05, "loss": 0.028, "step": 52 }, { "epoch": 0.18, "learning_rate": 1.8814266953157557e-05, "loss": 0.0202, "step": 53 }, { "epoch": 0.19, "learning_rate": 1.876091151314196e-05, "loss": 0.0498, "step": 54 }, { "epoch": 0.19, "learning_rate": 1.8706461028355107e-05, "loss": 0.0222, "step": 55 }, { "epoch": 0.19, "learning_rate": 1.865092230467769e-05, "loss": 0.0122, "step": 56 }, { "epoch": 0.2, "learning_rate": 1.8594302284011704e-05, "loss": 0.0364, "step": 57 }, { "epoch": 0.2, "learning_rate": 1.85366080434127e-05, "loss": 0.0184, "step": 58 }, { "epoch": 0.2, "learning_rate": 1.8477846794205258e-05, "loss": 0.0379, "step": 59 }, { "epoch": 0.21, "learning_rate": 1.8418025881081612e-05, "loss": 0.0095, "step": 60 }, { "epoch": 0.21, "learning_rate": 1.8357152781183606e-05, "loss": 0.0144, "step": 61 }, { "epoch": 0.21, "learning_rate": 1.829523510316813e-05, "loss": 0.0103, "step": 62 }, { "epoch": 0.22, "learning_rate": 1.82322805862561e-05, "loss": 0.019, "step": 63 }, { "epoch": 0.22, "learning_rate": 1.8168297099265094e-05, "loss": 0.0846, "step": 64 }, { "epoch": 0.22, "learning_rate": 1.810329263962584e-05, "loss": 0.027, "step": 65 }, { "epoch": 0.23, "learning_rate": 1.803727533238257e-05, "loss": 0.0468, "step": 66 }, { "epoch": 0.23, "learning_rate": 1.7970253429177477e-05, "loss": 0.0446, "step": 67 }, { "epoch": 0.23, "learning_rate": 1.7902235307219333e-05, "loss": 0.0086, "step": 68 }, { "epoch": 0.24, "learning_rate": 1.7833229468236367e-05, "loss": 0.0284, "step": 69 }, { "epoch": 0.24, "learning_rate": 1.776324453741365e-05, "loss": 0.0302, "step": 70 }, { "epoch": 0.24, "learning_rate": 1.7692289262315e-05, "loss": 0.0113, "step": 71 }, { "epoch": 0.25, "learning_rate": 1.7620372511789607e-05, "loss": 0.0043, "step": 72 }, { "epoch": 0.25, "learning_rate": 1.75475032748635e-05, "loss": 0.0113, "step": 73 }, { "epoch": 0.25, "learning_rate": 1.747369065961599e-05, "loss": 0.0051, "step": 74 }, { "epoch": 0.26, "learning_rate": 1.7398943892041223e-05, "loss": 0.0087, "step": 75 }, { "epoch": 0.26, "learning_rate": 1.7323272314895022e-05, "loss": 0.0135, "step": 76 }, { "epoch": 0.27, "learning_rate": 1.7246685386527098e-05, "loss": 0.0077, "step": 77 }, { "epoch": 0.27, "learning_rate": 1.7169192679698837e-05, "loss": 0.0173, "step": 78 }, { "epoch": 0.27, "learning_rate": 1.7090803880386784e-05, "loss": 0.0023, "step": 79 }, { "epoch": 0.28, "learning_rate": 1.701152878657197e-05, "loss": 0.0216, "step": 80 }, { "epoch": 0.28, "learning_rate": 1.693137730701524e-05, "loss": 0.0072, "step": 81 }, { "epoch": 0.28, "learning_rate": 1.6850359460018737e-05, "loss": 0.0078, "step": 82 }, { "epoch": 0.29, "learning_rate": 1.6768485372173696e-05, "loss": 0.0146, "step": 83 }, { "epoch": 0.29, "learning_rate": 1.6685765277094702e-05, "loss": 0.0039, "step": 84 }, { "epoch": 0.29, "learning_rate": 1.6602209514140552e-05, "loss": 0.0218, "step": 85 }, { "epoch": 0.3, "learning_rate": 1.6517828527121942e-05, "loss": 0.0135, "step": 86 }, { "epoch": 0.3, "learning_rate": 1.6432632862996056e-05, "loss": 0.0046, "step": 87 }, { "epoch": 0.3, "learning_rate": 1.634663317054829e-05, "loss": 0.0108, "step": 88 }, { "epoch": 0.31, "learning_rate": 1.6259840199061215e-05, "loss": 0.0207, "step": 89 }, { "epoch": 0.31, "learning_rate": 1.617226479697105e-05, "loss": 0.0337, "step": 90 }, { "epoch": 0.31, "learning_rate": 1.608391791051163e-05, "loss": 0.0125, "step": 91 }, { "epoch": 0.32, "learning_rate": 1.599481058234626e-05, "loss": 0.01, "step": 92 }, { "epoch": 0.32, "learning_rate": 1.5904953950187458e-05, "loss": 0.0128, "step": 93 }, { "epoch": 0.32, "learning_rate": 1.5814359245404818e-05, "loss": 0.0059, "step": 94 }, { "epoch": 0.33, "learning_rate": 1.5723037791621193e-05, "loss": 0.0134, "step": 95 }, { "epoch": 0.33, "learning_rate": 1.563100100329731e-05, "loss": 0.0104, "step": 96 }, { "epoch": 0.33, "learning_rate": 1.5538260384305076e-05, "loss": 0.0145, "step": 97 }, { "epoch": 0.34, "learning_rate": 1.5444827526489675e-05, "loss": 0.0104, "step": 98 }, { "epoch": 0.34, "learning_rate": 1.5350714108220673e-05, "loss": 0.0105, "step": 99 }, { "epoch": 0.34, "learning_rate": 1.5255931892932333e-05, "loss": 0.0128, "step": 100 }, { "epoch": 0.35, "learning_rate": 1.5160492727653241e-05, "loss": 0.0093, "step": 101 }, { "epoch": 0.35, "learning_rate": 1.5064408541525573e-05, "loss": 0.0226, "step": 102 }, { "epoch": 0.35, "learning_rate": 1.4967691344313995e-05, "loss": 0.0089, "step": 103 }, { "epoch": 0.36, "learning_rate": 1.4870353224904572e-05, "loss": 0.0259, "step": 104 }, { "epoch": 0.36, "learning_rate": 1.4772406349793744e-05, "loss": 0.0189, "step": 105 }, { "epoch": 0.36, "learning_rate": 1.4673862961567602e-05, "loss": 0.0119, "step": 106 }, { "epoch": 0.37, "learning_rate": 1.457473537737167e-05, "loss": 0.0115, "step": 107 }, { "epoch": 0.37, "learning_rate": 1.4475035987371355e-05, "loss": 0.0027, "step": 108 }, { "epoch": 0.38, "learning_rate": 1.4374777253203273e-05, "loss": 0.008, "step": 109 }, { "epoch": 0.38, "learning_rate": 1.4273971706417648e-05, "loss": 0.0108, "step": 110 }, { "epoch": 0.38, "learning_rate": 1.4172631946911964e-05, "loss": 0.0054, "step": 111 }, { "epoch": 0.39, "learning_rate": 1.407077064135607e-05, "loss": 0.0064, "step": 112 }, { "epoch": 0.39, "learning_rate": 1.3968400521608969e-05, "loss": 0.0042, "step": 113 }, { "epoch": 0.39, "learning_rate": 1.3865534383127406e-05, "loss": 0.0101, "step": 114 }, { "epoch": 0.4, "learning_rate": 1.3762185083366557e-05, "loss": 0.0121, "step": 115 }, { "epoch": 0.4, "learning_rate": 1.3658365540172948e-05, "loss": 0.0021, "step": 116 }, { "epoch": 0.4, "learning_rate": 1.3554088730169814e-05, "loss": 0.0067, "step": 117 }, { "epoch": 0.41, "learning_rate": 1.3449367687135134e-05, "loss": 0.0303, "step": 118 }, { "epoch": 0.41, "learning_rate": 1.334421550037251e-05, "loss": 0.0012, "step": 119 }, { "epoch": 0.41, "learning_rate": 1.3238645313075104e-05, "loss": 0.0014, "step": 120 }, { "epoch": 0.42, "learning_rate": 1.313267032068285e-05, "loss": 0.0087, "step": 121 }, { "epoch": 0.42, "learning_rate": 1.3026303769233112e-05, "loss": 0.0086, "step": 122 }, { "epoch": 0.42, "learning_rate": 1.2919558953705055e-05, "loss": 0.0008, "step": 123 }, { "epoch": 0.43, "learning_rate": 1.2812449216357863e-05, "loss": 0.0064, "step": 124 }, { "epoch": 0.43, "learning_rate": 1.270498794506307e-05, "loss": 0.0101, "step": 125 }, { "epoch": 0.43, "learning_rate": 1.259718857163117e-05, "loss": 0.0168, "step": 126 }, { "epoch": 0.44, "learning_rate": 1.2489064570132764e-05, "loss": 0.013, "step": 127 }, { "epoch": 0.44, "learning_rate": 1.2380629455214392e-05, "loss": 0.0308, "step": 128 }, { "epoch": 0.44, "learning_rate": 1.2271896780409321e-05, "loss": 0.0053, "step": 129 }, { "epoch": 0.45, "learning_rate": 1.2162880136443447e-05, "loss": 0.0038, "step": 130 }, { "epoch": 0.45, "learning_rate": 1.2053593149536576e-05, "loss": 0.0014, "step": 131 }, { "epoch": 0.45, "learning_rate": 1.1944049479699244e-05, "loss": 0.0046, "step": 132 }, { "epoch": 0.46, "learning_rate": 1.1834262819025326e-05, "loss": 0.0309, "step": 133 }, { "epoch": 0.46, "learning_rate": 1.1724246889980638e-05, "loss": 0.0022, "step": 134 }, { "epoch": 0.46, "learning_rate": 1.1614015443687723e-05, "loss": 0.0189, "step": 135 }, { "epoch": 0.47, "learning_rate": 1.150358225820709e-05, "loss": 0.0115, "step": 136 }, { "epoch": 0.47, "learning_rate": 1.1392961136815046e-05, "loss": 0.0218, "step": 137 }, { "epoch": 0.48, "learning_rate": 1.1282165906278402e-05, "loss": 0.0017, "step": 138 }, { "epoch": 0.48, "learning_rate": 1.1171210415126248e-05, "loss": 0.0144, "step": 139 }, { "epoch": 0.48, "learning_rate": 1.1060108531918972e-05, "loss": 0.0081, "step": 140 }, { "epoch": 0.49, "learning_rate": 1.094887414351482e-05, "loss": 0.0043, "step": 141 }, { "epoch": 0.49, "learning_rate": 1.0837521153334143e-05, "loss": 0.0025, "step": 142 }, { "epoch": 0.49, "learning_rate": 1.0726063479621574e-05, "loss": 0.0041, "step": 143 }, { "epoch": 0.5, "learning_rate": 1.0614515053706367e-05, "loss": 0.002, "step": 144 }, { "epoch": 0.5, "learning_rate": 1.0502889818261075e-05, "loss": 0.0018, "step": 145 }, { "epoch": 0.5, "learning_rate": 1.0391201725558842e-05, "loss": 0.0008, "step": 146 }, { "epoch": 0.51, "learning_rate": 1.0279464735729472e-05, "loss": 0.0027, "step": 147 }, { "epoch": 0.51, "learning_rate": 1.0167692815014527e-05, "loss": 0.0062, "step": 148 }, { "epoch": 0.51, "learning_rate": 1.0055899934021649e-05, "loss": 0.0018, "step": 149 }, { "epoch": 0.52, "learning_rate": 9.944100065978351e-06, "loss": 0.001, "step": 150 }, { "epoch": 0.52, "learning_rate": 9.832307184985475e-06, "loss": 0.0033, "step": 151 }, { "epoch": 0.52, "learning_rate": 9.720535264270529e-06, "loss": 0.0048, "step": 152 }, { "epoch": 0.53, "learning_rate": 9.60879827444116e-06, "loss": 0.0244, "step": 153 }, { "epoch": 0.53, "learning_rate": 9.497110181738928e-06, "loss": 0.0004, "step": 154 }, { "epoch": 0.53, "learning_rate": 9.385484946293636e-06, "loss": 0.0018, "step": 155 }, { "epoch": 0.54, "learning_rate": 9.273936520378428e-06, "loss": 0.0015, "step": 156 }, { "epoch": 0.54, "learning_rate": 9.16247884666586e-06, "loss": 0.0008, "step": 157 }, { "epoch": 0.54, "learning_rate": 9.051125856485183e-06, "loss": 0.0049, "step": 158 }, { "epoch": 0.55, "learning_rate": 8.939891468081033e-06, "loss": 0.0224, "step": 159 }, { "epoch": 0.55, "learning_rate": 8.828789584873754e-06, "loss": 0.0282, "step": 160 }, { "epoch": 0.55, "learning_rate": 8.717834093721598e-06, "loss": 0.0015, "step": 161 }, { "epoch": 0.56, "learning_rate": 8.607038863184957e-06, "loss": 0.0009, "step": 162 }, { "epoch": 0.56, "learning_rate": 8.496417741792912e-06, "loss": 0.0034, "step": 163 }, { "epoch": 0.56, "learning_rate": 8.385984556312282e-06, "loss": 0.0103, "step": 164 }, { "epoch": 0.57, "learning_rate": 8.275753110019367e-06, "loss": 0.0058, "step": 165 }, { "epoch": 0.57, "learning_rate": 8.165737180974678e-06, "loss": 0.0177, "step": 166 }, { "epoch": 0.57, "learning_rate": 8.05595052030076e-06, "loss": 0.002, "step": 167 }, { "epoch": 0.58, "learning_rate": 7.94640685046343e-06, "loss": 0.0269, "step": 168 }, { "epoch": 0.58, "learning_rate": 7.837119863556554e-06, "loss": 0.0048, "step": 169 }, { "epoch": 0.59, "learning_rate": 7.72810321959068e-06, "loss": 0.0125, "step": 170 }, { "epoch": 0.59, "learning_rate": 7.619370544785608e-06, "loss": 0.0032, "step": 171 }, { "epoch": 0.59, "learning_rate": 7.510935429867237e-06, "loss": 0.0079, "step": 172 }, { "epoch": 0.6, "learning_rate": 7.402811428368832e-06, "loss": 0.0154, "step": 173 }, { "epoch": 0.6, "learning_rate": 7.295012054936934e-06, "loss": 0.0036, "step": 174 }, { "epoch": 0.6, "learning_rate": 7.187550783642141e-06, "loss": 0.0063, "step": 175 }, { "epoch": 0.61, "learning_rate": 7.080441046294948e-06, "loss": 0.0119, "step": 176 }, { "epoch": 0.61, "learning_rate": 6.973696230766891e-06, "loss": 0.0015, "step": 177 }, { "epoch": 0.61, "learning_rate": 6.8673296793171555e-06, "loss": 0.003, "step": 178 }, { "epoch": 0.62, "learning_rate": 6.761354686924895e-06, "loss": 0.0065, "step": 179 }, { "epoch": 0.62, "learning_rate": 6.655784499627491e-06, "loss": 0.0034, "step": 180 }, { "epoch": 0.62, "learning_rate": 6.550632312864869e-06, "loss": 0.0031, "step": 181 }, { "epoch": 0.63, "learning_rate": 6.445911269830189e-06, "loss": 0.0023, "step": 182 }, { "epoch": 0.63, "learning_rate": 6.341634459827053e-06, "loss": 0.0009, "step": 183 }, { "epoch": 0.63, "learning_rate": 6.237814916633444e-06, "loss": 0.003, "step": 184 }, { "epoch": 0.64, "learning_rate": 6.134465616872598e-06, "loss": 0.0036, "step": 185 }, { "epoch": 0.64, "learning_rate": 6.0315994783910345e-06, "loss": 0.0005, "step": 186 }, { "epoch": 0.64, "learning_rate": 5.929229358643932e-06, "loss": 0.0005, "step": 187 }, { "epoch": 0.65, "learning_rate": 5.827368053088043e-06, "loss": 0.0036, "step": 188 }, { "epoch": 0.65, "learning_rate": 5.726028293582355e-06, "loss": 0.0098, "step": 189 }, { "epoch": 0.65, "learning_rate": 5.62522274679673e-06, "loss": 0.0005, "step": 190 }, { "epoch": 0.66, "learning_rate": 5.524964012628648e-06, "loss": 0.0207, "step": 191 }, { "epoch": 0.66, "learning_rate": 5.42526462262833e-06, "loss": 0.0015, "step": 192 }, { "epoch": 0.66, "learning_rate": 5.326137038432399e-06, "loss": 0.0024, "step": 193 }, { "epoch": 0.67, "learning_rate": 5.227593650206258e-06, "loss": 0.0059, "step": 194 }, { "epoch": 0.67, "learning_rate": 5.129646775095432e-06, "loss": 0.0188, "step": 195 }, { "epoch": 0.67, "learning_rate": 5.032308655686011e-06, "loss": 0.0007, "step": 196 }, { "epoch": 0.68, "learning_rate": 4.935591458474433e-06, "loss": 0.0056, "step": 197 }, { "epoch": 0.68, "learning_rate": 4.8395072723467585e-06, "loss": 0.019, "step": 198 }, { "epoch": 0.69, "learning_rate": 4.74406810706767e-06, "loss": 0.0081, "step": 199 }, { "epoch": 0.69, "learning_rate": 4.649285891779327e-06, "loss": 0.0308, "step": 200 }, { "epoch": 0.69, "learning_rate": 4.5551724735103285e-06, "loss": 0.0108, "step": 201 }, { "epoch": 0.7, "learning_rate": 4.461739615694929e-06, "loss": 0.0028, "step": 202 }, { "epoch": 0.7, "learning_rate": 4.368998996702694e-06, "loss": 0.0044, "step": 203 }, { "epoch": 0.7, "learning_rate": 4.276962208378811e-06, "loss": 0.0014, "step": 204 }, { "epoch": 0.71, "learning_rate": 4.185640754595183e-06, "loss": 0.004, "step": 205 }, { "epoch": 0.71, "learning_rate": 4.095046049812545e-06, "loss": 0.0007, "step": 206 }, { "epoch": 0.71, "learning_rate": 4.005189417653743e-06, "loss": 0.0236, "step": 207 }, { "epoch": 0.72, "learning_rate": 3.916082089488372e-06, "loss": 0.0132, "step": 208 }, { "epoch": 0.72, "learning_rate": 3.827735203028953e-06, "loss": 0.0031, "step": 209 }, { "epoch": 0.72, "learning_rate": 3.740159800938784e-06, "loss": 0.01, "step": 210 }, { "epoch": 0.73, "learning_rate": 3.6533668294517154e-06, "loss": 0.0053, "step": 211 }, { "epoch": 0.73, "learning_rate": 3.5673671370039464e-06, "loss": 0.0009, "step": 212 }, { "epoch": 0.73, "learning_rate": 3.482171472878062e-06, "loss": 0.0005, "step": 213 }, { "epoch": 0.74, "learning_rate": 3.39779048585945e-06, "loss": 0.0075, "step": 214 }, { "epoch": 0.74, "learning_rate": 3.314234722905302e-06, "loss": 0.0083, "step": 215 }, { "epoch": 0.74, "learning_rate": 3.2315146278263053e-06, "loss": 0.0097, "step": 216 }, { "epoch": 0.75, "learning_rate": 3.149640539981267e-06, "loss": 0.0015, "step": 217 }, { "epoch": 0.75, "learning_rate": 3.0686226929847617e-06, "loss": 0.0013, "step": 218 }, { "epoch": 0.75, "learning_rate": 2.9884712134280324e-06, "loss": 0.0046, "step": 219 }, { "epoch": 0.76, "learning_rate": 2.909196119613218e-06, "loss": 0.0017, "step": 220 }, { "epoch": 0.76, "learning_rate": 2.8308073203011667e-06, "loss": 0.0015, "step": 221 }, { "epoch": 0.76, "learning_rate": 2.753314613472906e-06, "loss": 0.0005, "step": 222 }, { "epoch": 0.77, "learning_rate": 2.6767276851049818e-06, "loss": 0.0006, "step": 223 }, { "epoch": 0.77, "learning_rate": 2.6010561079587817e-06, "loss": 0.0037, "step": 224 }, { "epoch": 0.77, "learning_rate": 2.5263093403840145e-06, "loss": 0.0051, "step": 225 }, { "epoch": 0.78, "learning_rate": 2.452496725136503e-06, "loss": 0.0045, "step": 226 }, { "epoch": 0.78, "learning_rate": 2.3796274882103964e-06, "loss": 0.0006, "step": 227 }, { "epoch": 0.78, "learning_rate": 2.3077107376850005e-06, "loss": 0.0014, "step": 228 }, { "epoch": 0.79, "learning_rate": 2.2367554625863496e-06, "loss": 0.0024, "step": 229 }, { "epoch": 0.79, "learning_rate": 2.1667705317636333e-06, "loss": 0.0036, "step": 230 }, { "epoch": 0.8, "learning_rate": 2.0977646927806682e-06, "loss": 0.0096, "step": 231 }, { "epoch": 0.8, "learning_rate": 2.029746570822524e-06, "loss": 0.0004, "step": 232 }, { "epoch": 0.8, "learning_rate": 1.9627246676174363e-06, "loss": 0.0075, "step": 233 }, { "epoch": 0.81, "learning_rate": 1.896707360374167e-06, "loss": 0.0008, "step": 234 }, { "epoch": 0.81, "learning_rate": 1.8317029007349086e-06, "loss": 0.0009, "step": 235 }, { "epoch": 0.81, "learning_rate": 1.7677194137439036e-06, "loss": 0.0005, "step": 236 }, { "epoch": 0.82, "learning_rate": 1.7047648968318697e-06, "loss": 0.0123, "step": 237 }, { "epoch": 0.82, "learning_rate": 1.642847218816398e-06, "loss": 0.0054, "step": 238 }, { "epoch": 0.82, "learning_rate": 1.5819741189183902e-06, "loss": 0.0005, "step": 239 }, { "epoch": 0.83, "learning_rate": 1.522153205794742e-06, "loss": 0.0075, "step": 240 }, { "epoch": 0.83, "learning_rate": 1.4633919565873033e-06, "loss": 0.011, "step": 241 }, { "epoch": 0.83, "learning_rate": 1.4056977159883011e-06, "loss": 0.0016, "step": 242 }, { "epoch": 0.84, "learning_rate": 1.3490776953223107e-06, "loss": 0.0046, "step": 243 }, { "epoch": 0.84, "learning_rate": 1.2935389716448976e-06, "loss": 0.0002, "step": 244 }, { "epoch": 0.84, "learning_rate": 1.23908848685804e-06, "loss": 0.0036, "step": 245 }, { "epoch": 0.85, "learning_rate": 1.1857330468424466e-06, "loss": 0.0028, "step": 246 }, { "epoch": 0.85, "learning_rate": 1.1334793206068739e-06, "loss": 0.0011, "step": 247 }, { "epoch": 0.85, "learning_rate": 1.082333839454559e-06, "loss": 0.0013, "step": 248 }, { "epoch": 0.86, "learning_rate": 1.0323029961668463e-06, "loss": 0.0005, "step": 249 }, { "epoch": 0.86, "learning_rate": 9.833930442041506e-07, "loss": 0.0089, "step": 250 }, { "epoch": 0.86, "learning_rate": 9.356100969243231e-07, "loss": 0.0011, "step": 251 }, { "epoch": 0.87, "learning_rate": 8.889601268185233e-07, "loss": 0.0014, "step": 252 }, { "epoch": 0.87, "learning_rate": 8.434489647647093e-07, "loss": 0.0003, "step": 253 }, { "epoch": 0.87, "learning_rate": 7.990822992988267e-07, "loss": 0.0011, "step": 254 }, { "epoch": 0.88, "learning_rate": 7.558656759037796e-07, "loss": 0.0039, "step": 255 }, { "epoch": 0.88, "learning_rate": 7.13804496316296e-07, "loss": 0.0037, "step": 256 }, { "epoch": 0.88, "learning_rate": 6.729040178517454e-07, "loss": 0.0031, "step": 257 }, { "epoch": 0.89, "learning_rate": 6.331693527470306e-07, "loss": 0.0006, "step": 258 }, { "epoch": 0.89, "learning_rate": 5.946054675215785e-07, "loss": 0.0036, "step": 259 }, { "epoch": 0.9, "learning_rate": 5.572171823565797e-07, "loss": 0.0011, "step": 260 }, { "epoch": 0.9, "learning_rate": 5.210091704924947e-07, "loss": 0.001, "step": 261 }, { "epoch": 0.9, "learning_rate": 4.859859576449444e-07, "loss": 0.0005, "step": 262 }, { "epoch": 0.91, "learning_rate": 4.5215192143902577e-07, "loss": 0.0012, "step": 263 }, { "epoch": 0.91, "learning_rate": 4.1951129086214015e-07, "loss": 0.0016, "step": 264 }, { "epoch": 0.91, "learning_rate": 3.8806814573541185e-07, "loss": 0.0038, "step": 265 }, { "epoch": 0.92, "learning_rate": 3.578264162037348e-07, "loss": 0.0019, "step": 266 }, { "epoch": 0.92, "learning_rate": 3.2878988224454346e-07, "loss": 0.0059, "step": 267 }, { "epoch": 0.92, "learning_rate": 3.0096217319533386e-07, "loss": 0.0044, "step": 268 }, { "epoch": 0.93, "learning_rate": 2.7434676730003886e-07, "loss": 0.0026, "step": 269 }, { "epoch": 0.93, "learning_rate": 2.489469912742637e-07, "loss": 0.0052, "step": 270 }, { "epoch": 0.93, "learning_rate": 2.2476601988947965e-07, "loss": 0.0029, "step": 271 }, { "epoch": 0.94, "learning_rate": 2.0180687557619816e-07, "loss": 0.0003, "step": 272 }, { "epoch": 0.94, "learning_rate": 1.800724280461963e-07, "loss": 0.0038, "step": 273 }, { "epoch": 0.94, "learning_rate": 1.5956539393382043e-07, "loss": 0.0036, "step": 274 }, { "epoch": 0.95, "learning_rate": 1.4028833645643113e-07, "loss": 0.0029, "step": 275 }, { "epoch": 0.95, "learning_rate": 1.2224366509401732e-07, "loss": 0.0004, "step": 276 }, { "epoch": 0.95, "learning_rate": 1.0543363528803696e-07, "loss": 0.0007, "step": 277 }, { "epoch": 0.96, "learning_rate": 8.986034815950173e-08, "loss": 0.0004, "step": 278 }, { "epoch": 0.96, "learning_rate": 7.55257502463469e-08, "loss": 0.0021, "step": 279 }, { "epoch": 0.96, "learning_rate": 6.243163326014268e-08, "loss": 0.0017, "step": 280 }, { "epoch": 0.97, "learning_rate": 5.057963386213116e-08, "loss": 0.0013, "step": 281 }, { "epoch": 0.97, "learning_rate": 3.9971233458665495e-08, "loss": 0.0029, "step": 282 }, { "epoch": 0.97, "learning_rate": 3.0607758016043546e-08, "loss": 0.0016, "step": 283 }, { "epoch": 0.98, "learning_rate": 2.2490377894768266e-08, "loss": 0.0022, "step": 284 }, { "epoch": 0.98, "learning_rate": 1.562010770326916e-08, "loss": 0.0044, "step": 285 }, { "epoch": 0.98, "learning_rate": 9.99780617107815e-09, "loss": 0.0093, "step": 286 }, { "epoch": 0.99, "learning_rate": 5.6241760414987856e-09, "loss": 0.0016, "step": 287 }, { "epoch": 0.99, "learning_rate": 2.4997639837687217e-09, "loss": 0.0003, "step": 288 }, { "epoch": 0.99, "learning_rate": 6.24960524725493e-10, "loss": 0.0004, "step": 289 }, { "epoch": 1.0, "learning_rate": 0.0, "loss": 0.0017, "step": 290 }, { "epoch": 1.0, "step": 290, "total_flos": 135956826390528.0, "train_loss": 0.0826369430505518, "train_runtime": 7100.2656, "train_samples_per_second": 5.237, "train_steps_per_second": 0.041 } ], "logging_steps": 1.0, "max_steps": 290, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50000, "total_flos": 135956826390528.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }