{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "global_step": 201, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 5.714285714285714e-05, "loss": 1.5581, "step": 1 }, { "epoch": 0.03, "learning_rate": 0.00011428571428571428, "loss": 1.5618, "step": 2 }, { "epoch": 0.04, "learning_rate": 0.00017142857142857143, "loss": 1.4337, "step": 3 }, { "epoch": 0.06, "learning_rate": 0.00022857142857142857, "loss": 1.2552, "step": 4 }, { "epoch": 0.07, "learning_rate": 0.00028571428571428574, "loss": 1.2225, "step": 5 }, { "epoch": 0.09, "learning_rate": 0.00034285714285714285, "loss": 1.1791, "step": 6 }, { "epoch": 0.1, "learning_rate": 0.0004, "loss": 1.1597, "step": 7 }, { "epoch": 0.12, "learning_rate": 0.00039997377673312647, "loss": 1.1362, "step": 8 }, { "epoch": 0.13, "learning_rate": 0.00039989511380910303, "loss": 1.1049, "step": 9 }, { "epoch": 0.15, "learning_rate": 0.0003997640318559182, "loss": 1.0949, "step": 10 }, { "epoch": 0.16, "learning_rate": 0.00039958056524754237, "loss": 1.093, "step": 11 }, { "epoch": 0.18, "learning_rate": 0.0003993447620949139, "loss": 1.0893, "step": 12 }, { "epoch": 0.19, "learning_rate": 0.0003990566842333228, "loss": 1.1027, "step": 13 }, { "epoch": 0.21, "learning_rate": 0.00039871640720619554, "loss": 1.0704, "step": 14 }, { "epoch": 0.22, "learning_rate": 0.0003983240202452851, "loss": 1.0495, "step": 15 }, { "epoch": 0.24, "learning_rate": 0.00039787962624727123, "loss": 1.0833, "step": 16 }, { "epoch": 0.25, "learning_rate": 0.00039738334174677813, "loss": 1.0664, "step": 17 }, { "epoch": 0.27, "learning_rate": 0.0003968352968858149, "loss": 1.0566, "step": 18 }, { "epoch": 0.28, "learning_rate": 0.0003962356353796478, "loss": 1.0466, "step": 19 }, { "epoch": 0.3, "learning_rate": 0.00039558451447911414, "loss": 1.0814, "step": 20 }, { "epoch": 0.31, "learning_rate": 0.00039488210492938527, "loss": 1.0296, "step": 21 }, { "epoch": 0.33, "learning_rate": 0.00039412859092519185, "loss": 1.0381, "step": 22 }, { "epoch": 0.34, "learning_rate": 0.00039332417006252224, "loss": 1.0213, "step": 23 }, { "epoch": 0.36, "learning_rate": 0.0003924690532868061, "loss": 1.0226, "step": 24 }, { "epoch": 0.37, "learning_rate": 0.00039156346483759744, "loss": 1.061, "step": 25 }, { "epoch": 0.39, "learning_rate": 0.0003906076421897722, "loss": 1.059, "step": 26 }, { "epoch": 0.4, "learning_rate": 0.0003896018359912541, "loss": 1.0453, "step": 27 }, { "epoch": 0.42, "learning_rate": 0.0003885463099972869, "loss": 1.0195, "step": 28 }, { "epoch": 0.43, "learning_rate": 0.00038744134100126876, "loss": 1.0416, "step": 29 }, { "epoch": 0.45, "learning_rate": 0.0003862872187621685, "loss": 1.0315, "step": 30 }, { "epoch": 0.46, "learning_rate": 0.0003850842459285409, "loss": 1.0249, "step": 31 }, { "epoch": 0.48, "learning_rate": 0.00038383273795916244, "loss": 1.0359, "step": 32 }, { "epoch": 0.49, "learning_rate": 0.00038253302304030806, "loss": 1.0188, "step": 33 }, { "epoch": 0.51, "learning_rate": 0.0003811854419996894, "loss": 1.0426, "step": 34 }, { "epoch": 0.52, "learning_rate": 0.0003797903482170791, "loss": 1.0243, "step": 35 }, { "epoch": 0.54, "learning_rate": 0.0003783481075316429, "loss": 1.0176, "step": 36 }, { "epoch": 0.55, "learning_rate": 0.00037685909814600473, "loss": 1.0412, "step": 37 }, { "epoch": 0.57, "learning_rate": 0.0003753237105270696, "loss": 1.0295, "step": 38 }, { "epoch": 0.58, "learning_rate": 0.00037374234730363023, "loss": 1.0164, "step": 39 }, { "epoch": 0.6, "learning_rate": 0.00037211542316078507, "loss": 1.0196, "step": 40 }, { "epoch": 0.61, "learning_rate": 0.0003704433647311939, "loss": 1.0196, "step": 41 }, { "epoch": 0.63, "learning_rate": 0.00036872661048320093, "loss": 1.0177, "step": 42 }, { "epoch": 0.64, "learning_rate": 0.00036696561060585426, "loss": 1.0258, "step": 43 }, { "epoch": 0.66, "learning_rate": 0.0003651608268908513, "loss": 1.0212, "step": 44 }, { "epoch": 0.67, "learning_rate": 0.0003633127326114422, "loss": 1.0448, "step": 45 }, { "epoch": 0.69, "learning_rate": 0.00036142181239832185, "loss": 0.992, "step": 46 }, { "epoch": 0.7, "learning_rate": 0.00035948856211254416, "loss": 0.9978, "step": 47 }, { "epoch": 0.72, "learning_rate": 0.00035751348871549093, "loss": 1.0349, "step": 48 }, { "epoch": 0.73, "learning_rate": 0.0003554971101359299, "loss": 1.0298, "step": 49 }, { "epoch": 0.75, "learning_rate": 0.00035343995513419725, "loss": 1.0317, "step": 50 }, { "epoch": 0.76, "learning_rate": 0.00035134256316353906, "loss": 1.0056, "step": 51 }, { "epoch": 0.78, "learning_rate": 0.0003492054842286492, "loss": 1.0154, "step": 52 }, { "epoch": 0.79, "learning_rate": 0.0003470292787414401, "loss": 1.0194, "step": 53 }, { "epoch": 0.81, "learning_rate": 0.00034481451737408437, "loss": 0.9889, "step": 54 }, { "epoch": 0.82, "learning_rate": 0.0003425617809093659, "loss": 1.0256, "step": 55 }, { "epoch": 0.84, "learning_rate": 0.00034027166008837985, "loss": 0.9959, "step": 56 }, { "epoch": 0.85, "learning_rate": 0.0003379447554556209, "loss": 1.026, "step": 57 }, { "epoch": 0.87, "learning_rate": 0.00033558167720150066, "loss": 1.0113, "step": 58 }, { "epoch": 0.88, "learning_rate": 0.00033318304500233625, "loss": 1.0133, "step": 59 }, { "epoch": 0.9, "learning_rate": 0.00033074948785785055, "loss": 1.0133, "step": 60 }, { "epoch": 0.91, "learning_rate": 0.000328281643926228, "loss": 1.0156, "step": 61 }, { "epoch": 0.93, "learning_rate": 0.0003257801603567689, "loss": 1.0207, "step": 62 }, { "epoch": 0.94, "learning_rate": 0.0003232456931201855, "loss": 0.9841, "step": 63 }, { "epoch": 0.96, "learning_rate": 0.00032067890683658496, "loss": 1.0047, "step": 64 }, { "epoch": 0.97, "learning_rate": 0.00031808047460118457, "loss": 1.0015, "step": 65 }, { "epoch": 0.99, "learning_rate": 0.00031545107780780394, "loss": 0.9785, "step": 66 }, { "epoch": 1.0, "learning_rate": 0.0003127914059701813, "loss": 1.0009, "step": 67 }, { "epoch": 1.01, "learning_rate": 0.0003101021565411607, "loss": 0.8419, "step": 68 }, { "epoch": 1.03, "learning_rate": 0.0003073840347297968, "loss": 0.8403, "step": 69 }, { "epoch": 1.04, "learning_rate": 0.000304637753316426, "loss": 0.8149, "step": 70 }, { "epoch": 1.06, "learning_rate": 0.00030186403246575263, "loss": 0.8348, "step": 71 }, { "epoch": 1.07, "learning_rate": 0.00029906359953799756, "loss": 0.8394, "step": 72 }, { "epoch": 1.09, "learning_rate": 0.00029623718889816103, "loss": 0.8091, "step": 73 }, { "epoch": 1.1, "learning_rate": 0.0002933855417234481, "loss": 0.8093, "step": 74 }, { "epoch": 1.12, "learning_rate": 0.00029050940580890785, "loss": 0.7727, "step": 75 }, { "epoch": 1.13, "learning_rate": 0.0002876095353713365, "loss": 0.8193, "step": 76 }, { "epoch": 1.15, "learning_rate": 0.0002846866908514981, "loss": 0.8253, "step": 77 }, { "epoch": 1.16, "learning_rate": 0.0002817416387147113, "loss": 0.8263, "step": 78 }, { "epoch": 1.18, "learning_rate": 0.00027877515124985746, "loss": 0.8086, "step": 79 }, { "epoch": 1.19, "learning_rate": 0.00027578800636686137, "loss": 0.8282, "step": 80 }, { "epoch": 1.21, "learning_rate": 0.0002727809873926975, "loss": 0.7946, "step": 81 }, { "epoch": 1.22, "learning_rate": 0.00026975488286597643, "loss": 0.7947, "step": 82 }, { "epoch": 1.24, "learning_rate": 0.00026671048633016415, "loss": 0.8287, "step": 83 }, { "epoch": 1.25, "learning_rate": 0.0002636485961254888, "loss": 0.8201, "step": 84 }, { "epoch": 1.27, "learning_rate": 0.0002605700151795901, "loss": 0.7945, "step": 85 }, { "epoch": 1.28, "learning_rate": 0.00025747555079696566, "loss": 0.7827, "step": 86 }, { "epoch": 1.3, "learning_rate": 0.0002543660144472686, "loss": 0.8353, "step": 87 }, { "epoch": 1.31, "learning_rate": 0.00025124222155251444, "loss": 0.8163, "step": 88 }, { "epoch": 1.33, "learning_rate": 0.00024810499127325073, "loss": 0.7853, "step": 89 }, { "epoch": 1.34, "learning_rate": 0.00024495514629374593, "loss": 0.7858, "step": 90 }, { "epoch": 1.36, "learning_rate": 0.00024179351260625507, "loss": 0.7868, "step": 91 }, { "epoch": 1.37, "learning_rate": 0.00023862091929441762, "loss": 0.8161, "step": 92 }, { "epoch": 1.39, "learning_rate": 0.0002354381983158446, "loss": 0.7799, "step": 93 }, { "epoch": 1.4, "learning_rate": 0.00023224618428395198, "loss": 0.8114, "step": 94 }, { "epoch": 1.42, "learning_rate": 0.00022904571424909808, "loss": 0.8078, "step": 95 }, { "epoch": 1.43, "learning_rate": 0.0002258376274790813, "loss": 0.8077, "step": 96 }, { "epoch": 1.45, "learning_rate": 0.0002226227652390569, "loss": 0.7888, "step": 97 }, { "epoch": 1.46, "learning_rate": 0.00021940197057092962, "loss": 0.8197, "step": 98 }, { "epoch": 1.48, "learning_rate": 0.00021617608807228086, "loss": 0.8234, "step": 99 }, { "epoch": 1.49, "learning_rate": 0.00021294596367488718, "loss": 0.8038, "step": 100 }, { "epoch": 1.51, "learning_rate": 0.00020971244442288968, "loss": 0.8075, "step": 101 }, { "epoch": 1.52, "learning_rate": 0.00020647637825067122, "loss": 0.7947, "step": 102 }, { "epoch": 1.54, "learning_rate": 0.00020323861376050033, "loss": 0.7924, "step": 103 }, { "epoch": 1.55, "learning_rate": 0.0002, "loss": 0.7874, "step": 104 }, { "epoch": 1.57, "learning_rate": 0.00019676138623949972, "loss": 0.8004, "step": 105 }, { "epoch": 1.58, "learning_rate": 0.00019352362174932889, "loss": 0.7932, "step": 106 }, { "epoch": 1.6, "learning_rate": 0.00019028755557711042, "loss": 0.7869, "step": 107 }, { "epoch": 1.61, "learning_rate": 0.00018705403632511287, "loss": 0.7622, "step": 108 }, { "epoch": 1.63, "learning_rate": 0.0001838239119277192, "loss": 0.7618, "step": 109 }, { "epoch": 1.64, "learning_rate": 0.0001805980294290704, "loss": 0.7826, "step": 110 }, { "epoch": 1.66, "learning_rate": 0.00017737723476094315, "loss": 0.7971, "step": 111 }, { "epoch": 1.67, "learning_rate": 0.0001741623725209188, "loss": 0.8068, "step": 112 }, { "epoch": 1.69, "learning_rate": 0.000170954285750902, "loss": 0.7831, "step": 113 }, { "epoch": 1.7, "learning_rate": 0.00016775381571604804, "loss": 0.7988, "step": 114 }, { "epoch": 1.72, "learning_rate": 0.00016456180168415547, "loss": 0.7912, "step": 115 }, { "epoch": 1.73, "learning_rate": 0.00016137908070558242, "loss": 0.7737, "step": 116 }, { "epoch": 1.75, "learning_rate": 0.000158206487393745, "loss": 0.8103, "step": 117 }, { "epoch": 1.76, "learning_rate": 0.0001550448537062542, "loss": 0.7843, "step": 118 }, { "epoch": 1.78, "learning_rate": 0.00015189500872674934, "loss": 0.7957, "step": 119 }, { "epoch": 1.79, "learning_rate": 0.00014875777844748552, "loss": 0.7912, "step": 120 }, { "epoch": 1.81, "learning_rate": 0.00014563398555273143, "loss": 0.79, "step": 121 }, { "epoch": 1.82, "learning_rate": 0.00014252444920303438, "loss": 0.8032, "step": 122 }, { "epoch": 1.84, "learning_rate": 0.0001394299848204099, "loss": 0.8148, "step": 123 }, { "epoch": 1.85, "learning_rate": 0.00013635140387451128, "loss": 0.7912, "step": 124 }, { "epoch": 1.87, "learning_rate": 0.00013328951366983592, "loss": 0.7999, "step": 125 }, { "epoch": 1.88, "learning_rate": 0.00013024511713402353, "loss": 0.7916, "step": 126 }, { "epoch": 1.9, "learning_rate": 0.00012721901260730252, "loss": 0.7848, "step": 127 }, { "epoch": 1.91, "learning_rate": 0.00012421199363313865, "loss": 0.7869, "step": 128 }, { "epoch": 1.93, "learning_rate": 0.00012122484875014261, "loss": 0.7495, "step": 129 }, { "epoch": 1.94, "learning_rate": 0.00011825836128528883, "loss": 0.761, "step": 130 }, { "epoch": 1.96, "learning_rate": 0.00011531330914850204, "loss": 0.7943, "step": 131 }, { "epoch": 1.97, "learning_rate": 0.00011239046462866353, "loss": 0.8015, "step": 132 }, { "epoch": 1.99, "learning_rate": 0.00010949059419109224, "loss": 0.7766, "step": 133 }, { "epoch": 2.0, "learning_rate": 0.00010661445827655187, "loss": 0.7663, "step": 134 }, { "epoch": 2.01, "learning_rate": 0.00010376281110183899, "loss": 0.5708, "step": 135 }, { "epoch": 2.03, "learning_rate": 0.00010093640046200257, "loss": 0.5661, "step": 136 }, { "epoch": 2.04, "learning_rate": 9.813596753424747e-05, "loss": 0.5428, "step": 137 }, { "epoch": 2.06, "learning_rate": 9.536224668357398e-05, "loss": 0.5465, "step": 138 }, { "epoch": 2.07, "learning_rate": 9.261596527020324e-05, "loss": 0.5272, "step": 139 }, { "epoch": 2.09, "learning_rate": 8.98978434588393e-05, "loss": 0.5375, "step": 140 }, { "epoch": 2.1, "learning_rate": 8.720859402981868e-05, "loss": 0.5253, "step": 141 }, { "epoch": 2.12, "learning_rate": 8.454892219219617e-05, "loss": 0.5265, "step": 142 }, { "epoch": 2.13, "learning_rate": 8.191952539881553e-05, "loss": 0.4915, "step": 143 }, { "epoch": 2.15, "learning_rate": 7.932109316341507e-05, "loss": 0.519, "step": 144 }, { "epoch": 2.16, "learning_rate": 7.675430687981453e-05, "loss": 0.5061, "step": 145 }, { "epoch": 2.18, "learning_rate": 7.42198396432311e-05, "loss": 0.5109, "step": 146 }, { "epoch": 2.19, "learning_rate": 7.171835607377206e-05, "loss": 0.5141, "step": 147 }, { "epoch": 2.21, "learning_rate": 6.925051214214955e-05, "loss": 0.4993, "step": 148 }, { "epoch": 2.22, "learning_rate": 6.681695499766383e-05, "loss": 0.4964, "step": 149 }, { "epoch": 2.24, "learning_rate": 6.441832279849935e-05, "loss": 0.4871, "step": 150 }, { "epoch": 2.25, "learning_rate": 6.205524454437915e-05, "loss": 0.4901, "step": 151 }, { "epoch": 2.27, "learning_rate": 5.972833991162017e-05, "loss": 0.4883, "step": 152 }, { "epoch": 2.28, "learning_rate": 5.7438219090634206e-05, "loss": 0.4976, "step": 153 }, { "epoch": 2.3, "learning_rate": 5.518548262591574e-05, "loss": 0.5058, "step": 154 }, { "epoch": 2.31, "learning_rate": 5.297072125855997e-05, "loss": 0.4878, "step": 155 }, { "epoch": 2.33, "learning_rate": 5.0794515771350795e-05, "loss": 0.4967, "step": 156 }, { "epoch": 2.34, "learning_rate": 4.865743683646094e-05, "loss": 0.478, "step": 157 }, { "epoch": 2.36, "learning_rate": 4.6560044865802766e-05, "loss": 0.5068, "step": 158 }, { "epoch": 2.37, "learning_rate": 4.450288986407019e-05, "loss": 0.4856, "step": 159 }, { "epoch": 2.39, "learning_rate": 4.2486511284509155e-05, "loss": 0.4929, "step": 160 }, { "epoch": 2.4, "learning_rate": 4.051143788745588e-05, "loss": 0.4845, "step": 161 }, { "epoch": 2.42, "learning_rate": 3.857818760167813e-05, "loss": 0.5001, "step": 162 }, { "epoch": 2.43, "learning_rate": 3.668726738855779e-05, "loss": 0.4841, "step": 163 }, { "epoch": 2.45, "learning_rate": 3.483917310914872e-05, "loss": 0.4843, "step": 164 }, { "epoch": 2.46, "learning_rate": 3.30343893941458e-05, "loss": 0.4976, "step": 165 }, { "epoch": 2.48, "learning_rate": 3.127338951679912e-05, "loss": 0.4971, "step": 166 }, { "epoch": 2.49, "learning_rate": 2.9556635268806167e-05, "loss": 0.5043, "step": 167 }, { "epoch": 2.51, "learning_rate": 2.788457683921495e-05, "loss": 0.4812, "step": 168 }, { "epoch": 2.52, "learning_rate": 2.6257652696369774e-05, "loss": 0.4702, "step": 169 }, { "epoch": 2.54, "learning_rate": 2.467628947293048e-05, "loss": 0.5025, "step": 170 }, { "epoch": 2.55, "learning_rate": 2.314090185399531e-05, "loss": 0.4985, "step": 171 }, { "epoch": 2.57, "learning_rate": 2.1651892468357148e-05, "loss": 0.5118, "step": 172 }, { "epoch": 2.58, "learning_rate": 2.020965178292096e-05, "loss": 0.489, "step": 173 }, { "epoch": 2.6, "learning_rate": 1.8814558000310623e-05, "loss": 0.4698, "step": 174 }, { "epoch": 2.61, "learning_rate": 1.7466976959691993e-05, "loss": 0.4908, "step": 175 }, { "epoch": 2.63, "learning_rate": 1.6167262040837582e-05, "loss": 0.4752, "step": 176 }, { "epoch": 2.64, "learning_rate": 1.4915754071459175e-05, "loss": 0.4993, "step": 177 }, { "epoch": 2.66, "learning_rate": 1.371278123783155e-05, "loss": 0.4926, "step": 178 }, { "epoch": 2.67, "learning_rate": 1.2558658998731299e-05, "loss": 0.4845, "step": 179 }, { "epoch": 2.69, "learning_rate": 1.1453690002713146e-05, "loss": 0.4724, "step": 180 }, { "epoch": 2.7, "learning_rate": 1.0398164008745915e-05, "loss": 0.4993, "step": 181 }, { "epoch": 2.72, "learning_rate": 9.392357810227825e-06, "loss": 0.5016, "step": 182 }, { "epoch": 2.73, "learning_rate": 8.436535162402593e-06, "loss": 0.4747, "step": 183 }, { "epoch": 2.75, "learning_rate": 7.530946713193965e-06, "loss": 0.4979, "step": 184 }, { "epoch": 2.76, "learning_rate": 6.675829937477773e-06, "loss": 0.4925, "step": 185 }, { "epoch": 2.78, "learning_rate": 5.871409074808165e-06, "loss": 0.4798, "step": 186 }, { "epoch": 2.79, "learning_rate": 5.1178950706147975e-06, "loss": 0.4896, "step": 187 }, { "epoch": 2.81, "learning_rate": 4.415485520885887e-06, "loss": 0.4885, "step": 188 }, { "epoch": 2.82, "learning_rate": 3.76436462035219e-06, "loss": 0.4882, "step": 189 }, { "epoch": 2.84, "learning_rate": 3.1647031141851525e-06, "loss": 0.4781, "step": 190 }, { "epoch": 2.85, "learning_rate": 2.61665825322186e-06, "loss": 0.4995, "step": 191 }, { "epoch": 2.87, "learning_rate": 2.120373752728799e-06, "loss": 0.4884, "step": 192 }, { "epoch": 2.88, "learning_rate": 1.6759797547149847e-06, "loss": 0.493, "step": 193 }, { "epoch": 2.9, "learning_rate": 1.283592793804478e-06, "loss": 0.4934, "step": 194 }, { "epoch": 2.91, "learning_rate": 9.433157666772107e-07, "loss": 0.4861, "step": 195 }, { "epoch": 2.93, "learning_rate": 6.552379050861257e-07, "loss": 0.4713, "step": 196 }, { "epoch": 2.94, "learning_rate": 4.194347524576703e-07, "loss": 0.4767, "step": 197 }, { "epoch": 2.96, "learning_rate": 2.359681440818662e-07, "loss": 0.4881, "step": 198 }, { "epoch": 2.97, "learning_rate": 1.0488619089701335e-07, "loss": 0.4859, "step": 199 }, { "epoch": 2.99, "learning_rate": 2.6223266873559582e-08, "loss": 0.4896, "step": 200 }, { "epoch": 3.0, "learning_rate": 0.0, "loss": 0.4539, "step": 201 }, { "epoch": 3.0, "step": 201, "total_flos": 483875872178176.0, "train_loss": 0.7875154921367987, "train_runtime": 6065.9891, "train_samples_per_second": 4.237, "train_steps_per_second": 0.033 } ], "max_steps": 201, "num_train_epochs": 3, "total_flos": 483875872178176.0, "trial_name": null, "trial_params": null }