|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 100.0, |
|
"eval_steps": 500, |
|
"global_step": 182300, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.27427317608337903, |
|
"grad_norm": 2.1429903507232666, |
|
"learning_rate": 4.986286341195831e-05, |
|
"loss": 7.2532, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5485463521667581, |
|
"grad_norm": 2.0887398719787598, |
|
"learning_rate": 4.972572682391663e-05, |
|
"loss": 6.4724, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.8228195282501372, |
|
"grad_norm": 2.1614580154418945, |
|
"learning_rate": 4.958859023587493e-05, |
|
"loss": 6.0973, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.0970927043335161, |
|
"grad_norm": 2.202652931213379, |
|
"learning_rate": 4.9451453647833245e-05, |
|
"loss": 5.7872, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.3713658804168953, |
|
"grad_norm": 2.147414445877075, |
|
"learning_rate": 4.931431705979155e-05, |
|
"loss": 5.5399, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.6456390565002743, |
|
"grad_norm": 2.1624412536621094, |
|
"learning_rate": 4.917718047174987e-05, |
|
"loss": 5.3515, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.9199122325836533, |
|
"grad_norm": 2.155363082885742, |
|
"learning_rate": 4.9040043883708175e-05, |
|
"loss": 5.1762, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.1941854086670323, |
|
"grad_norm": 2.2420756816864014, |
|
"learning_rate": 4.890290729566648e-05, |
|
"loss": 5.0364, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.4684585847504112, |
|
"grad_norm": 2.4155259132385254, |
|
"learning_rate": 4.87657707076248e-05, |
|
"loss": 4.9178, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.7427317608337907, |
|
"grad_norm": 2.2149574756622314, |
|
"learning_rate": 4.8628634119583105e-05, |
|
"loss": 4.8449, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 3.0170049369171696, |
|
"grad_norm": 2.25925350189209, |
|
"learning_rate": 4.849149753154142e-05, |
|
"loss": 4.762, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 3.2912781130005486, |
|
"grad_norm": 2.315990686416626, |
|
"learning_rate": 4.835436094349973e-05, |
|
"loss": 4.6713, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 3.5655512890839276, |
|
"grad_norm": 2.425288677215576, |
|
"learning_rate": 4.821722435545804e-05, |
|
"loss": 4.6267, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 3.8398244651673066, |
|
"grad_norm": 2.3451356887817383, |
|
"learning_rate": 4.808008776741635e-05, |
|
"loss": 4.579, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 4.1140976412506856, |
|
"grad_norm": 2.306058645248413, |
|
"learning_rate": 4.794295117937466e-05, |
|
"loss": 4.5148, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 4.3883708173340645, |
|
"grad_norm": 2.3386404514312744, |
|
"learning_rate": 4.780581459133297e-05, |
|
"loss": 4.4659, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 4.6626439934174435, |
|
"grad_norm": 2.3117551803588867, |
|
"learning_rate": 4.766867800329128e-05, |
|
"loss": 4.4327, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 4.9369171695008225, |
|
"grad_norm": 2.36466908454895, |
|
"learning_rate": 4.753154141524959e-05, |
|
"loss": 4.3947, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 5.2111903455842015, |
|
"grad_norm": 2.348733901977539, |
|
"learning_rate": 4.73944048272079e-05, |
|
"loss": 4.3441, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 5.485463521667581, |
|
"grad_norm": 2.9133706092834473, |
|
"learning_rate": 4.7257268239166215e-05, |
|
"loss": 4.3025, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 5.75973669775096, |
|
"grad_norm": 2.6369545459747314, |
|
"learning_rate": 4.712013165112452e-05, |
|
"loss": 4.2785, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 6.034009873834339, |
|
"grad_norm": 2.7040719985961914, |
|
"learning_rate": 4.698299506308283e-05, |
|
"loss": 4.2431, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 6.308283049917718, |
|
"grad_norm": 2.7137389183044434, |
|
"learning_rate": 4.6845858475041146e-05, |
|
"loss": 4.1796, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 6.582556226001097, |
|
"grad_norm": 2.963534355163574, |
|
"learning_rate": 4.670872188699945e-05, |
|
"loss": 4.1432, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 6.856829402084476, |
|
"grad_norm": 2.8243420124053955, |
|
"learning_rate": 4.6571585298957763e-05, |
|
"loss": 4.1252, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 7.131102578167855, |
|
"grad_norm": 3.03757905960083, |
|
"learning_rate": 4.6434448710916076e-05, |
|
"loss": 4.0713, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 7.405375754251234, |
|
"grad_norm": 3.007908821105957, |
|
"learning_rate": 4.629731212287439e-05, |
|
"loss": 4.0248, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 7.679648930334613, |
|
"grad_norm": 2.846788167953491, |
|
"learning_rate": 4.6160175534832694e-05, |
|
"loss": 4.0099, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 7.953922106417992, |
|
"grad_norm": 2.961183786392212, |
|
"learning_rate": 4.6023038946791006e-05, |
|
"loss": 3.9728, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 8.228195282501371, |
|
"grad_norm": 3.066025733947754, |
|
"learning_rate": 4.588590235874932e-05, |
|
"loss": 3.9118, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 8.502468458584751, |
|
"grad_norm": 2.9394822120666504, |
|
"learning_rate": 4.5748765770707624e-05, |
|
"loss": 3.8828, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 8.776741634668129, |
|
"grad_norm": 3.012153387069702, |
|
"learning_rate": 4.5611629182665936e-05, |
|
"loss": 3.8832, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 9.051014810751509, |
|
"grad_norm": 2.899332046508789, |
|
"learning_rate": 4.547449259462425e-05, |
|
"loss": 3.8481, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 9.325287986834887, |
|
"grad_norm": 3.164444923400879, |
|
"learning_rate": 4.533735600658256e-05, |
|
"loss": 3.7773, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 9.599561162918267, |
|
"grad_norm": 3.017282009124756, |
|
"learning_rate": 4.5200219418540867e-05, |
|
"loss": 3.7689, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 9.873834339001645, |
|
"grad_norm": 3.367647647857666, |
|
"learning_rate": 4.506308283049918e-05, |
|
"loss": 3.752, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 10.148107515085025, |
|
"grad_norm": 2.9855947494506836, |
|
"learning_rate": 4.492594624245749e-05, |
|
"loss": 3.7182, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 10.422380691168403, |
|
"grad_norm": 3.505870819091797, |
|
"learning_rate": 4.47888096544158e-05, |
|
"loss": 3.674, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 10.696653867251783, |
|
"grad_norm": 3.438145160675049, |
|
"learning_rate": 4.465167306637411e-05, |
|
"loss": 3.651, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 10.970927043335163, |
|
"grad_norm": 3.0687413215637207, |
|
"learning_rate": 4.451453647833242e-05, |
|
"loss": 3.6479, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 11.24520021941854, |
|
"grad_norm": 3.2287588119506836, |
|
"learning_rate": 4.4377399890290734e-05, |
|
"loss": 3.5693, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 11.51947339550192, |
|
"grad_norm": 3.3848471641540527, |
|
"learning_rate": 4.424026330224904e-05, |
|
"loss": 3.5667, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 11.793746571585299, |
|
"grad_norm": 3.5464422702789307, |
|
"learning_rate": 4.410312671420735e-05, |
|
"loss": 3.5589, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 12.068019747668679, |
|
"grad_norm": 3.6160085201263428, |
|
"learning_rate": 4.3965990126165664e-05, |
|
"loss": 3.5313, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 12.342292923752057, |
|
"grad_norm": 3.6420817375183105, |
|
"learning_rate": 4.3828853538123976e-05, |
|
"loss": 3.4855, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 12.616566099835437, |
|
"grad_norm": 3.583449363708496, |
|
"learning_rate": 4.369171695008228e-05, |
|
"loss": 3.4654, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 12.890839275918815, |
|
"grad_norm": 3.5506091117858887, |
|
"learning_rate": 4.3554580362040594e-05, |
|
"loss": 3.4512, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 13.165112452002194, |
|
"grad_norm": 4.101990699768066, |
|
"learning_rate": 4.341744377399891e-05, |
|
"loss": 3.4055, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 13.439385628085573, |
|
"grad_norm": 4.144250392913818, |
|
"learning_rate": 4.328030718595721e-05, |
|
"loss": 3.3755, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 13.713658804168952, |
|
"grad_norm": 3.6288070678710938, |
|
"learning_rate": 4.3143170597915525e-05, |
|
"loss": 3.3725, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 13.98793198025233, |
|
"grad_norm": 3.5927882194519043, |
|
"learning_rate": 4.300603400987384e-05, |
|
"loss": 3.3693, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 14.26220515633571, |
|
"grad_norm": 3.405404567718506, |
|
"learning_rate": 4.286889742183215e-05, |
|
"loss": 3.2955, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 14.53647833241909, |
|
"grad_norm": 4.086198329925537, |
|
"learning_rate": 4.2731760833790455e-05, |
|
"loss": 3.3038, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 14.810751508502468, |
|
"grad_norm": 3.3961052894592285, |
|
"learning_rate": 4.259462424574877e-05, |
|
"loss": 3.2561, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 15.085024684585848, |
|
"grad_norm": 3.6080105304718018, |
|
"learning_rate": 4.245748765770708e-05, |
|
"loss": 3.2539, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 15.359297860669226, |
|
"grad_norm": 3.597956657409668, |
|
"learning_rate": 4.2320351069665385e-05, |
|
"loss": 3.2148, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 15.633571036752606, |
|
"grad_norm": 3.466057062149048, |
|
"learning_rate": 4.21832144816237e-05, |
|
"loss": 3.2031, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 15.907844212835984, |
|
"grad_norm": 4.239918231964111, |
|
"learning_rate": 4.204607789358201e-05, |
|
"loss": 3.1768, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 16.182117388919362, |
|
"grad_norm": 4.533541202545166, |
|
"learning_rate": 4.190894130554032e-05, |
|
"loss": 3.1258, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 16.456390565002742, |
|
"grad_norm": 3.8643674850463867, |
|
"learning_rate": 4.177180471749863e-05, |
|
"loss": 3.1162, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 16.730663741086122, |
|
"grad_norm": 3.710988998413086, |
|
"learning_rate": 4.163466812945694e-05, |
|
"loss": 3.1159, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 17.004936917169502, |
|
"grad_norm": 3.6405742168426514, |
|
"learning_rate": 4.149753154141525e-05, |
|
"loss": 3.1137, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 17.27921009325288, |
|
"grad_norm": 4.127532482147217, |
|
"learning_rate": 4.136039495337356e-05, |
|
"loss": 3.0327, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 17.553483269336258, |
|
"grad_norm": 3.6551403999328613, |
|
"learning_rate": 4.122325836533188e-05, |
|
"loss": 3.0611, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 17.827756445419638, |
|
"grad_norm": 3.6437666416168213, |
|
"learning_rate": 4.108612177729018e-05, |
|
"loss": 3.0233, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 18.102029621503018, |
|
"grad_norm": 4.008886814117432, |
|
"learning_rate": 4.0948985189248495e-05, |
|
"loss": 3.0099, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 18.376302797586398, |
|
"grad_norm": 3.779545783996582, |
|
"learning_rate": 4.08118486012068e-05, |
|
"loss": 2.9663, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 18.650575973669774, |
|
"grad_norm": 3.7845826148986816, |
|
"learning_rate": 4.067471201316512e-05, |
|
"loss": 2.9628, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 18.924849149753154, |
|
"grad_norm": 3.866852283477783, |
|
"learning_rate": 4.0537575425123425e-05, |
|
"loss": 2.9649, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 19.199122325836534, |
|
"grad_norm": 3.9092442989349365, |
|
"learning_rate": 4.040043883708173e-05, |
|
"loss": 2.9199, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 19.473395501919914, |
|
"grad_norm": 4.038732528686523, |
|
"learning_rate": 4.026330224904005e-05, |
|
"loss": 2.9078, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 19.74766867800329, |
|
"grad_norm": 3.717470645904541, |
|
"learning_rate": 4.0126165660998355e-05, |
|
"loss": 2.8907, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 20.02194185408667, |
|
"grad_norm": 3.9166011810302734, |
|
"learning_rate": 3.998902907295667e-05, |
|
"loss": 2.8892, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 20.29621503017005, |
|
"grad_norm": 3.9473681449890137, |
|
"learning_rate": 3.985189248491497e-05, |
|
"loss": 2.8274, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 20.57048820625343, |
|
"grad_norm": 4.139565467834473, |
|
"learning_rate": 3.971475589687329e-05, |
|
"loss": 2.8371, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 20.844761382336806, |
|
"grad_norm": 3.7124762535095215, |
|
"learning_rate": 3.95776193088316e-05, |
|
"loss": 2.8401, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 21.119034558420186, |
|
"grad_norm": 3.772149085998535, |
|
"learning_rate": 3.9440482720789904e-05, |
|
"loss": 2.8083, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 21.393307734503566, |
|
"grad_norm": 4.185425758361816, |
|
"learning_rate": 3.930334613274822e-05, |
|
"loss": 2.7767, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 21.667580910586945, |
|
"grad_norm": 3.800649881362915, |
|
"learning_rate": 3.916620954470653e-05, |
|
"loss": 2.7723, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 21.941854086670325, |
|
"grad_norm": 3.800741195678711, |
|
"learning_rate": 3.902907295666484e-05, |
|
"loss": 2.774, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 22.2161272627537, |
|
"grad_norm": 3.7834713459014893, |
|
"learning_rate": 3.889193636862315e-05, |
|
"loss": 2.7329, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 22.49040043883708, |
|
"grad_norm": 4.18643045425415, |
|
"learning_rate": 3.8754799780581465e-05, |
|
"loss": 2.7181, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 22.76467361492046, |
|
"grad_norm": 3.758415460586548, |
|
"learning_rate": 3.861766319253977e-05, |
|
"loss": 2.7148, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 23.03894679100384, |
|
"grad_norm": 4.028139114379883, |
|
"learning_rate": 3.8480526604498076e-05, |
|
"loss": 2.7025, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 23.313219967087218, |
|
"grad_norm": 3.779428243637085, |
|
"learning_rate": 3.8343390016456395e-05, |
|
"loss": 2.6542, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 23.587493143170597, |
|
"grad_norm": 3.6896047592163086, |
|
"learning_rate": 3.82062534284147e-05, |
|
"loss": 2.6662, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 23.861766319253977, |
|
"grad_norm": 3.527114152908325, |
|
"learning_rate": 3.806911684037301e-05, |
|
"loss": 2.6697, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 24.136039495337357, |
|
"grad_norm": 3.832408905029297, |
|
"learning_rate": 3.7931980252331326e-05, |
|
"loss": 2.6443, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 24.410312671420733, |
|
"grad_norm": 3.912022352218628, |
|
"learning_rate": 3.779484366428964e-05, |
|
"loss": 2.6098, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 24.684585847504113, |
|
"grad_norm": 3.832465648651123, |
|
"learning_rate": 3.7657707076247944e-05, |
|
"loss": 2.6187, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 24.958859023587493, |
|
"grad_norm": 4.292030334472656, |
|
"learning_rate": 3.752057048820625e-05, |
|
"loss": 2.6216, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 25.233132199670873, |
|
"grad_norm": 3.9441139698028564, |
|
"learning_rate": 3.738343390016457e-05, |
|
"loss": 2.5674, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 25.507405375754253, |
|
"grad_norm": 3.8073363304138184, |
|
"learning_rate": 3.7246297312122874e-05, |
|
"loss": 2.5613, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 25.78167855183763, |
|
"grad_norm": 3.95381498336792, |
|
"learning_rate": 3.7109160724081186e-05, |
|
"loss": 2.5744, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 26.05595172792101, |
|
"grad_norm": 4.265843391418457, |
|
"learning_rate": 3.69720241360395e-05, |
|
"loss": 2.5581, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 26.33022490400439, |
|
"grad_norm": 3.7907886505126953, |
|
"learning_rate": 3.683488754799781e-05, |
|
"loss": 2.5085, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 26.60449808008777, |
|
"grad_norm": 3.9580938816070557, |
|
"learning_rate": 3.6697750959956116e-05, |
|
"loss": 2.5244, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 26.878771256171145, |
|
"grad_norm": 3.725271701812744, |
|
"learning_rate": 3.656061437191443e-05, |
|
"loss": 2.5283, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 27.153044432254525, |
|
"grad_norm": 4.27789831161499, |
|
"learning_rate": 3.642347778387274e-05, |
|
"loss": 2.4978, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 27.427317608337905, |
|
"grad_norm": 4.248454570770264, |
|
"learning_rate": 3.628634119583105e-05, |
|
"loss": 2.478, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 27.701590784421285, |
|
"grad_norm": 3.7782256603240967, |
|
"learning_rate": 3.614920460778936e-05, |
|
"loss": 2.4797, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 27.97586396050466, |
|
"grad_norm": 3.996277332305908, |
|
"learning_rate": 3.601206801974767e-05, |
|
"loss": 2.4872, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 28.25013713658804, |
|
"grad_norm": 4.143040657043457, |
|
"learning_rate": 3.5874931431705984e-05, |
|
"loss": 2.4311, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 28.52441031267142, |
|
"grad_norm": 3.6849589347839355, |
|
"learning_rate": 3.573779484366429e-05, |
|
"loss": 2.4377, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 28.7986834887548, |
|
"grad_norm": 3.621760606765747, |
|
"learning_rate": 3.56006582556226e-05, |
|
"loss": 2.4425, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 29.07295666483818, |
|
"grad_norm": 3.7394306659698486, |
|
"learning_rate": 3.5463521667580914e-05, |
|
"loss": 2.4287, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 29.347229840921557, |
|
"grad_norm": 3.782111167907715, |
|
"learning_rate": 3.532638507953922e-05, |
|
"loss": 2.3911, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 29.621503017004937, |
|
"grad_norm": 4.35050106048584, |
|
"learning_rate": 3.518924849149753e-05, |
|
"loss": 2.4084, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 29.895776193088317, |
|
"grad_norm": 3.8727004528045654, |
|
"learning_rate": 3.5052111903455844e-05, |
|
"loss": 2.3928, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 30.170049369171696, |
|
"grad_norm": 3.974501371383667, |
|
"learning_rate": 3.4914975315414157e-05, |
|
"loss": 2.3668, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 30.444322545255073, |
|
"grad_norm": 3.7882275581359863, |
|
"learning_rate": 3.477783872737246e-05, |
|
"loss": 2.3686, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 30.718595721338453, |
|
"grad_norm": 3.8313581943511963, |
|
"learning_rate": 3.4640702139330774e-05, |
|
"loss": 2.3707, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 30.992868897421832, |
|
"grad_norm": 4.046344757080078, |
|
"learning_rate": 3.450356555128909e-05, |
|
"loss": 2.3632, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 31.267142073505212, |
|
"grad_norm": 3.7165708541870117, |
|
"learning_rate": 3.436642896324739e-05, |
|
"loss": 2.3229, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 31.54141524958859, |
|
"grad_norm": 3.8072948455810547, |
|
"learning_rate": 3.4229292375205705e-05, |
|
"loss": 2.3262, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 31.81568842567197, |
|
"grad_norm": 3.8423380851745605, |
|
"learning_rate": 3.409215578716402e-05, |
|
"loss": 2.3311, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 32.08996160175535, |
|
"grad_norm": 3.831343650817871, |
|
"learning_rate": 3.395501919912233e-05, |
|
"loss": 2.3213, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 32.364234777838725, |
|
"grad_norm": 3.8060476779937744, |
|
"learning_rate": 3.3817882611080635e-05, |
|
"loss": 2.2849, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 32.63850795392211, |
|
"grad_norm": 4.033987998962402, |
|
"learning_rate": 3.368074602303895e-05, |
|
"loss": 2.2979, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 32.912781130005484, |
|
"grad_norm": 3.870171546936035, |
|
"learning_rate": 3.354360943499726e-05, |
|
"loss": 2.2982, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 33.18705430608887, |
|
"grad_norm": 3.848620653152466, |
|
"learning_rate": 3.340647284695557e-05, |
|
"loss": 2.264, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 33.461327482172244, |
|
"grad_norm": 4.048386573791504, |
|
"learning_rate": 3.326933625891388e-05, |
|
"loss": 2.2568, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 33.73560065825562, |
|
"grad_norm": 4.029069900512695, |
|
"learning_rate": 3.313219967087219e-05, |
|
"loss": 2.2602, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 34.009873834339004, |
|
"grad_norm": 3.759799003601074, |
|
"learning_rate": 3.29950630828305e-05, |
|
"loss": 2.2763, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 34.28414701042238, |
|
"grad_norm": 4.440002918243408, |
|
"learning_rate": 3.285792649478881e-05, |
|
"loss": 2.2137, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 34.55842018650576, |
|
"grad_norm": 3.961390972137451, |
|
"learning_rate": 3.272078990674713e-05, |
|
"loss": 2.233, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 34.83269336258914, |
|
"grad_norm": 3.916156768798828, |
|
"learning_rate": 3.258365331870543e-05, |
|
"loss": 2.2414, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 35.106966538672516, |
|
"grad_norm": 3.746572732925415, |
|
"learning_rate": 3.2446516730663745e-05, |
|
"loss": 2.2137, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 35.3812397147559, |
|
"grad_norm": 3.8424971103668213, |
|
"learning_rate": 3.230938014262205e-05, |
|
"loss": 2.1907, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 35.655512890839276, |
|
"grad_norm": 4.250007152557373, |
|
"learning_rate": 3.217224355458036e-05, |
|
"loss": 2.2104, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 35.92978606692265, |
|
"grad_norm": 3.760779857635498, |
|
"learning_rate": 3.2035106966538675e-05, |
|
"loss": 2.2134, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 36.204059243006036, |
|
"grad_norm": 4.189092636108398, |
|
"learning_rate": 3.189797037849698e-05, |
|
"loss": 2.1714, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 36.47833241908941, |
|
"grad_norm": 3.6505000591278076, |
|
"learning_rate": 3.17608337904553e-05, |
|
"loss": 2.1635, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 36.752605595172795, |
|
"grad_norm": 3.7696096897125244, |
|
"learning_rate": 3.1623697202413605e-05, |
|
"loss": 2.1832, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 37.02687877125617, |
|
"grad_norm": 3.744976282119751, |
|
"learning_rate": 3.148656061437192e-05, |
|
"loss": 2.1753, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 37.30115194733955, |
|
"grad_norm": 3.9759116172790527, |
|
"learning_rate": 3.134942402633022e-05, |
|
"loss": 2.1326, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 37.57542512342293, |
|
"grad_norm": 3.719237804412842, |
|
"learning_rate": 3.1212287438288536e-05, |
|
"loss": 2.1423, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 37.84969829950631, |
|
"grad_norm": 4.290117263793945, |
|
"learning_rate": 3.107515085024685e-05, |
|
"loss": 2.1602, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 38.123971475589684, |
|
"grad_norm": 4.060342311859131, |
|
"learning_rate": 3.0938014262205153e-05, |
|
"loss": 2.1349, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 38.39824465167307, |
|
"grad_norm": 4.048706531524658, |
|
"learning_rate": 3.080087767416347e-05, |
|
"loss": 2.1018, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 38.672517827756444, |
|
"grad_norm": 4.130014896392822, |
|
"learning_rate": 3.066374108612178e-05, |
|
"loss": 2.1284, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 38.94679100383983, |
|
"grad_norm": 3.893848419189453, |
|
"learning_rate": 3.052660449808009e-05, |
|
"loss": 2.1255, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 39.221064179923204, |
|
"grad_norm": 3.6254563331604004, |
|
"learning_rate": 3.03894679100384e-05, |
|
"loss": 2.0848, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 39.49533735600658, |
|
"grad_norm": 4.06374979019165, |
|
"learning_rate": 3.025233132199671e-05, |
|
"loss": 2.0933, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 39.76961053208996, |
|
"grad_norm": 3.763274908065796, |
|
"learning_rate": 3.011519473395502e-05, |
|
"loss": 2.11, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 40.04388370817334, |
|
"grad_norm": 4.014530181884766, |
|
"learning_rate": 2.997805814591333e-05, |
|
"loss": 2.0918, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 40.31815688425672, |
|
"grad_norm": 3.6787962913513184, |
|
"learning_rate": 2.9840921557871642e-05, |
|
"loss": 2.0559, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 40.5924300603401, |
|
"grad_norm": 3.752711057662964, |
|
"learning_rate": 2.970378496982995e-05, |
|
"loss": 2.067, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 40.866703236423476, |
|
"grad_norm": 3.795217752456665, |
|
"learning_rate": 2.9566648381788263e-05, |
|
"loss": 2.0792, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 41.14097641250686, |
|
"grad_norm": 3.7484569549560547, |
|
"learning_rate": 2.9429511793746572e-05, |
|
"loss": 2.0608, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 41.415249588590235, |
|
"grad_norm": 3.601229190826416, |
|
"learning_rate": 2.9292375205704885e-05, |
|
"loss": 2.0337, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 41.68952276467361, |
|
"grad_norm": 3.9707863330841064, |
|
"learning_rate": 2.9155238617663194e-05, |
|
"loss": 2.0426, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 41.963795940756995, |
|
"grad_norm": 3.9523677825927734, |
|
"learning_rate": 2.9018102029621502e-05, |
|
"loss": 2.0571, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 42.23806911684037, |
|
"grad_norm": 4.046602725982666, |
|
"learning_rate": 2.8880965441579815e-05, |
|
"loss": 2.0225, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 42.512342292923755, |
|
"grad_norm": 4.059443950653076, |
|
"learning_rate": 2.8743828853538124e-05, |
|
"loss": 2.0185, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 42.78661546900713, |
|
"grad_norm": 4.066934108734131, |
|
"learning_rate": 2.8606692265496436e-05, |
|
"loss": 2.0342, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 43.06088864509051, |
|
"grad_norm": 3.811591386795044, |
|
"learning_rate": 2.8469555677454745e-05, |
|
"loss": 2.0216, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 43.33516182117389, |
|
"grad_norm": 3.979374408721924, |
|
"learning_rate": 2.833241908941306e-05, |
|
"loss": 1.989, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 43.60943499725727, |
|
"grad_norm": 3.67275333404541, |
|
"learning_rate": 2.8195282501371366e-05, |
|
"loss": 1.9958, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 43.88370817334065, |
|
"grad_norm": 3.790217399597168, |
|
"learning_rate": 2.8058145913329675e-05, |
|
"loss": 2.0102, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 44.15798134942403, |
|
"grad_norm": 3.9934496879577637, |
|
"learning_rate": 2.7921009325287988e-05, |
|
"loss": 1.9847, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 44.4322545255074, |
|
"grad_norm": 4.339521408081055, |
|
"learning_rate": 2.7783872737246297e-05, |
|
"loss": 1.9752, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 44.70652770159079, |
|
"grad_norm": 3.5851519107818604, |
|
"learning_rate": 2.764673614920461e-05, |
|
"loss": 1.9883, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 44.98080087767416, |
|
"grad_norm": 4.129305362701416, |
|
"learning_rate": 2.7509599561162918e-05, |
|
"loss": 1.9896, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 45.25507405375754, |
|
"grad_norm": 3.752852201461792, |
|
"learning_rate": 2.7372462973121234e-05, |
|
"loss": 1.9465, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 45.52934722984092, |
|
"grad_norm": 3.923309087753296, |
|
"learning_rate": 2.723532638507954e-05, |
|
"loss": 1.9589, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 45.8036204059243, |
|
"grad_norm": 4.141747951507568, |
|
"learning_rate": 2.7098189797037848e-05, |
|
"loss": 1.9662, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 46.07789358200768, |
|
"grad_norm": 4.118216514587402, |
|
"learning_rate": 2.696105320899616e-05, |
|
"loss": 1.9518, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 46.35216675809106, |
|
"grad_norm": 4.061371326446533, |
|
"learning_rate": 2.682391662095447e-05, |
|
"loss": 1.928, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 46.626439934174435, |
|
"grad_norm": 4.138849258422852, |
|
"learning_rate": 2.6686780032912785e-05, |
|
"loss": 1.9456, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 46.90071311025782, |
|
"grad_norm": 3.9675650596618652, |
|
"learning_rate": 2.654964344487109e-05, |
|
"loss": 1.9465, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 47.174986286341195, |
|
"grad_norm": 3.745779514312744, |
|
"learning_rate": 2.6412506856829406e-05, |
|
"loss": 1.9293, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 47.44925946242458, |
|
"grad_norm": 3.6988871097564697, |
|
"learning_rate": 2.6275370268787712e-05, |
|
"loss": 1.915, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 47.723532638507955, |
|
"grad_norm": 3.7044730186462402, |
|
"learning_rate": 2.613823368074602e-05, |
|
"loss": 1.9199, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 47.99780581459133, |
|
"grad_norm": 3.6700057983398438, |
|
"learning_rate": 2.6001097092704337e-05, |
|
"loss": 1.9243, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 48.272078990674714, |
|
"grad_norm": 3.89973521232605, |
|
"learning_rate": 2.5863960504662642e-05, |
|
"loss": 1.8846, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 48.54635216675809, |
|
"grad_norm": 4.041015625, |
|
"learning_rate": 2.5726823916620958e-05, |
|
"loss": 1.8999, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 48.82062534284147, |
|
"grad_norm": 3.7937917709350586, |
|
"learning_rate": 2.5589687328579264e-05, |
|
"loss": 1.9085, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 49.09489851892485, |
|
"grad_norm": 4.050382614135742, |
|
"learning_rate": 2.545255074053758e-05, |
|
"loss": 1.8934, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 49.36917169500823, |
|
"grad_norm": 3.809558391571045, |
|
"learning_rate": 2.5315414152495888e-05, |
|
"loss": 1.8705, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 49.64344487109161, |
|
"grad_norm": 3.6460201740264893, |
|
"learning_rate": 2.51782775644542e-05, |
|
"loss": 1.8904, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 49.917718047174986, |
|
"grad_norm": 3.959718704223633, |
|
"learning_rate": 2.504114097641251e-05, |
|
"loss": 1.8936, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 50.19199122325836, |
|
"grad_norm": 3.786888837814331, |
|
"learning_rate": 2.490400438837082e-05, |
|
"loss": 1.8683, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 50.466264399341746, |
|
"grad_norm": 3.477952241897583, |
|
"learning_rate": 2.476686780032913e-05, |
|
"loss": 1.8634, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 50.74053757542512, |
|
"grad_norm": 3.998764991760254, |
|
"learning_rate": 2.462973121228744e-05, |
|
"loss": 1.8637, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 51.014810751508506, |
|
"grad_norm": 4.029101848602295, |
|
"learning_rate": 2.449259462424575e-05, |
|
"loss": 1.8749, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 51.28908392759188, |
|
"grad_norm": 3.8711071014404297, |
|
"learning_rate": 2.435545803620406e-05, |
|
"loss": 1.8377, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 51.56335710367526, |
|
"grad_norm": 3.922783136367798, |
|
"learning_rate": 2.421832144816237e-05, |
|
"loss": 1.8397, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 51.83763027975864, |
|
"grad_norm": 4.025134086608887, |
|
"learning_rate": 2.4081184860120682e-05, |
|
"loss": 1.8609, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 52.11190345584202, |
|
"grad_norm": 3.800508975982666, |
|
"learning_rate": 2.394404827207899e-05, |
|
"loss": 1.8407, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 52.386176631925395, |
|
"grad_norm": 3.944465160369873, |
|
"learning_rate": 2.3806911684037304e-05, |
|
"loss": 1.8263, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 52.66044980800878, |
|
"grad_norm": 4.014648914337158, |
|
"learning_rate": 2.3669775095995613e-05, |
|
"loss": 1.8374, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 52.934722984092154, |
|
"grad_norm": 4.08259916305542, |
|
"learning_rate": 2.353263850795392e-05, |
|
"loss": 1.8448, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 53.20899616017554, |
|
"grad_norm": 3.941981792449951, |
|
"learning_rate": 2.3395501919912234e-05, |
|
"loss": 1.8131, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 53.483269336258914, |
|
"grad_norm": 3.8573715686798096, |
|
"learning_rate": 2.3258365331870543e-05, |
|
"loss": 1.8169, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 53.75754251234229, |
|
"grad_norm": 3.987938165664673, |
|
"learning_rate": 2.3121228743828855e-05, |
|
"loss": 1.8145, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 54.031815688425674, |
|
"grad_norm": 3.652238607406616, |
|
"learning_rate": 2.2984092155787164e-05, |
|
"loss": 1.8213, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 54.30608886450905, |
|
"grad_norm": 3.640587568283081, |
|
"learning_rate": 2.2846955567745476e-05, |
|
"loss": 1.7938, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 54.58036204059243, |
|
"grad_norm": 3.884443759918213, |
|
"learning_rate": 2.2709818979703785e-05, |
|
"loss": 1.7963, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 54.85463521667581, |
|
"grad_norm": 4.245452880859375, |
|
"learning_rate": 2.2572682391662098e-05, |
|
"loss": 1.811, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 55.128908392759186, |
|
"grad_norm": 3.97247576713562, |
|
"learning_rate": 2.2435545803620407e-05, |
|
"loss": 1.7964, |
|
"step": 100500 |
|
}, |
|
{ |
|
"epoch": 55.40318156884257, |
|
"grad_norm": 3.8827009201049805, |
|
"learning_rate": 2.2298409215578716e-05, |
|
"loss": 1.7774, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 55.677454744925946, |
|
"grad_norm": 4.079446792602539, |
|
"learning_rate": 2.2161272627537028e-05, |
|
"loss": 1.7884, |
|
"step": 101500 |
|
}, |
|
{ |
|
"epoch": 55.95172792100932, |
|
"grad_norm": 4.093244552612305, |
|
"learning_rate": 2.2024136039495337e-05, |
|
"loss": 1.7904, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 56.226001097092706, |
|
"grad_norm": 3.674686908721924, |
|
"learning_rate": 2.188699945145365e-05, |
|
"loss": 1.7655, |
|
"step": 102500 |
|
}, |
|
{ |
|
"epoch": 56.50027427317608, |
|
"grad_norm": 4.042862892150879, |
|
"learning_rate": 2.174986286341196e-05, |
|
"loss": 1.7708, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 56.774547449259465, |
|
"grad_norm": 4.069617748260498, |
|
"learning_rate": 2.161272627537027e-05, |
|
"loss": 1.777, |
|
"step": 103500 |
|
}, |
|
{ |
|
"epoch": 57.04882062534284, |
|
"grad_norm": 3.68752384185791, |
|
"learning_rate": 2.1475589687328583e-05, |
|
"loss": 1.7702, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 57.32309380142622, |
|
"grad_norm": 3.998215436935425, |
|
"learning_rate": 2.133845309928689e-05, |
|
"loss": 1.7508, |
|
"step": 104500 |
|
}, |
|
{ |
|
"epoch": 57.5973669775096, |
|
"grad_norm": 4.300554275512695, |
|
"learning_rate": 2.12013165112452e-05, |
|
"loss": 1.7634, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 57.87164015359298, |
|
"grad_norm": 3.649411678314209, |
|
"learning_rate": 2.106417992320351e-05, |
|
"loss": 1.7603, |
|
"step": 105500 |
|
}, |
|
{ |
|
"epoch": 58.14591332967636, |
|
"grad_norm": 4.17492151260376, |
|
"learning_rate": 2.0927043335161822e-05, |
|
"loss": 1.7441, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 58.42018650575974, |
|
"grad_norm": 3.8550057411193848, |
|
"learning_rate": 2.0789906747120134e-05, |
|
"loss": 1.7381, |
|
"step": 106500 |
|
}, |
|
{ |
|
"epoch": 58.694459681843114, |
|
"grad_norm": 4.004961967468262, |
|
"learning_rate": 2.0652770159078443e-05, |
|
"loss": 1.7475, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 58.9687328579265, |
|
"grad_norm": 3.936483144760132, |
|
"learning_rate": 2.0515633571036756e-05, |
|
"loss": 1.7532, |
|
"step": 107500 |
|
}, |
|
{ |
|
"epoch": 59.24300603400987, |
|
"grad_norm": 3.812488317489624, |
|
"learning_rate": 2.037849698299506e-05, |
|
"loss": 1.7232, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 59.51727921009325, |
|
"grad_norm": 4.185512542724609, |
|
"learning_rate": 2.0241360394953374e-05, |
|
"loss": 1.7196, |
|
"step": 108500 |
|
}, |
|
{ |
|
"epoch": 59.79155238617663, |
|
"grad_norm": 4.278858184814453, |
|
"learning_rate": 2.0104223806911686e-05, |
|
"loss": 1.7356, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 60.06582556226001, |
|
"grad_norm": 4.104213714599609, |
|
"learning_rate": 1.9967087218869995e-05, |
|
"loss": 1.7328, |
|
"step": 109500 |
|
}, |
|
{ |
|
"epoch": 60.34009873834339, |
|
"grad_norm": 4.215428352355957, |
|
"learning_rate": 1.9829950630828307e-05, |
|
"loss": 1.7109, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 60.61437191442677, |
|
"grad_norm": 4.020122528076172, |
|
"learning_rate": 1.9692814042786616e-05, |
|
"loss": 1.7144, |
|
"step": 110500 |
|
}, |
|
{ |
|
"epoch": 60.888645090510146, |
|
"grad_norm": 3.9703729152679443, |
|
"learning_rate": 1.955567745474493e-05, |
|
"loss": 1.7335, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 61.16291826659353, |
|
"grad_norm": 3.900017023086548, |
|
"learning_rate": 1.9418540866703238e-05, |
|
"loss": 1.7048, |
|
"step": 111500 |
|
}, |
|
{ |
|
"epoch": 61.437191442676905, |
|
"grad_norm": 4.7137627601623535, |
|
"learning_rate": 1.9281404278661547e-05, |
|
"loss": 1.6902, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 61.71146461876029, |
|
"grad_norm": 4.035908222198486, |
|
"learning_rate": 1.914426769061986e-05, |
|
"loss": 1.7104, |
|
"step": 112500 |
|
}, |
|
{ |
|
"epoch": 61.985737794843665, |
|
"grad_norm": 4.041805744171143, |
|
"learning_rate": 1.9007131102578168e-05, |
|
"loss": 1.7155, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 62.26001097092704, |
|
"grad_norm": 3.817702054977417, |
|
"learning_rate": 1.886999451453648e-05, |
|
"loss": 1.6851, |
|
"step": 113500 |
|
}, |
|
{ |
|
"epoch": 62.534284147010425, |
|
"grad_norm": 3.8696234226226807, |
|
"learning_rate": 1.873285792649479e-05, |
|
"loss": 1.6857, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 62.8085573230938, |
|
"grad_norm": 3.909179925918579, |
|
"learning_rate": 1.85957213384531e-05, |
|
"loss": 1.7037, |
|
"step": 114500 |
|
}, |
|
{ |
|
"epoch": 63.08283049917718, |
|
"grad_norm": 3.8557326793670654, |
|
"learning_rate": 1.845858475041141e-05, |
|
"loss": 1.6936, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 63.35710367526056, |
|
"grad_norm": 4.332828044891357, |
|
"learning_rate": 1.832144816236972e-05, |
|
"loss": 1.6786, |
|
"step": 115500 |
|
}, |
|
{ |
|
"epoch": 63.63137685134394, |
|
"grad_norm": 4.454130172729492, |
|
"learning_rate": 1.818431157432803e-05, |
|
"loss": 1.6768, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 63.90565002742732, |
|
"grad_norm": 3.943071126937866, |
|
"learning_rate": 1.804717498628634e-05, |
|
"loss": 1.6905, |
|
"step": 116500 |
|
}, |
|
{ |
|
"epoch": 64.1799232035107, |
|
"grad_norm": 4.255739688873291, |
|
"learning_rate": 1.7910038398244653e-05, |
|
"loss": 1.663, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 64.45419637959408, |
|
"grad_norm": 4.027384281158447, |
|
"learning_rate": 1.7772901810202962e-05, |
|
"loss": 1.6603, |
|
"step": 117500 |
|
}, |
|
{ |
|
"epoch": 64.72846955567745, |
|
"grad_norm": 3.8232147693634033, |
|
"learning_rate": 1.7635765222161274e-05, |
|
"loss": 1.6701, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 65.00274273176083, |
|
"grad_norm": 3.83734130859375, |
|
"learning_rate": 1.7498628634119583e-05, |
|
"loss": 1.6797, |
|
"step": 118500 |
|
}, |
|
{ |
|
"epoch": 65.27701590784422, |
|
"grad_norm": 3.9775922298431396, |
|
"learning_rate": 1.7361492046077896e-05, |
|
"loss": 1.6441, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 65.55128908392759, |
|
"grad_norm": 3.810086250305176, |
|
"learning_rate": 1.7224355458036205e-05, |
|
"loss": 1.6534, |
|
"step": 119500 |
|
}, |
|
{ |
|
"epoch": 65.82556226001097, |
|
"grad_norm": 4.3292036056518555, |
|
"learning_rate": 1.7087218869994513e-05, |
|
"loss": 1.6651, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 66.09983543609435, |
|
"grad_norm": 4.046462535858154, |
|
"learning_rate": 1.6950082281952826e-05, |
|
"loss": 1.654, |
|
"step": 120500 |
|
}, |
|
{ |
|
"epoch": 66.37410861217774, |
|
"grad_norm": 4.200257778167725, |
|
"learning_rate": 1.6812945693911135e-05, |
|
"loss": 1.6383, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 66.6483817882611, |
|
"grad_norm": 3.98045015335083, |
|
"learning_rate": 1.6675809105869447e-05, |
|
"loss": 1.6571, |
|
"step": 121500 |
|
}, |
|
{ |
|
"epoch": 66.92265496434449, |
|
"grad_norm": 3.9323537349700928, |
|
"learning_rate": 1.653867251782776e-05, |
|
"loss": 1.6486, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 67.19692814042787, |
|
"grad_norm": 3.898150682449341, |
|
"learning_rate": 1.640153592978607e-05, |
|
"loss": 1.6288, |
|
"step": 122500 |
|
}, |
|
{ |
|
"epoch": 67.47120131651124, |
|
"grad_norm": 3.8490869998931885, |
|
"learning_rate": 1.6264399341744377e-05, |
|
"loss": 1.6305, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 67.74547449259462, |
|
"grad_norm": 4.125833034515381, |
|
"learning_rate": 1.6127262753702686e-05, |
|
"loss": 1.6393, |
|
"step": 123500 |
|
}, |
|
{ |
|
"epoch": 68.01974766867801, |
|
"grad_norm": 4.1837286949157715, |
|
"learning_rate": 1.5990126165661e-05, |
|
"loss": 1.6441, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 68.29402084476138, |
|
"grad_norm": 4.150059700012207, |
|
"learning_rate": 1.585298957761931e-05, |
|
"loss": 1.6144, |
|
"step": 124500 |
|
}, |
|
{ |
|
"epoch": 68.56829402084476, |
|
"grad_norm": 4.325094223022461, |
|
"learning_rate": 1.571585298957762e-05, |
|
"loss": 1.6254, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 68.84256719692814, |
|
"grad_norm": 3.9832139015197754, |
|
"learning_rate": 1.5578716401535932e-05, |
|
"loss": 1.636, |
|
"step": 125500 |
|
}, |
|
{ |
|
"epoch": 69.11684037301151, |
|
"grad_norm": 3.9516079425811768, |
|
"learning_rate": 1.544157981349424e-05, |
|
"loss": 1.6183, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 69.3911135490949, |
|
"grad_norm": 3.982802391052246, |
|
"learning_rate": 1.5304443225452554e-05, |
|
"loss": 1.6116, |
|
"step": 126500 |
|
}, |
|
{ |
|
"epoch": 69.66538672517828, |
|
"grad_norm": 4.178645610809326, |
|
"learning_rate": 1.516730663741086e-05, |
|
"loss": 1.6183, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 69.93965990126166, |
|
"grad_norm": 4.045616149902344, |
|
"learning_rate": 1.5030170049369171e-05, |
|
"loss": 1.6226, |
|
"step": 127500 |
|
}, |
|
{ |
|
"epoch": 70.21393307734503, |
|
"grad_norm": 4.098151206970215, |
|
"learning_rate": 1.4893033461327482e-05, |
|
"loss": 1.5999, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 70.48820625342842, |
|
"grad_norm": 4.052021026611328, |
|
"learning_rate": 1.4755896873285793e-05, |
|
"loss": 1.5946, |
|
"step": 128500 |
|
}, |
|
{ |
|
"epoch": 70.7624794295118, |
|
"grad_norm": 3.7580652236938477, |
|
"learning_rate": 1.4618760285244103e-05, |
|
"loss": 1.6172, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 71.03675260559517, |
|
"grad_norm": 4.1435866355896, |
|
"learning_rate": 1.4481623697202416e-05, |
|
"loss": 1.6121, |
|
"step": 129500 |
|
}, |
|
{ |
|
"epoch": 71.31102578167855, |
|
"grad_norm": 3.739654302597046, |
|
"learning_rate": 1.4344487109160726e-05, |
|
"loss": 1.588, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 71.58529895776194, |
|
"grad_norm": 4.380291938781738, |
|
"learning_rate": 1.4207350521119034e-05, |
|
"loss": 1.5998, |
|
"step": 130500 |
|
}, |
|
{ |
|
"epoch": 71.8595721338453, |
|
"grad_norm": 3.7885782718658447, |
|
"learning_rate": 1.4070213933077344e-05, |
|
"loss": 1.5958, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 72.13384530992869, |
|
"grad_norm": 4.236293792724609, |
|
"learning_rate": 1.3933077345035655e-05, |
|
"loss": 1.5956, |
|
"step": 131500 |
|
}, |
|
{ |
|
"epoch": 72.40811848601207, |
|
"grad_norm": 4.205173015594482, |
|
"learning_rate": 1.3795940756993966e-05, |
|
"loss": 1.5766, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 72.68239166209544, |
|
"grad_norm": 4.034268379211426, |
|
"learning_rate": 1.3658804168952278e-05, |
|
"loss": 1.5912, |
|
"step": 132500 |
|
}, |
|
{ |
|
"epoch": 72.95666483817882, |
|
"grad_norm": 3.9170260429382324, |
|
"learning_rate": 1.3521667580910589e-05, |
|
"loss": 1.5897, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 73.23093801426221, |
|
"grad_norm": 3.925799608230591, |
|
"learning_rate": 1.33845309928689e-05, |
|
"loss": 1.5765, |
|
"step": 133500 |
|
}, |
|
{ |
|
"epoch": 73.50521119034559, |
|
"grad_norm": 4.052227973937988, |
|
"learning_rate": 1.324739440482721e-05, |
|
"loss": 1.5775, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 73.77948436642896, |
|
"grad_norm": 4.2378034591674805, |
|
"learning_rate": 1.3110257816785517e-05, |
|
"loss": 1.5834, |
|
"step": 134500 |
|
}, |
|
{ |
|
"epoch": 74.05375754251234, |
|
"grad_norm": 4.073320388793945, |
|
"learning_rate": 1.297312122874383e-05, |
|
"loss": 1.5812, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 74.32803071859573, |
|
"grad_norm": 4.102873802185059, |
|
"learning_rate": 1.283598464070214e-05, |
|
"loss": 1.5588, |
|
"step": 135500 |
|
}, |
|
{ |
|
"epoch": 74.6023038946791, |
|
"grad_norm": 4.223252773284912, |
|
"learning_rate": 1.269884805266045e-05, |
|
"loss": 1.5785, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 74.87657707076248, |
|
"grad_norm": 4.320130825042725, |
|
"learning_rate": 1.2561711464618761e-05, |
|
"loss": 1.5686, |
|
"step": 136500 |
|
}, |
|
{ |
|
"epoch": 75.15085024684586, |
|
"grad_norm": 4.706448078155518, |
|
"learning_rate": 1.2424574876577072e-05, |
|
"loss": 1.5688, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 75.42512342292923, |
|
"grad_norm": 4.024387359619141, |
|
"learning_rate": 1.2287438288535381e-05, |
|
"loss": 1.5563, |
|
"step": 137500 |
|
}, |
|
{ |
|
"epoch": 75.69939659901262, |
|
"grad_norm": 3.9221880435943604, |
|
"learning_rate": 1.2150301700493692e-05, |
|
"loss": 1.5712, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 75.973669775096, |
|
"grad_norm": 4.27291202545166, |
|
"learning_rate": 1.2013165112452002e-05, |
|
"loss": 1.5654, |
|
"step": 138500 |
|
}, |
|
{ |
|
"epoch": 76.24794295117937, |
|
"grad_norm": 4.373564720153809, |
|
"learning_rate": 1.1876028524410313e-05, |
|
"loss": 1.5449, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 76.52221612726275, |
|
"grad_norm": 4.030310153961182, |
|
"learning_rate": 1.1738891936368624e-05, |
|
"loss": 1.5571, |
|
"step": 139500 |
|
}, |
|
{ |
|
"epoch": 76.79648930334614, |
|
"grad_norm": 4.002580165863037, |
|
"learning_rate": 1.1601755348326934e-05, |
|
"loss": 1.5568, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 77.07076247942952, |
|
"grad_norm": 4.0623369216918945, |
|
"learning_rate": 1.1464618760285245e-05, |
|
"loss": 1.5545, |
|
"step": 140500 |
|
}, |
|
{ |
|
"epoch": 77.34503565551289, |
|
"grad_norm": 4.049304485321045, |
|
"learning_rate": 1.1327482172243554e-05, |
|
"loss": 1.5442, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 77.61930883159627, |
|
"grad_norm": 3.891969680786133, |
|
"learning_rate": 1.1190345584201866e-05, |
|
"loss": 1.5464, |
|
"step": 141500 |
|
}, |
|
{ |
|
"epoch": 77.89358200767965, |
|
"grad_norm": 4.165316104888916, |
|
"learning_rate": 1.1053208996160177e-05, |
|
"loss": 1.5509, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 78.16785518376302, |
|
"grad_norm": 4.1472249031066895, |
|
"learning_rate": 1.0916072408118487e-05, |
|
"loss": 1.5375, |
|
"step": 142500 |
|
}, |
|
{ |
|
"epoch": 78.44212835984641, |
|
"grad_norm": 4.173414707183838, |
|
"learning_rate": 1.0778935820076796e-05, |
|
"loss": 1.5345, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 78.71640153592979, |
|
"grad_norm": 3.9279398918151855, |
|
"learning_rate": 1.0641799232035107e-05, |
|
"loss": 1.5381, |
|
"step": 143500 |
|
}, |
|
{ |
|
"epoch": 78.99067471201316, |
|
"grad_norm": 4.222446441650391, |
|
"learning_rate": 1.0504662643993418e-05, |
|
"loss": 1.552, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 79.26494788809654, |
|
"grad_norm": 3.8020248413085938, |
|
"learning_rate": 1.0367526055951728e-05, |
|
"loss": 1.5217, |
|
"step": 144500 |
|
}, |
|
{ |
|
"epoch": 79.53922106417993, |
|
"grad_norm": 3.8790934085845947, |
|
"learning_rate": 1.0230389467910039e-05, |
|
"loss": 1.5346, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 79.8134942402633, |
|
"grad_norm": 4.49297571182251, |
|
"learning_rate": 1.009325287986835e-05, |
|
"loss": 1.5354, |
|
"step": 145500 |
|
}, |
|
{ |
|
"epoch": 80.08776741634668, |
|
"grad_norm": 4.024161338806152, |
|
"learning_rate": 9.95611629182666e-06, |
|
"loss": 1.5326, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 80.36204059243006, |
|
"grad_norm": 3.997326374053955, |
|
"learning_rate": 9.818979703784971e-06, |
|
"loss": 1.5133, |
|
"step": 146500 |
|
}, |
|
{ |
|
"epoch": 80.63631376851345, |
|
"grad_norm": 4.163906574249268, |
|
"learning_rate": 9.68184311574328e-06, |
|
"loss": 1.5252, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 80.91058694459682, |
|
"grad_norm": 4.333358287811279, |
|
"learning_rate": 9.54470652770159e-06, |
|
"loss": 1.5356, |
|
"step": 147500 |
|
}, |
|
{ |
|
"epoch": 81.1848601206802, |
|
"grad_norm": 4.201995372772217, |
|
"learning_rate": 9.407569939659903e-06, |
|
"loss": 1.5191, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 81.45913329676358, |
|
"grad_norm": 3.8196020126342773, |
|
"learning_rate": 9.270433351618212e-06, |
|
"loss": 1.5168, |
|
"step": 148500 |
|
}, |
|
{ |
|
"epoch": 81.73340647284695, |
|
"grad_norm": 4.32403039932251, |
|
"learning_rate": 9.133296763576522e-06, |
|
"loss": 1.5214, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 82.00767964893033, |
|
"grad_norm": 4.165477752685547, |
|
"learning_rate": 8.996160175534833e-06, |
|
"loss": 1.5242, |
|
"step": 149500 |
|
}, |
|
{ |
|
"epoch": 82.28195282501372, |
|
"grad_norm": 4.155007362365723, |
|
"learning_rate": 8.859023587493144e-06, |
|
"loss": 1.5022, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 82.55622600109709, |
|
"grad_norm": 4.049638748168945, |
|
"learning_rate": 8.721886999451453e-06, |
|
"loss": 1.5172, |
|
"step": 150500 |
|
}, |
|
{ |
|
"epoch": 82.83049917718047, |
|
"grad_norm": 4.376342296600342, |
|
"learning_rate": 8.584750411409765e-06, |
|
"loss": 1.5097, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 83.10477235326385, |
|
"grad_norm": 4.413540363311768, |
|
"learning_rate": 8.447613823368076e-06, |
|
"loss": 1.5163, |
|
"step": 151500 |
|
}, |
|
{ |
|
"epoch": 83.37904552934722, |
|
"grad_norm": 3.754805326461792, |
|
"learning_rate": 8.310477235326386e-06, |
|
"loss": 1.5011, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 83.6533187054306, |
|
"grad_norm": 4.167300224304199, |
|
"learning_rate": 8.173340647284695e-06, |
|
"loss": 1.5026, |
|
"step": 152500 |
|
}, |
|
{ |
|
"epoch": 83.92759188151399, |
|
"grad_norm": 4.1614861488342285, |
|
"learning_rate": 8.036204059243006e-06, |
|
"loss": 1.5101, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 84.20186505759737, |
|
"grad_norm": 4.183162212371826, |
|
"learning_rate": 7.899067471201317e-06, |
|
"loss": 1.4981, |
|
"step": 153500 |
|
}, |
|
{ |
|
"epoch": 84.47613823368074, |
|
"grad_norm": 4.0559539794921875, |
|
"learning_rate": 7.761930883159627e-06, |
|
"loss": 1.4965, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 84.75041140976413, |
|
"grad_norm": 4.252512454986572, |
|
"learning_rate": 7.624794295117937e-06, |
|
"loss": 1.5019, |
|
"step": 154500 |
|
}, |
|
{ |
|
"epoch": 85.02468458584751, |
|
"grad_norm": 3.9115328788757324, |
|
"learning_rate": 7.4876577070762485e-06, |
|
"loss": 1.5138, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 85.29895776193088, |
|
"grad_norm": 4.217545032501221, |
|
"learning_rate": 7.350521119034559e-06, |
|
"loss": 1.492, |
|
"step": 155500 |
|
}, |
|
{ |
|
"epoch": 85.57323093801426, |
|
"grad_norm": 3.974954128265381, |
|
"learning_rate": 7.21338453099287e-06, |
|
"loss": 1.4897, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 85.84750411409765, |
|
"grad_norm": 4.266519069671631, |
|
"learning_rate": 7.07624794295118e-06, |
|
"loss": 1.4996, |
|
"step": 156500 |
|
}, |
|
{ |
|
"epoch": 86.12177729018102, |
|
"grad_norm": 3.8507697582244873, |
|
"learning_rate": 6.93911135490949e-06, |
|
"loss": 1.4891, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 86.3960504662644, |
|
"grad_norm": 4.050006866455078, |
|
"learning_rate": 6.801974766867801e-06, |
|
"loss": 1.4848, |
|
"step": 157500 |
|
}, |
|
{ |
|
"epoch": 86.67032364234778, |
|
"grad_norm": 4.006500720977783, |
|
"learning_rate": 6.664838178826111e-06, |
|
"loss": 1.4946, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 86.94459681843115, |
|
"grad_norm": 4.2527289390563965, |
|
"learning_rate": 6.527701590784421e-06, |
|
"loss": 1.495, |
|
"step": 158500 |
|
}, |
|
{ |
|
"epoch": 87.21886999451453, |
|
"grad_norm": 4.087696552276611, |
|
"learning_rate": 6.390565002742732e-06, |
|
"loss": 1.4834, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 87.49314317059792, |
|
"grad_norm": 3.9683475494384766, |
|
"learning_rate": 6.253428414701043e-06, |
|
"loss": 1.48, |
|
"step": 159500 |
|
}, |
|
{ |
|
"epoch": 87.7674163466813, |
|
"grad_norm": 4.009182453155518, |
|
"learning_rate": 6.116291826659353e-06, |
|
"loss": 1.4799, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 88.04168952276467, |
|
"grad_norm": 3.9172310829162598, |
|
"learning_rate": 5.979155238617663e-06, |
|
"loss": 1.4827, |
|
"step": 160500 |
|
}, |
|
{ |
|
"epoch": 88.31596269884805, |
|
"grad_norm": 3.920940399169922, |
|
"learning_rate": 5.842018650575974e-06, |
|
"loss": 1.4772, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 88.59023587493144, |
|
"grad_norm": 4.178516387939453, |
|
"learning_rate": 5.704882062534284e-06, |
|
"loss": 1.4831, |
|
"step": 161500 |
|
}, |
|
{ |
|
"epoch": 88.8645090510148, |
|
"grad_norm": 4.068806171417236, |
|
"learning_rate": 5.567745474492595e-06, |
|
"loss": 1.4796, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 89.13878222709819, |
|
"grad_norm": 3.923023223876953, |
|
"learning_rate": 5.430608886450905e-06, |
|
"loss": 1.4734, |
|
"step": 162500 |
|
}, |
|
{ |
|
"epoch": 89.41305540318157, |
|
"grad_norm": 4.0538411140441895, |
|
"learning_rate": 5.293472298409216e-06, |
|
"loss": 1.4675, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 89.68732857926494, |
|
"grad_norm": 4.289505481719971, |
|
"learning_rate": 5.156335710367526e-06, |
|
"loss": 1.4812, |
|
"step": 163500 |
|
}, |
|
{ |
|
"epoch": 89.96160175534833, |
|
"grad_norm": 4.2184247970581055, |
|
"learning_rate": 5.019199122325837e-06, |
|
"loss": 1.4761, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 90.23587493143171, |
|
"grad_norm": 4.014777183532715, |
|
"learning_rate": 4.8820625342841474e-06, |
|
"loss": 1.4659, |
|
"step": 164500 |
|
}, |
|
{ |
|
"epoch": 90.51014810751508, |
|
"grad_norm": 4.025433540344238, |
|
"learning_rate": 4.744925946242457e-06, |
|
"loss": 1.4701, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 90.78442128359846, |
|
"grad_norm": 4.117000102996826, |
|
"learning_rate": 4.607789358200768e-06, |
|
"loss": 1.4745, |
|
"step": 165500 |
|
}, |
|
{ |
|
"epoch": 91.05869445968185, |
|
"grad_norm": 4.047626495361328, |
|
"learning_rate": 4.4706527701590785e-06, |
|
"loss": 1.4693, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 91.33296763576523, |
|
"grad_norm": 4.17887020111084, |
|
"learning_rate": 4.333516182117389e-06, |
|
"loss": 1.4624, |
|
"step": 166500 |
|
}, |
|
{ |
|
"epoch": 91.6072408118486, |
|
"grad_norm": 4.2437639236450195, |
|
"learning_rate": 4.196379594075699e-06, |
|
"loss": 1.4665, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 91.88151398793198, |
|
"grad_norm": 3.7711315155029297, |
|
"learning_rate": 4.0592430060340105e-06, |
|
"loss": 1.4731, |
|
"step": 167500 |
|
}, |
|
{ |
|
"epoch": 92.15578716401536, |
|
"grad_norm": 4.002791404724121, |
|
"learning_rate": 3.92210641799232e-06, |
|
"loss": 1.4642, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 92.43006034009873, |
|
"grad_norm": 4.0743231773376465, |
|
"learning_rate": 3.7849698299506313e-06, |
|
"loss": 1.4584, |
|
"step": 168500 |
|
}, |
|
{ |
|
"epoch": 92.70433351618212, |
|
"grad_norm": 4.080685138702393, |
|
"learning_rate": 3.647833241908941e-06, |
|
"loss": 1.4623, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 92.9786066922655, |
|
"grad_norm": 4.304593563079834, |
|
"learning_rate": 3.510696653867252e-06, |
|
"loss": 1.4682, |
|
"step": 169500 |
|
}, |
|
{ |
|
"epoch": 93.25287986834887, |
|
"grad_norm": 4.447428226470947, |
|
"learning_rate": 3.3735600658255624e-06, |
|
"loss": 1.4535, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 93.52715304443225, |
|
"grad_norm": 4.22756814956665, |
|
"learning_rate": 3.236423477783873e-06, |
|
"loss": 1.4592, |
|
"step": 170500 |
|
}, |
|
{ |
|
"epoch": 93.80142622051564, |
|
"grad_norm": 4.293380260467529, |
|
"learning_rate": 3.0992868897421833e-06, |
|
"loss": 1.4632, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 94.075699396599, |
|
"grad_norm": 4.07041072845459, |
|
"learning_rate": 2.962150301700494e-06, |
|
"loss": 1.4605, |
|
"step": 171500 |
|
}, |
|
{ |
|
"epoch": 94.34997257268239, |
|
"grad_norm": 4.039161205291748, |
|
"learning_rate": 2.825013713658804e-06, |
|
"loss": 1.4551, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 94.62424574876577, |
|
"grad_norm": 4.1246795654296875, |
|
"learning_rate": 2.687877125617115e-06, |
|
"loss": 1.456, |
|
"step": 172500 |
|
}, |
|
{ |
|
"epoch": 94.89851892484916, |
|
"grad_norm": 4.026761054992676, |
|
"learning_rate": 2.550740537575425e-06, |
|
"loss": 1.4512, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 95.17279210093253, |
|
"grad_norm": 4.5864715576171875, |
|
"learning_rate": 2.4136039495337357e-06, |
|
"loss": 1.4575, |
|
"step": 173500 |
|
}, |
|
{ |
|
"epoch": 95.44706527701591, |
|
"grad_norm": 4.117992401123047, |
|
"learning_rate": 2.2764673614920463e-06, |
|
"loss": 1.4475, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 95.72133845309929, |
|
"grad_norm": 4.155096530914307, |
|
"learning_rate": 2.1393307734503565e-06, |
|
"loss": 1.4581, |
|
"step": 174500 |
|
}, |
|
{ |
|
"epoch": 95.99561162918266, |
|
"grad_norm": 4.28767204284668, |
|
"learning_rate": 2.002194185408667e-06, |
|
"loss": 1.4521, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 96.26988480526605, |
|
"grad_norm": 4.1511077880859375, |
|
"learning_rate": 1.8650575973669776e-06, |
|
"loss": 1.4488, |
|
"step": 175500 |
|
}, |
|
{ |
|
"epoch": 96.54415798134943, |
|
"grad_norm": 4.336985111236572, |
|
"learning_rate": 1.727921009325288e-06, |
|
"loss": 1.4534, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 96.8184311574328, |
|
"grad_norm": 4.181045055389404, |
|
"learning_rate": 1.5907844212835987e-06, |
|
"loss": 1.45, |
|
"step": 176500 |
|
}, |
|
{ |
|
"epoch": 97.09270433351618, |
|
"grad_norm": 4.217624187469482, |
|
"learning_rate": 1.453647833241909e-06, |
|
"loss": 1.4498, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 97.36697750959956, |
|
"grad_norm": 3.8873023986816406, |
|
"learning_rate": 1.3165112452002194e-06, |
|
"loss": 1.4507, |
|
"step": 177500 |
|
}, |
|
{ |
|
"epoch": 97.64125068568293, |
|
"grad_norm": 4.3951191902160645, |
|
"learning_rate": 1.17937465715853e-06, |
|
"loss": 1.451, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 97.91552386176632, |
|
"grad_norm": 4.1204118728637695, |
|
"learning_rate": 1.0422380691168404e-06, |
|
"loss": 1.4484, |
|
"step": 178500 |
|
}, |
|
{ |
|
"epoch": 98.1897970378497, |
|
"grad_norm": 4.278495788574219, |
|
"learning_rate": 9.05101481075151e-07, |
|
"loss": 1.4431, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 98.46407021393308, |
|
"grad_norm": 4.186399459838867, |
|
"learning_rate": 7.679648930334613e-07, |
|
"loss": 1.4493, |
|
"step": 179500 |
|
}, |
|
{ |
|
"epoch": 98.73834339001645, |
|
"grad_norm": 4.110637187957764, |
|
"learning_rate": 6.308283049917719e-07, |
|
"loss": 1.4418, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 99.01261656609984, |
|
"grad_norm": 3.9559993743896484, |
|
"learning_rate": 4.936917169500823e-07, |
|
"loss": 1.4407, |
|
"step": 180500 |
|
}, |
|
{ |
|
"epoch": 99.28688974218322, |
|
"grad_norm": 4.4722418785095215, |
|
"learning_rate": 3.565551289083928e-07, |
|
"loss": 1.4421, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 99.56116291826659, |
|
"grad_norm": 4.151792526245117, |
|
"learning_rate": 2.1941854086670326e-07, |
|
"loss": 1.4469, |
|
"step": 181500 |
|
}, |
|
{ |
|
"epoch": 99.83543609434997, |
|
"grad_norm": 4.128389835357666, |
|
"learning_rate": 8.228195282501371e-08, |
|
"loss": 1.4463, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"step": 182300, |
|
"total_flos": 3.157662139522744e+17, |
|
"train_loss": 2.261507166926833, |
|
"train_runtime": 62730.7046, |
|
"train_samples_per_second": 185.96, |
|
"train_steps_per_second": 2.906 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 182300, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 100, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.157662139522744e+17, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|