{ "best_metric": null, "best_model_checkpoint": null, "epoch": 25.0, "eval_steps": 500, "global_step": 306800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04074315514993481, "grad_norm": 1.048214077949524, "learning_rate": 0.0002995110821382007, "loss": 0.3395, "step": 500 }, { "epoch": 0.08148631029986962, "grad_norm": 0.9001195430755615, "learning_rate": 0.00029902216427640153, "loss": 0.3549, "step": 1000 }, { "epoch": 0.12222946544980444, "grad_norm": 1.2574785947799683, "learning_rate": 0.0002985332464146023, "loss": 0.3524, "step": 1500 }, { "epoch": 0.16297262059973924, "grad_norm": 0.6687576174736023, "learning_rate": 0.0002980443285528031, "loss": 0.3576, "step": 2000 }, { "epoch": 0.20371577574967406, "grad_norm": 0.9696186184883118, "learning_rate": 0.0002975554106910039, "loss": 0.3473, "step": 2500 }, { "epoch": 0.24445893089960888, "grad_norm": 0.5164366960525513, "learning_rate": 0.00029706649282920466, "loss": 0.3438, "step": 3000 }, { "epoch": 0.28520208604954367, "grad_norm": 0.6610291600227356, "learning_rate": 0.00029657757496740546, "loss": 0.355, "step": 3500 }, { "epoch": 0.3259452411994785, "grad_norm": 0.7264110445976257, "learning_rate": 0.0002960886571056062, "loss": 0.3448, "step": 4000 }, { "epoch": 0.3666883963494133, "grad_norm": 0.7032458186149597, "learning_rate": 0.000295599739243807, "loss": 0.3503, "step": 4500 }, { "epoch": 0.4074315514993481, "grad_norm": 1.057846188545227, "learning_rate": 0.0002951108213820078, "loss": 0.3401, "step": 5000 }, { "epoch": 0.44817470664928294, "grad_norm": 0.9207776188850403, "learning_rate": 0.0002946219035202086, "loss": 0.3458, "step": 5500 }, { "epoch": 0.48891786179921776, "grad_norm": 0.6475661396980286, "learning_rate": 0.00029413298565840934, "loss": 0.349, "step": 6000 }, { "epoch": 0.5296610169491526, "grad_norm": 0.7293631434440613, "learning_rate": 0.00029364406779661015, "loss": 0.3498, "step": 6500 }, { "epoch": 0.5704041720990873, "grad_norm": 0.7588677406311035, "learning_rate": 0.00029315514993481095, "loss": 0.3359, "step": 7000 }, { "epoch": 0.6111473272490222, "grad_norm": 0.5262191891670227, "learning_rate": 0.0002926662320730117, "loss": 0.3391, "step": 7500 }, { "epoch": 0.651890482398957, "grad_norm": 0.5397419333457947, "learning_rate": 0.0002921773142112125, "loss": 0.3469, "step": 8000 }, { "epoch": 0.6926336375488917, "grad_norm": 1.1125962734222412, "learning_rate": 0.00029168839634941327, "loss": 0.3487, "step": 8500 }, { "epoch": 0.7333767926988266, "grad_norm": 0.7158764004707336, "learning_rate": 0.0002911994784876141, "loss": 0.3513, "step": 9000 }, { "epoch": 0.7741199478487614, "grad_norm": 0.9774956703186035, "learning_rate": 0.00029071056062581483, "loss": 0.3457, "step": 9500 }, { "epoch": 0.8148631029986962, "grad_norm": 1.0996198654174805, "learning_rate": 0.0002902216427640156, "loss": 0.339, "step": 10000 }, { "epoch": 0.855606258148631, "grad_norm": 1.374120831489563, "learning_rate": 0.0002897327249022164, "loss": 0.3497, "step": 10500 }, { "epoch": 0.8963494132985659, "grad_norm": 1.2725900411605835, "learning_rate": 0.0002892438070404172, "loss": 0.3458, "step": 11000 }, { "epoch": 0.9370925684485006, "grad_norm": 0.9719182252883911, "learning_rate": 0.00028875488917861795, "loss": 0.3509, "step": 11500 }, { "epoch": 0.9778357235984355, "grad_norm": 0.9504433870315552, "learning_rate": 0.00028826597131681876, "loss": 0.3456, "step": 12000 }, { "epoch": 1.0, "eval_accuracy": 0.8289156556129456, "eval_loss": 0.46264198422431946, "eval_runtime": 4.4513, "eval_samples_per_second": 559.387, "eval_steps_per_second": 17.523, "step": 12272 }, { "epoch": 1.0185788787483703, "grad_norm": 0.722874104976654, "learning_rate": 0.00028777705345501956, "loss": 0.347, "step": 12500 }, { "epoch": 1.0593220338983051, "grad_norm": 1.4278239011764526, "learning_rate": 0.0002872881355932203, "loss": 0.3436, "step": 13000 }, { "epoch": 1.1000651890482398, "grad_norm": 1.1039665937423706, "learning_rate": 0.00028679921773142107, "loss": 0.3527, "step": 13500 }, { "epoch": 1.1408083441981747, "grad_norm": 0.9053193926811218, "learning_rate": 0.0002863102998696219, "loss": 0.3504, "step": 14000 }, { "epoch": 1.1815514993481095, "grad_norm": 1.2441742420196533, "learning_rate": 0.00028582138200782263, "loss": 0.3556, "step": 14500 }, { "epoch": 1.2222946544980444, "grad_norm": 0.543782114982605, "learning_rate": 0.00028533246414602344, "loss": 0.346, "step": 15000 }, { "epoch": 1.263037809647979, "grad_norm": 1.2369636297225952, "learning_rate": 0.00028484354628422425, "loss": 0.3439, "step": 15500 }, { "epoch": 1.303780964797914, "grad_norm": 0.5305708050727844, "learning_rate": 0.000284354628422425, "loss": 0.3434, "step": 16000 }, { "epoch": 1.3445241199478488, "grad_norm": 0.6448760032653809, "learning_rate": 0.0002838657105606258, "loss": 0.3473, "step": 16500 }, { "epoch": 1.3852672750977835, "grad_norm": 1.762876272201538, "learning_rate": 0.00028337679269882656, "loss": 0.3477, "step": 17000 }, { "epoch": 1.4260104302477183, "grad_norm": 0.6933132410049438, "learning_rate": 0.00028288787483702737, "loss": 0.3508, "step": 17500 }, { "epoch": 1.4667535853976532, "grad_norm": 0.806649923324585, "learning_rate": 0.0002823989569752281, "loss": 0.3508, "step": 18000 }, { "epoch": 1.5074967405475879, "grad_norm": 0.8625214099884033, "learning_rate": 0.00028191003911342893, "loss": 0.3504, "step": 18500 }, { "epoch": 1.548239895697523, "grad_norm": 0.7049874663352966, "learning_rate": 0.0002814211212516297, "loss": 0.3431, "step": 19000 }, { "epoch": 1.5889830508474576, "grad_norm": 0.8473791480064392, "learning_rate": 0.0002809322033898305, "loss": 0.3467, "step": 19500 }, { "epoch": 1.6297262059973925, "grad_norm": 1.294495701789856, "learning_rate": 0.00028044328552803124, "loss": 0.3421, "step": 20000 }, { "epoch": 1.6704693611473274, "grad_norm": 1.0067027807235718, "learning_rate": 0.00027995436766623205, "loss": 0.3437, "step": 20500 }, { "epoch": 1.711212516297262, "grad_norm": 1.0469399690628052, "learning_rate": 0.00027946544980443286, "loss": 0.3418, "step": 21000 }, { "epoch": 1.7519556714471969, "grad_norm": 1.1492327451705933, "learning_rate": 0.0002789765319426336, "loss": 0.3415, "step": 21500 }, { "epoch": 1.7926988265971318, "grad_norm": 0.5382260084152222, "learning_rate": 0.0002784876140808344, "loss": 0.3436, "step": 22000 }, { "epoch": 1.8334419817470664, "grad_norm": 1.650864839553833, "learning_rate": 0.00027799869621903517, "loss": 0.3504, "step": 22500 }, { "epoch": 1.8741851368970013, "grad_norm": 1.287651777267456, "learning_rate": 0.0002775097783572359, "loss": 0.3376, "step": 23000 }, { "epoch": 1.9149282920469362, "grad_norm": 0.8451623320579529, "learning_rate": 0.00027702086049543673, "loss": 0.3441, "step": 23500 }, { "epoch": 1.9556714471968708, "grad_norm": 1.2747808694839478, "learning_rate": 0.00027653194263363754, "loss": 0.3457, "step": 24000 }, { "epoch": 1.996414602346806, "grad_norm": 0.9121665954589844, "learning_rate": 0.0002760430247718383, "loss": 0.3458, "step": 24500 }, { "epoch": 2.0, "eval_accuracy": 0.8253012299537659, "eval_loss": 0.48892053961753845, "eval_runtime": 4.3688, "eval_samples_per_second": 569.948, "eval_steps_per_second": 17.854, "step": 24544 }, { "epoch": 2.0371577574967406, "grad_norm": 1.2752048969268799, "learning_rate": 0.0002755541069100391, "loss": 0.3443, "step": 25000 }, { "epoch": 2.077900912646675, "grad_norm": 0.7506113052368164, "learning_rate": 0.0002750651890482399, "loss": 0.3467, "step": 25500 }, { "epoch": 2.1186440677966103, "grad_norm": 1.016860008239746, "learning_rate": 0.00027457627118644066, "loss": 0.3469, "step": 26000 }, { "epoch": 2.159387222946545, "grad_norm": 0.7216314673423767, "learning_rate": 0.0002740873533246414, "loss": 0.3536, "step": 26500 }, { "epoch": 2.2001303780964796, "grad_norm": 1.0932265520095825, "learning_rate": 0.0002735984354628422, "loss": 0.3476, "step": 27000 }, { "epoch": 2.2408735332464147, "grad_norm": 1.5072846412658691, "learning_rate": 0.000273109517601043, "loss": 0.3426, "step": 27500 }, { "epoch": 2.2816166883963493, "grad_norm": 0.7757501602172852, "learning_rate": 0.0002726205997392438, "loss": 0.3556, "step": 28000 }, { "epoch": 2.322359843546284, "grad_norm": 1.0053937435150146, "learning_rate": 0.00027213168187744454, "loss": 0.3334, "step": 28500 }, { "epoch": 2.363102998696219, "grad_norm": 1.131600022315979, "learning_rate": 0.00027164276401564534, "loss": 0.3467, "step": 29000 }, { "epoch": 2.4038461538461537, "grad_norm": 2.05977201461792, "learning_rate": 0.00027115384615384615, "loss": 0.3389, "step": 29500 }, { "epoch": 2.444589308996089, "grad_norm": 1.1583738327026367, "learning_rate": 0.0002706649282920469, "loss": 0.3449, "step": 30000 }, { "epoch": 2.4853324641460235, "grad_norm": 1.2872370481491089, "learning_rate": 0.0002701760104302477, "loss": 0.3466, "step": 30500 }, { "epoch": 2.526075619295958, "grad_norm": 0.8480736017227173, "learning_rate": 0.00026968709256844847, "loss": 0.3463, "step": 31000 }, { "epoch": 2.5668187744458932, "grad_norm": 1.4083653688430786, "learning_rate": 0.0002691981747066493, "loss": 0.3533, "step": 31500 }, { "epoch": 2.607561929595828, "grad_norm": 1.5343284606933594, "learning_rate": 0.00026870925684485, "loss": 0.337, "step": 32000 }, { "epoch": 2.648305084745763, "grad_norm": 0.559269905090332, "learning_rate": 0.00026822033898305083, "loss": 0.3476, "step": 32500 }, { "epoch": 2.6890482398956976, "grad_norm": 0.504391610622406, "learning_rate": 0.0002677314211212516, "loss": 0.3429, "step": 33000 }, { "epoch": 2.7297913950456323, "grad_norm": 0.9045794606208801, "learning_rate": 0.0002672425032594524, "loss": 0.3439, "step": 33500 }, { "epoch": 2.770534550195567, "grad_norm": 0.6075222492218018, "learning_rate": 0.0002667535853976532, "loss": 0.3498, "step": 34000 }, { "epoch": 2.811277705345502, "grad_norm": 0.6640328168869019, "learning_rate": 0.00026626466753585396, "loss": 0.3494, "step": 34500 }, { "epoch": 2.8520208604954367, "grad_norm": 0.9243645071983337, "learning_rate": 0.00026577574967405476, "loss": 0.3301, "step": 35000 }, { "epoch": 2.8927640156453718, "grad_norm": 0.905765950679779, "learning_rate": 0.0002652868318122555, "loss": 0.3388, "step": 35500 }, { "epoch": 2.9335071707953064, "grad_norm": 0.6984039545059204, "learning_rate": 0.00026479791395045627, "loss": 0.3397, "step": 36000 }, { "epoch": 2.974250325945241, "grad_norm": 1.085003137588501, "learning_rate": 0.0002643089960886571, "loss": 0.3402, "step": 36500 }, { "epoch": 3.0, "eval_accuracy": 0.828514039516449, "eval_loss": 0.4701636731624603, "eval_runtime": 4.3721, "eval_samples_per_second": 569.519, "eval_steps_per_second": 17.84, "step": 36816 }, { "epoch": 3.014993481095176, "grad_norm": 0.4969957172870636, "learning_rate": 0.00026382007822685783, "loss": 0.3489, "step": 37000 }, { "epoch": 3.055736636245111, "grad_norm": 0.6103217005729675, "learning_rate": 0.00026333116036505864, "loss": 0.3456, "step": 37500 }, { "epoch": 3.0964797913950455, "grad_norm": 1.2167404890060425, "learning_rate": 0.00026284224250325945, "loss": 0.3418, "step": 38000 }, { "epoch": 3.1372229465449806, "grad_norm": 0.7105266451835632, "learning_rate": 0.00026235332464146025, "loss": 0.3476, "step": 38500 }, { "epoch": 3.1779661016949152, "grad_norm": 0.8161793947219849, "learning_rate": 0.000261864406779661, "loss": 0.34, "step": 39000 }, { "epoch": 3.21870925684485, "grad_norm": 0.6345824599266052, "learning_rate": 0.00026137548891786176, "loss": 0.3453, "step": 39500 }, { "epoch": 3.259452411994785, "grad_norm": 1.5101888179779053, "learning_rate": 0.00026088657105606257, "loss": 0.3502, "step": 40000 }, { "epoch": 3.3001955671447196, "grad_norm": 0.4510188698768616, "learning_rate": 0.0002603976531942633, "loss": 0.3441, "step": 40500 }, { "epoch": 3.3409387222946547, "grad_norm": 0.7021961808204651, "learning_rate": 0.00025990873533246413, "loss": 0.3422, "step": 41000 }, { "epoch": 3.3816818774445894, "grad_norm": 0.48735475540161133, "learning_rate": 0.0002594198174706649, "loss": 0.3524, "step": 41500 }, { "epoch": 3.422425032594524, "grad_norm": 0.6795012950897217, "learning_rate": 0.0002589308996088657, "loss": 0.3333, "step": 42000 }, { "epoch": 3.463168187744459, "grad_norm": 0.9818280935287476, "learning_rate": 0.0002584419817470665, "loss": 0.3451, "step": 42500 }, { "epoch": 3.5039113428943938, "grad_norm": 0.5878106951713562, "learning_rate": 0.00025795306388526725, "loss": 0.3486, "step": 43000 }, { "epoch": 3.5446544980443284, "grad_norm": 1.0303575992584229, "learning_rate": 0.00025746414602346806, "loss": 0.3415, "step": 43500 }, { "epoch": 3.5853976531942635, "grad_norm": 1.2737740278244019, "learning_rate": 0.0002569752281616688, "loss": 0.3412, "step": 44000 }, { "epoch": 3.626140808344198, "grad_norm": 2.338712215423584, "learning_rate": 0.0002564863102998696, "loss": 0.3427, "step": 44500 }, { "epoch": 3.666883963494133, "grad_norm": 0.7371268272399902, "learning_rate": 0.00025599739243807037, "loss": 0.3507, "step": 45000 }, { "epoch": 3.707627118644068, "grad_norm": 1.27114737033844, "learning_rate": 0.0002555084745762711, "loss": 0.3447, "step": 45500 }, { "epoch": 3.7483702737940026, "grad_norm": 0.7382656335830688, "learning_rate": 0.00025501955671447193, "loss": 0.339, "step": 46000 }, { "epoch": 3.7891134289439377, "grad_norm": 0.837044358253479, "learning_rate": 0.00025453063885267274, "loss": 0.3389, "step": 46500 }, { "epoch": 3.8298565840938723, "grad_norm": 1.1817644834518433, "learning_rate": 0.00025404172099087355, "loss": 0.3472, "step": 47000 }, { "epoch": 3.870599739243807, "grad_norm": 1.0793942213058472, "learning_rate": 0.0002535528031290743, "loss": 0.3503, "step": 47500 }, { "epoch": 3.9113428943937416, "grad_norm": 0.8106686472892761, "learning_rate": 0.0002530638852672751, "loss": 0.3559, "step": 48000 }, { "epoch": 3.9520860495436767, "grad_norm": 0.779579758644104, "learning_rate": 0.00025257496740547586, "loss": 0.3284, "step": 48500 }, { "epoch": 3.9928292046936114, "grad_norm": 1.372778296470642, "learning_rate": 0.0002520860495436766, "loss": 0.3445, "step": 49000 }, { "epoch": 4.0, "eval_accuracy": 0.8333333134651184, "eval_loss": 0.46482688188552856, "eval_runtime": 4.3731, "eval_samples_per_second": 569.392, "eval_steps_per_second": 17.836, "step": 49088 }, { "epoch": 4.0335723598435465, "grad_norm": 0.6263651847839355, "learning_rate": 0.0002515971316818774, "loss": 0.3431, "step": 49500 }, { "epoch": 4.074315514993481, "grad_norm": 1.4384682178497314, "learning_rate": 0.0002511082138200782, "loss": 0.338, "step": 50000 }, { "epoch": 4.115058670143416, "grad_norm": 0.6402746438980103, "learning_rate": 0.000250619295958279, "loss": 0.3394, "step": 50500 }, { "epoch": 4.15580182529335, "grad_norm": 0.9597480893135071, "learning_rate": 0.0002501303780964798, "loss": 0.3325, "step": 51000 }, { "epoch": 4.196544980443286, "grad_norm": 1.1277621984481812, "learning_rate": 0.00024964146023468054, "loss": 0.3374, "step": 51500 }, { "epoch": 4.237288135593221, "grad_norm": 1.1778697967529297, "learning_rate": 0.00024915254237288135, "loss": 0.3409, "step": 52000 }, { "epoch": 4.278031290743155, "grad_norm": 1.0258188247680664, "learning_rate": 0.0002486636245110821, "loss": 0.3345, "step": 52500 }, { "epoch": 4.31877444589309, "grad_norm": 0.7597979307174683, "learning_rate": 0.0002481747066492829, "loss": 0.3424, "step": 53000 }, { "epoch": 4.3595176010430245, "grad_norm": 0.903914749622345, "learning_rate": 0.00024768578878748366, "loss": 0.3484, "step": 53500 }, { "epoch": 4.400260756192959, "grad_norm": 0.9775347709655762, "learning_rate": 0.00024719687092568447, "loss": 0.3428, "step": 54000 }, { "epoch": 4.441003911342895, "grad_norm": 0.5878586769104004, "learning_rate": 0.0002467079530638852, "loss": 0.3406, "step": 54500 }, { "epoch": 4.481747066492829, "grad_norm": 0.5842912197113037, "learning_rate": 0.00024621903520208603, "loss": 0.3421, "step": 55000 }, { "epoch": 4.522490221642764, "grad_norm": 1.2330615520477295, "learning_rate": 0.00024573011734028684, "loss": 0.3372, "step": 55500 }, { "epoch": 4.563233376792699, "grad_norm": 1.010168194770813, "learning_rate": 0.0002452411994784876, "loss": 0.3448, "step": 56000 }, { "epoch": 4.603976531942633, "grad_norm": 1.3469187021255493, "learning_rate": 0.0002447522816166884, "loss": 0.3469, "step": 56500 }, { "epoch": 4.644719687092568, "grad_norm": 1.6847138404846191, "learning_rate": 0.00024426336375488915, "loss": 0.3422, "step": 57000 }, { "epoch": 4.6854628422425035, "grad_norm": 1.3177459239959717, "learning_rate": 0.00024377444589308996, "loss": 0.3493, "step": 57500 }, { "epoch": 4.726205997392438, "grad_norm": 1.14200758934021, "learning_rate": 0.00024328552803129072, "loss": 0.3427, "step": 58000 }, { "epoch": 4.766949152542373, "grad_norm": 1.3425748348236084, "learning_rate": 0.0002427966101694915, "loss": 0.3385, "step": 58500 }, { "epoch": 4.8076923076923075, "grad_norm": 0.4200347363948822, "learning_rate": 0.0002423076923076923, "loss": 0.3533, "step": 59000 }, { "epoch": 4.848435462842242, "grad_norm": 1.5429445505142212, "learning_rate": 0.00024181877444589306, "loss": 0.3395, "step": 59500 }, { "epoch": 4.889178617992178, "grad_norm": 0.685479998588562, "learning_rate": 0.00024132985658409386, "loss": 0.3479, "step": 60000 }, { "epoch": 4.929921773142112, "grad_norm": 1.4265855550765991, "learning_rate": 0.00024084093872229462, "loss": 0.3371, "step": 60500 }, { "epoch": 4.970664928292047, "grad_norm": 0.7832224369049072, "learning_rate": 0.00024035202086049542, "loss": 0.3408, "step": 61000 }, { "epoch": 5.0, "eval_accuracy": 0.8301205039024353, "eval_loss": 0.4700787663459778, "eval_runtime": 4.3757, "eval_samples_per_second": 569.053, "eval_steps_per_second": 17.826, "step": 61360 }, { "epoch": 5.011408083441982, "grad_norm": 2.084968328475952, "learning_rate": 0.0002398631029986962, "loss": 0.3532, "step": 61500 }, { "epoch": 5.052151238591916, "grad_norm": 0.8430765271186829, "learning_rate": 0.00023937418513689696, "loss": 0.3373, "step": 62000 }, { "epoch": 5.092894393741851, "grad_norm": 1.119857907295227, "learning_rate": 0.00023888526727509777, "loss": 0.3365, "step": 62500 }, { "epoch": 5.1336375488917865, "grad_norm": 0.8389126658439636, "learning_rate": 0.00023839634941329855, "loss": 0.3373, "step": 63000 }, { "epoch": 5.174380704041721, "grad_norm": 1.1547911167144775, "learning_rate": 0.00023790743155149933, "loss": 0.3482, "step": 63500 }, { "epoch": 5.215123859191656, "grad_norm": 0.6252550482749939, "learning_rate": 0.0002374185136897001, "loss": 0.3306, "step": 64000 }, { "epoch": 5.25586701434159, "grad_norm": 0.5813185572624207, "learning_rate": 0.0002369295958279009, "loss": 0.3418, "step": 64500 }, { "epoch": 5.296610169491525, "grad_norm": 1.2481329441070557, "learning_rate": 0.00023644067796610167, "loss": 0.3383, "step": 65000 }, { "epoch": 5.337353324641461, "grad_norm": 0.9456360340118408, "learning_rate": 0.00023595176010430245, "loss": 0.3437, "step": 65500 }, { "epoch": 5.378096479791395, "grad_norm": 1.1316335201263428, "learning_rate": 0.00023546284224250326, "loss": 0.3379, "step": 66000 }, { "epoch": 5.41883963494133, "grad_norm": 1.5177615880966187, "learning_rate": 0.000234973924380704, "loss": 0.3405, "step": 66500 }, { "epoch": 5.459582790091265, "grad_norm": 0.9110828638076782, "learning_rate": 0.00023448500651890482, "loss": 0.3419, "step": 67000 }, { "epoch": 5.500325945241199, "grad_norm": 0.6725948452949524, "learning_rate": 0.0002339960886571056, "loss": 0.3435, "step": 67500 }, { "epoch": 5.541069100391134, "grad_norm": 0.9189430475234985, "learning_rate": 0.00023350717079530635, "loss": 0.3501, "step": 68000 }, { "epoch": 5.581812255541069, "grad_norm": 0.8479944467544556, "learning_rate": 0.00023301825293350716, "loss": 0.3374, "step": 68500 }, { "epoch": 5.622555410691004, "grad_norm": 1.310009241104126, "learning_rate": 0.0002325293350717079, "loss": 0.3444, "step": 69000 }, { "epoch": 5.663298565840939, "grad_norm": 0.7890832424163818, "learning_rate": 0.00023204041720990872, "loss": 0.3384, "step": 69500 }, { "epoch": 5.704041720990873, "grad_norm": 1.4012047052383423, "learning_rate": 0.0002315514993481095, "loss": 0.3392, "step": 70000 }, { "epoch": 5.744784876140808, "grad_norm": 0.954657793045044, "learning_rate": 0.0002310625814863103, "loss": 0.3508, "step": 70500 }, { "epoch": 5.7855280312907436, "grad_norm": 1.2328336238861084, "learning_rate": 0.00023057366362451106, "loss": 0.3445, "step": 71000 }, { "epoch": 5.826271186440678, "grad_norm": 0.6325139403343201, "learning_rate": 0.00023008474576271184, "loss": 0.3434, "step": 71500 }, { "epoch": 5.867014341590613, "grad_norm": 0.9840922951698303, "learning_rate": 0.00022959582790091262, "loss": 0.339, "step": 72000 }, { "epoch": 5.9077574967405475, "grad_norm": 2.137343168258667, "learning_rate": 0.0002291069100391134, "loss": 0.3405, "step": 72500 }, { "epoch": 5.948500651890482, "grad_norm": 1.1146010160446167, "learning_rate": 0.0002286179921773142, "loss": 0.3443, "step": 73000 }, { "epoch": 5.989243807040417, "grad_norm": 0.8428038954734802, "learning_rate": 0.00022812907431551496, "loss": 0.341, "step": 73500 }, { "epoch": 6.0, "eval_accuracy": 0.8309236764907837, "eval_loss": 0.46279776096343994, "eval_runtime": 4.3715, "eval_samples_per_second": 569.6, "eval_steps_per_second": 17.843, "step": 73632 }, { "epoch": 6.029986962190352, "grad_norm": 1.0361286401748657, "learning_rate": 0.00022764015645371577, "loss": 0.345, "step": 74000 }, { "epoch": 6.070730117340287, "grad_norm": 1.054185152053833, "learning_rate": 0.00022715123859191655, "loss": 0.3375, "step": 74500 }, { "epoch": 6.111473272490222, "grad_norm": 1.168609619140625, "learning_rate": 0.0002266623207301173, "loss": 0.3403, "step": 75000 }, { "epoch": 6.152216427640156, "grad_norm": 1.6678701639175415, "learning_rate": 0.0002261734028683181, "loss": 0.332, "step": 75500 }, { "epoch": 6.192959582790091, "grad_norm": 1.7210898399353027, "learning_rate": 0.0002256844850065189, "loss": 0.3428, "step": 76000 }, { "epoch": 6.2337027379400265, "grad_norm": 1.0390726327896118, "learning_rate": 0.00022519556714471967, "loss": 0.3524, "step": 76500 }, { "epoch": 6.274445893089961, "grad_norm": 1.272202968597412, "learning_rate": 0.00022470664928292045, "loss": 0.3429, "step": 77000 }, { "epoch": 6.315189048239896, "grad_norm": 1.1143054962158203, "learning_rate": 0.0002242177314211212, "loss": 0.3351, "step": 77500 }, { "epoch": 6.3559322033898304, "grad_norm": 0.7911216020584106, "learning_rate": 0.000223728813559322, "loss": 0.3443, "step": 78000 }, { "epoch": 6.396675358539765, "grad_norm": 1.2170277833938599, "learning_rate": 0.0002232398956975228, "loss": 0.3471, "step": 78500 }, { "epoch": 6.4374185136897, "grad_norm": 1.1739459037780762, "learning_rate": 0.0002227509778357236, "loss": 0.345, "step": 79000 }, { "epoch": 6.478161668839635, "grad_norm": 0.8011351227760315, "learning_rate": 0.00022226205997392435, "loss": 0.345, "step": 79500 }, { "epoch": 6.51890482398957, "grad_norm": 1.370266079902649, "learning_rate": 0.00022177314211212516, "loss": 0.338, "step": 80000 }, { "epoch": 6.559647979139505, "grad_norm": 0.9215847253799438, "learning_rate": 0.00022128422425032591, "loss": 0.341, "step": 80500 }, { "epoch": 6.600391134289439, "grad_norm": 1.4309296607971191, "learning_rate": 0.0002207953063885267, "loss": 0.3418, "step": 81000 }, { "epoch": 6.641134289439374, "grad_norm": 1.3522855043411255, "learning_rate": 0.0002203063885267275, "loss": 0.3408, "step": 81500 }, { "epoch": 6.681877444589309, "grad_norm": 1.0192161798477173, "learning_rate": 0.00021981747066492826, "loss": 0.3458, "step": 82000 }, { "epoch": 6.722620599739244, "grad_norm": 0.7705215215682983, "learning_rate": 0.00021932855280312906, "loss": 0.3343, "step": 82500 }, { "epoch": 6.763363754889179, "grad_norm": 1.3735997676849365, "learning_rate": 0.00021883963494132984, "loss": 0.3412, "step": 83000 }, { "epoch": 6.804106910039113, "grad_norm": 1.2233409881591797, "learning_rate": 0.00021835071707953062, "loss": 0.3464, "step": 83500 }, { "epoch": 6.844850065189048, "grad_norm": 0.8329532146453857, "learning_rate": 0.0002178617992177314, "loss": 0.3303, "step": 84000 }, { "epoch": 6.885593220338983, "grad_norm": 0.7415503859519958, "learning_rate": 0.00021737288135593218, "loss": 0.3304, "step": 84500 }, { "epoch": 6.926336375488918, "grad_norm": 1.0423117876052856, "learning_rate": 0.00021688396349413296, "loss": 0.3334, "step": 85000 }, { "epoch": 6.967079530638853, "grad_norm": 1.0962018966674805, "learning_rate": 0.00021639504563233374, "loss": 0.3275, "step": 85500 }, { "epoch": 7.0, "eval_accuracy": 0.8281124234199524, "eval_loss": 0.47004473209381104, "eval_runtime": 4.3889, "eval_samples_per_second": 567.345, "eval_steps_per_second": 17.772, "step": 85904 }, { "epoch": 7.0078226857887875, "grad_norm": 1.4891513586044312, "learning_rate": 0.00021590612777053455, "loss": 0.3348, "step": 86000 }, { "epoch": 7.048565840938722, "grad_norm": 0.7976107597351074, "learning_rate": 0.0002154172099087353, "loss": 0.3291, "step": 86500 }, { "epoch": 7.089308996088657, "grad_norm": 0.7943911552429199, "learning_rate": 0.0002149282920469361, "loss": 0.3475, "step": 87000 }, { "epoch": 7.130052151238592, "grad_norm": 1.1666449308395386, "learning_rate": 0.0002144393741851369, "loss": 0.3374, "step": 87500 }, { "epoch": 7.170795306388527, "grad_norm": 1.2365397214889526, "learning_rate": 0.00021395045632333765, "loss": 0.3424, "step": 88000 }, { "epoch": 7.211538461538462, "grad_norm": 0.6526191234588623, "learning_rate": 0.00021346153846153845, "loss": 0.3395, "step": 88500 }, { "epoch": 7.252281616688396, "grad_norm": 0.7512989640235901, "learning_rate": 0.0002129726205997392, "loss": 0.3425, "step": 89000 }, { "epoch": 7.293024771838331, "grad_norm": 1.1799145936965942, "learning_rate": 0.00021248370273794002, "loss": 0.3447, "step": 89500 }, { "epoch": 7.333767926988266, "grad_norm": 0.4205518364906311, "learning_rate": 0.0002119947848761408, "loss": 0.3405, "step": 90000 }, { "epoch": 7.374511082138201, "grad_norm": 1.0276716947555542, "learning_rate": 0.00021150586701434155, "loss": 0.341, "step": 90500 }, { "epoch": 7.415254237288136, "grad_norm": 0.9434635639190674, "learning_rate": 0.00021101694915254236, "loss": 0.3276, "step": 91000 }, { "epoch": 7.4559973924380705, "grad_norm": 0.9180013537406921, "learning_rate": 0.00021052803129074314, "loss": 0.3399, "step": 91500 }, { "epoch": 7.496740547588005, "grad_norm": 0.6694702506065369, "learning_rate": 0.00021003911342894392, "loss": 0.3361, "step": 92000 }, { "epoch": 7.53748370273794, "grad_norm": 0.8895887136459351, "learning_rate": 0.0002095501955671447, "loss": 0.3367, "step": 92500 }, { "epoch": 7.578226857887875, "grad_norm": 1.0765182971954346, "learning_rate": 0.0002090612777053455, "loss": 0.3411, "step": 93000 }, { "epoch": 7.61897001303781, "grad_norm": 0.9427884817123413, "learning_rate": 0.00020857235984354626, "loss": 0.3437, "step": 93500 }, { "epoch": 7.659713168187745, "grad_norm": 1.327610969543457, "learning_rate": 0.00020808344198174704, "loss": 0.3268, "step": 94000 }, { "epoch": 7.700456323337679, "grad_norm": 1.140802025794983, "learning_rate": 0.00020759452411994785, "loss": 0.3381, "step": 94500 }, { "epoch": 7.741199478487614, "grad_norm": 0.8230358958244324, "learning_rate": 0.0002071056062581486, "loss": 0.3427, "step": 95000 }, { "epoch": 7.781942633637549, "grad_norm": 1.5203386545181274, "learning_rate": 0.0002066166883963494, "loss": 0.3445, "step": 95500 }, { "epoch": 7.822685788787483, "grad_norm": 1.507284164428711, "learning_rate": 0.0002061277705345502, "loss": 0.3368, "step": 96000 }, { "epoch": 7.863428943937419, "grad_norm": 1.3980096578598022, "learning_rate": 0.00020563885267275097, "loss": 0.3363, "step": 96500 }, { "epoch": 7.904172099087353, "grad_norm": 1.315346598625183, "learning_rate": 0.00020514993481095175, "loss": 0.3437, "step": 97000 }, { "epoch": 7.944915254237288, "grad_norm": 1.565090298652649, "learning_rate": 0.0002046610169491525, "loss": 0.3423, "step": 97500 }, { "epoch": 7.985658409387223, "grad_norm": 1.2532190084457397, "learning_rate": 0.0002041720990873533, "loss": 0.3455, "step": 98000 }, { "epoch": 8.0, "eval_accuracy": 0.8305220603942871, "eval_loss": 0.4571722447872162, "eval_runtime": 4.3848, "eval_samples_per_second": 567.873, "eval_steps_per_second": 17.789, "step": 98176 }, { "epoch": 8.026401564537158, "grad_norm": 0.8833681344985962, "learning_rate": 0.0002036831812255541, "loss": 0.3407, "step": 98500 }, { "epoch": 8.067144719687093, "grad_norm": 1.2567716836929321, "learning_rate": 0.0002031942633637549, "loss": 0.3331, "step": 99000 }, { "epoch": 8.107887874837028, "grad_norm": 0.9826692342758179, "learning_rate": 0.00020270534550195565, "loss": 0.3333, "step": 99500 }, { "epoch": 8.148631029986962, "grad_norm": 1.838377594947815, "learning_rate": 0.00020221642764015643, "loss": 0.34, "step": 100000 }, { "epoch": 8.189374185136897, "grad_norm": 1.4154809713363647, "learning_rate": 0.0002017275097783572, "loss": 0.3342, "step": 100500 }, { "epoch": 8.230117340286832, "grad_norm": 0.9497839212417603, "learning_rate": 0.000201238591916558, "loss": 0.3432, "step": 101000 }, { "epoch": 8.270860495436766, "grad_norm": 0.8343673944473267, "learning_rate": 0.0002007496740547588, "loss": 0.345, "step": 101500 }, { "epoch": 8.3116036505867, "grad_norm": 1.2168527841567993, "learning_rate": 0.00020026075619295955, "loss": 0.3342, "step": 102000 }, { "epoch": 8.352346805736635, "grad_norm": 0.4551219642162323, "learning_rate": 0.00019977183833116036, "loss": 0.3384, "step": 102500 }, { "epoch": 8.393089960886572, "grad_norm": 0.5304436087608337, "learning_rate": 0.00019928292046936114, "loss": 0.3427, "step": 103000 }, { "epoch": 8.433833116036507, "grad_norm": 1.7117105722427368, "learning_rate": 0.0001987940026075619, "loss": 0.3356, "step": 103500 }, { "epoch": 8.474576271186441, "grad_norm": 1.4197205305099487, "learning_rate": 0.0001983050847457627, "loss": 0.3407, "step": 104000 }, { "epoch": 8.515319426336376, "grad_norm": 0.9327645897865295, "learning_rate": 0.00019781616688396345, "loss": 0.3394, "step": 104500 }, { "epoch": 8.55606258148631, "grad_norm": 0.79693204164505, "learning_rate": 0.00019732724902216426, "loss": 0.342, "step": 105000 }, { "epoch": 8.596805736636245, "grad_norm": 1.103314995765686, "learning_rate": 0.00019683833116036504, "loss": 0.3386, "step": 105500 }, { "epoch": 8.63754889178618, "grad_norm": 0.9111559987068176, "learning_rate": 0.00019634941329856585, "loss": 0.3371, "step": 106000 }, { "epoch": 8.678292046936114, "grad_norm": 0.854651153087616, "learning_rate": 0.0001958604954367666, "loss": 0.3524, "step": 106500 }, { "epoch": 8.719035202086049, "grad_norm": 0.692668616771698, "learning_rate": 0.00019537157757496738, "loss": 0.333, "step": 107000 }, { "epoch": 8.759778357235984, "grad_norm": 1.2376559972763062, "learning_rate": 0.00019488265971316816, "loss": 0.3371, "step": 107500 }, { "epoch": 8.800521512385918, "grad_norm": 0.6116431951522827, "learning_rate": 0.00019439374185136894, "loss": 0.3395, "step": 108000 }, { "epoch": 8.841264667535853, "grad_norm": 0.374656617641449, "learning_rate": 0.00019390482398956975, "loss": 0.3325, "step": 108500 }, { "epoch": 8.88200782268579, "grad_norm": 1.369214653968811, "learning_rate": 0.0001934159061277705, "loss": 0.3369, "step": 109000 }, { "epoch": 8.922750977835724, "grad_norm": 0.6732461452484131, "learning_rate": 0.0001929269882659713, "loss": 0.3352, "step": 109500 }, { "epoch": 8.963494132985659, "grad_norm": 0.48688676953315735, "learning_rate": 0.0001924380704041721, "loss": 0.3455, "step": 110000 }, { "epoch": 9.0, "eval_accuracy": 0.8321285247802734, "eval_loss": 0.47947317361831665, "eval_runtime": 4.3745, "eval_samples_per_second": 569.206, "eval_steps_per_second": 17.831, "step": 110448 }, { "epoch": 9.004237288135593, "grad_norm": 0.6902535557746887, "learning_rate": 0.00019194915254237285, "loss": 0.3302, "step": 110500 }, { "epoch": 9.044980443285528, "grad_norm": 0.3573263883590698, "learning_rate": 0.00019146023468057365, "loss": 0.3368, "step": 111000 }, { "epoch": 9.085723598435463, "grad_norm": 1.4043365716934204, "learning_rate": 0.00019097131681877443, "loss": 0.3356, "step": 111500 }, { "epoch": 9.126466753585397, "grad_norm": 0.6768821477890015, "learning_rate": 0.00019048239895697521, "loss": 0.3461, "step": 112000 }, { "epoch": 9.167209908735332, "grad_norm": 0.6116111278533936, "learning_rate": 0.000189993481095176, "loss": 0.3435, "step": 112500 }, { "epoch": 9.207953063885267, "grad_norm": 1.0555490255355835, "learning_rate": 0.00018950456323337675, "loss": 0.3367, "step": 113000 }, { "epoch": 9.248696219035201, "grad_norm": 1.1427255868911743, "learning_rate": 0.00018901564537157755, "loss": 0.3304, "step": 113500 }, { "epoch": 9.289439374185136, "grad_norm": 0.8634780645370483, "learning_rate": 0.00018852672750977834, "loss": 0.3392, "step": 114000 }, { "epoch": 9.330182529335072, "grad_norm": 0.6873227953910828, "learning_rate": 0.00018803780964797914, "loss": 0.347, "step": 114500 }, { "epoch": 9.370925684485007, "grad_norm": 0.634411633014679, "learning_rate": 0.0001875488917861799, "loss": 0.3327, "step": 115000 }, { "epoch": 9.411668839634942, "grad_norm": 0.8212053179740906, "learning_rate": 0.0001870599739243807, "loss": 0.3277, "step": 115500 }, { "epoch": 9.452411994784876, "grad_norm": 1.272691249847412, "learning_rate": 0.00018657105606258146, "loss": 0.3387, "step": 116000 }, { "epoch": 9.493155149934811, "grad_norm": 0.7070391774177551, "learning_rate": 0.00018608213820078224, "loss": 0.3324, "step": 116500 }, { "epoch": 9.533898305084746, "grad_norm": 0.8222519755363464, "learning_rate": 0.00018559322033898304, "loss": 0.3379, "step": 117000 }, { "epoch": 9.57464146023468, "grad_norm": 0.9582119584083557, "learning_rate": 0.0001851043024771838, "loss": 0.3261, "step": 117500 }, { "epoch": 9.615384615384615, "grad_norm": 0.5636938810348511, "learning_rate": 0.0001846153846153846, "loss": 0.3383, "step": 118000 }, { "epoch": 9.65612777053455, "grad_norm": 1.46113920211792, "learning_rate": 0.00018412646675358539, "loss": 0.3271, "step": 118500 }, { "epoch": 9.696870925684484, "grad_norm": 0.9348097443580627, "learning_rate": 0.00018363754889178617, "loss": 0.34, "step": 119000 }, { "epoch": 9.737614080834419, "grad_norm": 0.9395681023597717, "learning_rate": 0.00018314863102998695, "loss": 0.3339, "step": 119500 }, { "epoch": 9.778357235984355, "grad_norm": 0.8072255253791809, "learning_rate": 0.00018265971316818773, "loss": 0.3309, "step": 120000 }, { "epoch": 9.81910039113429, "grad_norm": 0.9370360374450684, "learning_rate": 0.0001821707953063885, "loss": 0.3307, "step": 120500 }, { "epoch": 9.859843546284225, "grad_norm": 1.0267516374588013, "learning_rate": 0.0001816818774445893, "loss": 0.3437, "step": 121000 }, { "epoch": 9.90058670143416, "grad_norm": 1.7827762365341187, "learning_rate": 0.0001811929595827901, "loss": 0.3397, "step": 121500 }, { "epoch": 9.941329856584094, "grad_norm": 1.2578929662704468, "learning_rate": 0.00018070404172099085, "loss": 0.3334, "step": 122000 }, { "epoch": 9.982073011734029, "grad_norm": 0.791420042514801, "learning_rate": 0.00018021512385919163, "loss": 0.3336, "step": 122500 }, { "epoch": 10.0, "eval_accuracy": 0.827309250831604, "eval_loss": 0.4596623182296753, "eval_runtime": 4.3919, "eval_samples_per_second": 566.949, "eval_steps_per_second": 17.76, "step": 122720 }, { "epoch": 10.022816166883963, "grad_norm": 1.7966073751449585, "learning_rate": 0.00017972620599739244, "loss": 0.3316, "step": 123000 }, { "epoch": 10.063559322033898, "grad_norm": 0.5609152913093567, "learning_rate": 0.0001792372881355932, "loss": 0.3394, "step": 123500 }, { "epoch": 10.104302477183833, "grad_norm": 1.34860360622406, "learning_rate": 0.000178748370273794, "loss": 0.3352, "step": 124000 }, { "epoch": 10.145045632333767, "grad_norm": 1.0258673429489136, "learning_rate": 0.00017825945241199475, "loss": 0.3222, "step": 124500 }, { "epoch": 10.185788787483702, "grad_norm": 0.9893328547477722, "learning_rate": 0.00017777053455019556, "loss": 0.3308, "step": 125000 }, { "epoch": 10.226531942633638, "grad_norm": 0.7048326134681702, "learning_rate": 0.00017728161668839634, "loss": 0.3291, "step": 125500 }, { "epoch": 10.267275097783573, "grad_norm": 1.129858136177063, "learning_rate": 0.0001767926988265971, "loss": 0.3281, "step": 126000 }, { "epoch": 10.308018252933508, "grad_norm": 0.575599193572998, "learning_rate": 0.0001763037809647979, "loss": 0.3426, "step": 126500 }, { "epoch": 10.348761408083442, "grad_norm": 0.5909539461135864, "learning_rate": 0.00017581486310299868, "loss": 0.3323, "step": 127000 }, { "epoch": 10.389504563233377, "grad_norm": 0.7857049703598022, "learning_rate": 0.00017532594524119946, "loss": 0.3319, "step": 127500 }, { "epoch": 10.430247718383312, "grad_norm": 0.623603880405426, "learning_rate": 0.00017483702737940024, "loss": 0.3411, "step": 128000 }, { "epoch": 10.470990873533246, "grad_norm": 0.5989457368850708, "learning_rate": 0.00017434810951760105, "loss": 0.3383, "step": 128500 }, { "epoch": 10.51173402868318, "grad_norm": 1.3368146419525146, "learning_rate": 0.0001738591916558018, "loss": 0.3214, "step": 129000 }, { "epoch": 10.552477183833116, "grad_norm": 1.2709711790084839, "learning_rate": 0.00017337027379400258, "loss": 0.3288, "step": 129500 }, { "epoch": 10.59322033898305, "grad_norm": 0.9253243207931519, "learning_rate": 0.0001728813559322034, "loss": 0.3487, "step": 130000 }, { "epoch": 10.633963494132985, "grad_norm": 1.2309492826461792, "learning_rate": 0.00017239243807040414, "loss": 0.3349, "step": 130500 }, { "epoch": 10.674706649282921, "grad_norm": 0.5883442759513855, "learning_rate": 0.00017190352020860495, "loss": 0.3395, "step": 131000 }, { "epoch": 10.715449804432856, "grad_norm": 1.26941978931427, "learning_rate": 0.00017141460234680573, "loss": 0.3287, "step": 131500 }, { "epoch": 10.75619295958279, "grad_norm": 1.5418081283569336, "learning_rate": 0.0001709256844850065, "loss": 0.333, "step": 132000 }, { "epoch": 10.796936114732725, "grad_norm": 0.4535035192966461, "learning_rate": 0.0001704367666232073, "loss": 0.3336, "step": 132500 }, { "epoch": 10.83767926988266, "grad_norm": 0.9787946939468384, "learning_rate": 0.00016994784876140804, "loss": 0.3367, "step": 133000 }, { "epoch": 10.878422425032594, "grad_norm": 1.6416429281234741, "learning_rate": 0.00016945893089960885, "loss": 0.323, "step": 133500 }, { "epoch": 10.91916558018253, "grad_norm": 1.3397718667984009, "learning_rate": 0.00016897001303780963, "loss": 0.3216, "step": 134000 }, { "epoch": 10.959908735332464, "grad_norm": 1.1053659915924072, "learning_rate": 0.00016848109517601044, "loss": 0.3317, "step": 134500 }, { "epoch": 11.0, "eval_accuracy": 0.8309236764907837, "eval_loss": 0.47164076566696167, "eval_runtime": 4.3904, "eval_samples_per_second": 567.144, "eval_steps_per_second": 17.766, "step": 134992 }, { "epoch": 11.000651890482398, "grad_norm": 0.971737802028656, "learning_rate": 0.0001679921773142112, "loss": 0.3428, "step": 135000 }, { "epoch": 11.041395045632333, "grad_norm": 0.875954270362854, "learning_rate": 0.00016750325945241197, "loss": 0.336, "step": 135500 }, { "epoch": 11.082138200782268, "grad_norm": 0.8182882070541382, "learning_rate": 0.00016701434159061275, "loss": 0.3288, "step": 136000 }, { "epoch": 11.122881355932204, "grad_norm": 0.7044540643692017, "learning_rate": 0.00016652542372881353, "loss": 0.3386, "step": 136500 }, { "epoch": 11.163624511082139, "grad_norm": 0.9757317304611206, "learning_rate": 0.00016603650586701434, "loss": 0.3299, "step": 137000 }, { "epoch": 11.204367666232073, "grad_norm": 1.359402060508728, "learning_rate": 0.0001655475880052151, "loss": 0.3193, "step": 137500 }, { "epoch": 11.245110821382008, "grad_norm": 0.8975515961647034, "learning_rate": 0.0001650586701434159, "loss": 0.3386, "step": 138000 }, { "epoch": 11.285853976531943, "grad_norm": 1.7459882497787476, "learning_rate": 0.00016456975228161668, "loss": 0.3276, "step": 138500 }, { "epoch": 11.326597131681877, "grad_norm": 0.5829682946205139, "learning_rate": 0.00016408083441981744, "loss": 0.3324, "step": 139000 }, { "epoch": 11.367340286831812, "grad_norm": 1.0699838399887085, "learning_rate": 0.00016359191655801824, "loss": 0.3313, "step": 139500 }, { "epoch": 11.408083441981747, "grad_norm": 0.8853030800819397, "learning_rate": 0.00016310299869621902, "loss": 0.3398, "step": 140000 }, { "epoch": 11.448826597131681, "grad_norm": 0.5064047574996948, "learning_rate": 0.0001626140808344198, "loss": 0.3429, "step": 140500 }, { "epoch": 11.489569752281616, "grad_norm": 1.093672752380371, "learning_rate": 0.00016212516297262058, "loss": 0.325, "step": 141000 }, { "epoch": 11.53031290743155, "grad_norm": 0.8363606929779053, "learning_rate": 0.0001616362451108214, "loss": 0.334, "step": 141500 }, { "epoch": 11.571056062581487, "grad_norm": 0.8333249688148499, "learning_rate": 0.00016114732724902215, "loss": 0.3292, "step": 142000 }, { "epoch": 11.611799217731422, "grad_norm": 1.1766796112060547, "learning_rate": 0.00016065840938722293, "loss": 0.3392, "step": 142500 }, { "epoch": 11.652542372881356, "grad_norm": 0.8827703595161438, "learning_rate": 0.00016016949152542373, "loss": 0.3317, "step": 143000 }, { "epoch": 11.693285528031291, "grad_norm": 0.5678303241729736, "learning_rate": 0.00015968057366362449, "loss": 0.3392, "step": 143500 }, { "epoch": 11.734028683181226, "grad_norm": 1.255846619606018, "learning_rate": 0.0001591916558018253, "loss": 0.3327, "step": 144000 }, { "epoch": 11.77477183833116, "grad_norm": 1.352667212486267, "learning_rate": 0.00015870273794002605, "loss": 0.3444, "step": 144500 }, { "epoch": 11.815514993481095, "grad_norm": 0.710926353931427, "learning_rate": 0.00015821382007822685, "loss": 0.337, "step": 145000 }, { "epoch": 11.85625814863103, "grad_norm": 1.4295541048049927, "learning_rate": 0.00015772490221642764, "loss": 0.3308, "step": 145500 }, { "epoch": 11.897001303780964, "grad_norm": 1.5580801963806152, "learning_rate": 0.0001572359843546284, "loss": 0.3167, "step": 146000 }, { "epoch": 11.937744458930899, "grad_norm": 0.529028058052063, "learning_rate": 0.0001567470664928292, "loss": 0.3285, "step": 146500 }, { "epoch": 11.978487614080834, "grad_norm": 0.7339088916778564, "learning_rate": 0.00015625814863102998, "loss": 0.33, "step": 147000 }, { "epoch": 12.0, "eval_accuracy": 0.8341365456581116, "eval_loss": 0.460530161857605, "eval_runtime": 4.3899, "eval_samples_per_second": 567.207, "eval_steps_per_second": 17.768, "step": 147264 }, { "epoch": 12.01923076923077, "grad_norm": 0.6954495310783386, "learning_rate": 0.00015576923076923076, "loss": 0.34, "step": 147500 }, { "epoch": 12.059973924380705, "grad_norm": 1.0655009746551514, "learning_rate": 0.00015528031290743154, "loss": 0.3267, "step": 148000 }, { "epoch": 12.10071707953064, "grad_norm": 1.7395973205566406, "learning_rate": 0.00015479139504563232, "loss": 0.3348, "step": 148500 }, { "epoch": 12.141460234680574, "grad_norm": 1.1938823461532593, "learning_rate": 0.0001543024771838331, "loss": 0.3219, "step": 149000 }, { "epoch": 12.182203389830509, "grad_norm": 0.9198336005210876, "learning_rate": 0.00015381355932203388, "loss": 0.3177, "step": 149500 }, { "epoch": 12.222946544980443, "grad_norm": 0.9190293550491333, "learning_rate": 0.00015332464146023469, "loss": 0.3286, "step": 150000 }, { "epoch": 12.263689700130378, "grad_norm": 1.413332462310791, "learning_rate": 0.00015283572359843544, "loss": 0.3277, "step": 150500 }, { "epoch": 12.304432855280313, "grad_norm": 1.3429940938949585, "learning_rate": 0.00015234680573663625, "loss": 0.3334, "step": 151000 }, { "epoch": 12.345176010430247, "grad_norm": 1.7244832515716553, "learning_rate": 0.00015185788787483703, "loss": 0.3343, "step": 151500 }, { "epoch": 12.385919165580182, "grad_norm": 0.8076227307319641, "learning_rate": 0.00015136897001303778, "loss": 0.3319, "step": 152000 }, { "epoch": 12.426662320730117, "grad_norm": 0.8272923827171326, "learning_rate": 0.0001508800521512386, "loss": 0.3289, "step": 152500 }, { "epoch": 12.467405475880053, "grad_norm": 0.8798028826713562, "learning_rate": 0.00015039113428943934, "loss": 0.3331, "step": 153000 }, { "epoch": 12.508148631029988, "grad_norm": 0.9912583231925964, "learning_rate": 0.00014990221642764015, "loss": 0.3337, "step": 153500 }, { "epoch": 12.548891786179922, "grad_norm": 0.6648354530334473, "learning_rate": 0.00014941329856584093, "loss": 0.3309, "step": 154000 }, { "epoch": 12.589634941329857, "grad_norm": 0.5760676860809326, "learning_rate": 0.0001489243807040417, "loss": 0.3285, "step": 154500 }, { "epoch": 12.630378096479792, "grad_norm": 0.7495837807655334, "learning_rate": 0.0001484354628422425, "loss": 0.3368, "step": 155000 }, { "epoch": 12.671121251629726, "grad_norm": 0.9119142293930054, "learning_rate": 0.00014794654498044327, "loss": 0.3264, "step": 155500 }, { "epoch": 12.711864406779661, "grad_norm": 1.2076982259750366, "learning_rate": 0.00014745762711864405, "loss": 0.3337, "step": 156000 }, { "epoch": 12.752607561929596, "grad_norm": 0.8092543482780457, "learning_rate": 0.00014696870925684483, "loss": 0.3269, "step": 156500 }, { "epoch": 12.79335071707953, "grad_norm": 1.0198373794555664, "learning_rate": 0.0001464797913950456, "loss": 0.3256, "step": 157000 }, { "epoch": 12.834093872229465, "grad_norm": 1.0884003639221191, "learning_rate": 0.0001459908735332464, "loss": 0.3257, "step": 157500 }, { "epoch": 12.8748370273794, "grad_norm": 1.0073093175888062, "learning_rate": 0.00014550195567144717, "loss": 0.3315, "step": 158000 }, { "epoch": 12.915580182529336, "grad_norm": 1.1232950687408447, "learning_rate": 0.00014501303780964798, "loss": 0.3292, "step": 158500 }, { "epoch": 12.95632333767927, "grad_norm": 0.5717695355415344, "learning_rate": 0.00014452411994784876, "loss": 0.3288, "step": 159000 }, { "epoch": 12.997066492829205, "grad_norm": 0.915634274482727, "learning_rate": 0.00014403520208604954, "loss": 0.3279, "step": 159500 }, { "epoch": 13.0, "eval_accuracy": 0.8265060186386108, "eval_loss": 0.48236921429634094, "eval_runtime": 4.3702, "eval_samples_per_second": 569.764, "eval_steps_per_second": 17.848, "step": 159536 }, { "epoch": 13.03780964797914, "grad_norm": 1.048601746559143, "learning_rate": 0.00014354628422425032, "loss": 0.3338, "step": 160000 }, { "epoch": 13.078552803129075, "grad_norm": 0.7594525814056396, "learning_rate": 0.0001430573663624511, "loss": 0.3275, "step": 160500 }, { "epoch": 13.11929595827901, "grad_norm": 0.9922093749046326, "learning_rate": 0.00014256844850065188, "loss": 0.3306, "step": 161000 }, { "epoch": 13.160039113428944, "grad_norm": 0.9724846482276917, "learning_rate": 0.00014207953063885266, "loss": 0.3252, "step": 161500 }, { "epoch": 13.200782268578878, "grad_norm": 0.9133873581886292, "learning_rate": 0.00014159061277705344, "loss": 0.3355, "step": 162000 }, { "epoch": 13.241525423728813, "grad_norm": 1.273905634880066, "learning_rate": 0.00014110169491525422, "loss": 0.3395, "step": 162500 }, { "epoch": 13.282268578878748, "grad_norm": 1.0346261262893677, "learning_rate": 0.000140612777053455, "loss": 0.3348, "step": 163000 }, { "epoch": 13.323011734028682, "grad_norm": 1.0091133117675781, "learning_rate": 0.00014012385919165578, "loss": 0.3245, "step": 163500 }, { "epoch": 13.363754889178619, "grad_norm": 0.878383457660675, "learning_rate": 0.00013963494132985656, "loss": 0.3327, "step": 164000 }, { "epoch": 13.404498044328554, "grad_norm": 1.2117908000946045, "learning_rate": 0.00013914602346805734, "loss": 0.3217, "step": 164500 }, { "epoch": 13.445241199478488, "grad_norm": 0.9608532190322876, "learning_rate": 0.00013865710560625815, "loss": 0.3274, "step": 165000 }, { "epoch": 13.485984354628423, "grad_norm": 1.069478988647461, "learning_rate": 0.00013816818774445893, "loss": 0.3292, "step": 165500 }, { "epoch": 13.526727509778357, "grad_norm": 0.9976648688316345, "learning_rate": 0.00013767926988265969, "loss": 0.3364, "step": 166000 }, { "epoch": 13.567470664928292, "grad_norm": 1.489258050918579, "learning_rate": 0.00013719035202086047, "loss": 0.3225, "step": 166500 }, { "epoch": 13.608213820078227, "grad_norm": 0.9805618524551392, "learning_rate": 0.00013670143415906127, "loss": 0.332, "step": 167000 }, { "epoch": 13.648956975228161, "grad_norm": 0.8128567934036255, "learning_rate": 0.00013621251629726205, "loss": 0.3303, "step": 167500 }, { "epoch": 13.689700130378096, "grad_norm": 0.8062760233879089, "learning_rate": 0.00013572359843546283, "loss": 0.3278, "step": 168000 }, { "epoch": 13.73044328552803, "grad_norm": 0.592583179473877, "learning_rate": 0.00013523468057366361, "loss": 0.3285, "step": 168500 }, { "epoch": 13.771186440677965, "grad_norm": 0.9302377104759216, "learning_rate": 0.0001347457627118644, "loss": 0.3321, "step": 169000 }, { "epoch": 13.811929595827902, "grad_norm": 1.5042191743850708, "learning_rate": 0.00013425684485006517, "loss": 0.3246, "step": 169500 }, { "epoch": 13.852672750977836, "grad_norm": 1.7152433395385742, "learning_rate": 0.00013376792698826596, "loss": 0.3256, "step": 170000 }, { "epoch": 13.893415906127771, "grad_norm": 1.5890763998031616, "learning_rate": 0.00013327900912646674, "loss": 0.3222, "step": 170500 }, { "epoch": 13.934159061277706, "grad_norm": 0.4347144663333893, "learning_rate": 0.00013279009126466752, "loss": 0.3399, "step": 171000 }, { "epoch": 13.97490221642764, "grad_norm": 0.8775588870048523, "learning_rate": 0.00013230117340286832, "loss": 0.323, "step": 171500 }, { "epoch": 14.0, "eval_accuracy": 0.828514039516449, "eval_loss": 0.4633678197860718, "eval_runtime": 4.3868, "eval_samples_per_second": 567.612, "eval_steps_per_second": 17.781, "step": 171808 }, { "epoch": 14.015645371577575, "grad_norm": 0.7800391912460327, "learning_rate": 0.0001318122555410691, "loss": 0.3366, "step": 172000 }, { "epoch": 14.05638852672751, "grad_norm": 0.5859417915344238, "learning_rate": 0.00013132333767926986, "loss": 0.3259, "step": 172500 }, { "epoch": 14.097131681877444, "grad_norm": 0.5710736513137817, "learning_rate": 0.00013083441981747064, "loss": 0.3154, "step": 173000 }, { "epoch": 14.137874837027379, "grad_norm": 0.9481617212295532, "learning_rate": 0.00013034550195567145, "loss": 0.3276, "step": 173500 }, { "epoch": 14.178617992177314, "grad_norm": 0.5363568067550659, "learning_rate": 0.00012985658409387223, "loss": 0.3233, "step": 174000 }, { "epoch": 14.219361147327248, "grad_norm": 0.9244194030761719, "learning_rate": 0.000129367666232073, "loss": 0.3408, "step": 174500 }, { "epoch": 14.260104302477185, "grad_norm": 0.5331242680549622, "learning_rate": 0.00012887874837027379, "loss": 0.3266, "step": 175000 }, { "epoch": 14.30084745762712, "grad_norm": 1.1348767280578613, "learning_rate": 0.00012838983050847457, "loss": 0.33, "step": 175500 }, { "epoch": 14.341590612777054, "grad_norm": 1.0903972387313843, "learning_rate": 0.00012790091264667535, "loss": 0.3337, "step": 176000 }, { "epoch": 14.382333767926989, "grad_norm": 0.8601579070091248, "learning_rate": 0.00012741199478487613, "loss": 0.3322, "step": 176500 }, { "epoch": 14.423076923076923, "grad_norm": 1.5855698585510254, "learning_rate": 0.0001269230769230769, "loss": 0.3293, "step": 177000 }, { "epoch": 14.463820078226858, "grad_norm": 0.652545154094696, "learning_rate": 0.0001264341590612777, "loss": 0.322, "step": 177500 }, { "epoch": 14.504563233376793, "grad_norm": 0.8449950218200684, "learning_rate": 0.00012594524119947847, "loss": 0.3229, "step": 178000 }, { "epoch": 14.545306388526727, "grad_norm": 1.0212146043777466, "learning_rate": 0.00012545632333767928, "loss": 0.3314, "step": 178500 }, { "epoch": 14.586049543676662, "grad_norm": 1.18307363986969, "learning_rate": 0.00012496740547588003, "loss": 0.3199, "step": 179000 }, { "epoch": 14.626792698826597, "grad_norm": 0.8406041264533997, "learning_rate": 0.0001244784876140808, "loss": 0.3224, "step": 179500 }, { "epoch": 14.667535853976531, "grad_norm": 1.1103544235229492, "learning_rate": 0.0001239895697522816, "loss": 0.3235, "step": 180000 }, { "epoch": 14.708279009126468, "grad_norm": 0.6465230584144592, "learning_rate": 0.0001235006518904824, "loss": 0.3277, "step": 180500 }, { "epoch": 14.749022164276402, "grad_norm": 0.8634164929389954, "learning_rate": 0.00012301173402868318, "loss": 0.3355, "step": 181000 }, { "epoch": 14.789765319426337, "grad_norm": 1.0167428255081177, "learning_rate": 0.00012252281616688396, "loss": 0.3188, "step": 181500 }, { "epoch": 14.830508474576272, "grad_norm": 1.4421719312667847, "learning_rate": 0.00012203389830508474, "loss": 0.3403, "step": 182000 }, { "epoch": 14.871251629726206, "grad_norm": 0.5863448977470398, "learning_rate": 0.0001215449804432855, "loss": 0.3354, "step": 182500 }, { "epoch": 14.911994784876141, "grad_norm": 1.3666949272155762, "learning_rate": 0.0001210560625814863, "loss": 0.336, "step": 183000 }, { "epoch": 14.952737940026076, "grad_norm": 0.9059769511222839, "learning_rate": 0.00012056714471968708, "loss": 0.3282, "step": 183500 }, { "epoch": 14.99348109517601, "grad_norm": 0.7121485471725464, "learning_rate": 0.00012007822685788786, "loss": 0.3276, "step": 184000 }, { "epoch": 15.0, "eval_accuracy": 0.8305220603942871, "eval_loss": 0.4875960946083069, "eval_runtime": 4.3921, "eval_samples_per_second": 566.933, "eval_steps_per_second": 17.759, "step": 184080 }, { "epoch": 15.034224250325945, "grad_norm": 1.759994626045227, "learning_rate": 0.00011958930899608865, "loss": 0.3229, "step": 184500 }, { "epoch": 15.07496740547588, "grad_norm": 0.9209038615226746, "learning_rate": 0.00011910039113428943, "loss": 0.3313, "step": 185000 }, { "epoch": 15.115710560625814, "grad_norm": 0.9130190014839172, "learning_rate": 0.0001186114732724902, "loss": 0.3219, "step": 185500 }, { "epoch": 15.156453715775749, "grad_norm": 1.3336000442504883, "learning_rate": 0.000118122555410691, "loss": 0.3276, "step": 186000 }, { "epoch": 15.197196870925685, "grad_norm": 0.8584662079811096, "learning_rate": 0.00011763363754889178, "loss": 0.3264, "step": 186500 }, { "epoch": 15.23794002607562, "grad_norm": 0.6862226724624634, "learning_rate": 0.00011714471968709256, "loss": 0.334, "step": 187000 }, { "epoch": 15.278683181225555, "grad_norm": 1.0738499164581299, "learning_rate": 0.00011665580182529335, "loss": 0.3245, "step": 187500 }, { "epoch": 15.31942633637549, "grad_norm": 1.630626916885376, "learning_rate": 0.00011616688396349413, "loss": 0.3158, "step": 188000 }, { "epoch": 15.360169491525424, "grad_norm": 0.887387752532959, "learning_rate": 0.00011567796610169491, "loss": 0.3363, "step": 188500 }, { "epoch": 15.400912646675359, "grad_norm": 0.6818490624427795, "learning_rate": 0.00011518904823989568, "loss": 0.3306, "step": 189000 }, { "epoch": 15.441655801825293, "grad_norm": 0.9173839688301086, "learning_rate": 0.00011470013037809647, "loss": 0.3343, "step": 189500 }, { "epoch": 15.482398956975228, "grad_norm": 0.7407472133636475, "learning_rate": 0.00011421121251629725, "loss": 0.3241, "step": 190000 }, { "epoch": 15.523142112125162, "grad_norm": 0.8950464129447937, "learning_rate": 0.00011372229465449803, "loss": 0.3384, "step": 190500 }, { "epoch": 15.563885267275097, "grad_norm": 0.8162828087806702, "learning_rate": 0.00011323337679269883, "loss": 0.3295, "step": 191000 }, { "epoch": 15.604628422425032, "grad_norm": 1.3588676452636719, "learning_rate": 0.0001127444589308996, "loss": 0.3259, "step": 191500 }, { "epoch": 15.645371577574968, "grad_norm": 1.1393864154815674, "learning_rate": 0.00011225554106910037, "loss": 0.3238, "step": 192000 }, { "epoch": 15.686114732724903, "grad_norm": 0.7090200781822205, "learning_rate": 0.00011176662320730115, "loss": 0.3252, "step": 192500 }, { "epoch": 15.726857887874838, "grad_norm": 1.565515398979187, "learning_rate": 0.00011127770534550195, "loss": 0.3332, "step": 193000 }, { "epoch": 15.767601043024772, "grad_norm": 1.2529878616333008, "learning_rate": 0.00011078878748370273, "loss": 0.3202, "step": 193500 }, { "epoch": 15.808344198174707, "grad_norm": 0.6468512415885925, "learning_rate": 0.00011029986962190351, "loss": 0.3232, "step": 194000 }, { "epoch": 15.849087353324641, "grad_norm": 0.9099129438400269, "learning_rate": 0.0001098109517601043, "loss": 0.3272, "step": 194500 }, { "epoch": 15.889830508474576, "grad_norm": 1.2629077434539795, "learning_rate": 0.00010932203389830507, "loss": 0.3259, "step": 195000 }, { "epoch": 15.93057366362451, "grad_norm": 0.9049485921859741, "learning_rate": 0.00010883311603650585, "loss": 0.3288, "step": 195500 }, { "epoch": 15.971316818774445, "grad_norm": 1.0501422882080078, "learning_rate": 0.00010834419817470663, "loss": 0.3254, "step": 196000 }, { "epoch": 16.0, "eval_accuracy": 0.8297188878059387, "eval_loss": 0.46579205989837646, "eval_runtime": 4.3789, "eval_samples_per_second": 568.632, "eval_steps_per_second": 17.813, "step": 196352 }, { "epoch": 16.01205997392438, "grad_norm": 0.9034555554389954, "learning_rate": 0.00010785528031290742, "loss": 0.3223, "step": 196500 }, { "epoch": 16.052803129074317, "grad_norm": 0.5861558318138123, "learning_rate": 0.0001073663624511082, "loss": 0.3255, "step": 197000 }, { "epoch": 16.09354628422425, "grad_norm": 1.5219391584396362, "learning_rate": 0.00010687744458930898, "loss": 0.3214, "step": 197500 }, { "epoch": 16.134289439374186, "grad_norm": 1.1629459857940674, "learning_rate": 0.00010638852672750978, "loss": 0.3252, "step": 198000 }, { "epoch": 16.17503259452412, "grad_norm": 1.6555060148239136, "learning_rate": 0.00010589960886571055, "loss": 0.3254, "step": 198500 }, { "epoch": 16.215775749674055, "grad_norm": 0.6523577570915222, "learning_rate": 0.00010541069100391133, "loss": 0.3314, "step": 199000 }, { "epoch": 16.256518904823988, "grad_norm": 1.456127405166626, "learning_rate": 0.00010492177314211212, "loss": 0.3255, "step": 199500 }, { "epoch": 16.297262059973924, "grad_norm": 0.7522153854370117, "learning_rate": 0.0001044328552803129, "loss": 0.3361, "step": 200000 }, { "epoch": 16.33800521512386, "grad_norm": 0.6135265827178955, "learning_rate": 0.00010394393741851368, "loss": 0.3207, "step": 200500 }, { "epoch": 16.378748370273794, "grad_norm": 1.0303337574005127, "learning_rate": 0.00010345501955671447, "loss": 0.3254, "step": 201000 }, { "epoch": 16.41949152542373, "grad_norm": 1.0342708826065063, "learning_rate": 0.00010296610169491524, "loss": 0.3197, "step": 201500 }, { "epoch": 16.460234680573663, "grad_norm": 1.210872769355774, "learning_rate": 0.00010247718383311602, "loss": 0.3266, "step": 202000 }, { "epoch": 16.5009778357236, "grad_norm": 0.6689710021018982, "learning_rate": 0.0001019882659713168, "loss": 0.3238, "step": 202500 }, { "epoch": 16.541720990873532, "grad_norm": 0.9645212292671204, "learning_rate": 0.0001014993481095176, "loss": 0.3235, "step": 203000 }, { "epoch": 16.58246414602347, "grad_norm": 1.419161081314087, "learning_rate": 0.00010101043024771838, "loss": 0.3344, "step": 203500 }, { "epoch": 16.6232073011734, "grad_norm": 0.865119457244873, "learning_rate": 0.00010052151238591916, "loss": 0.318, "step": 204000 }, { "epoch": 16.663950456323338, "grad_norm": 0.9382394552230835, "learning_rate": 0.00010003259452411995, "loss": 0.3249, "step": 204500 }, { "epoch": 16.70469361147327, "grad_norm": 2.3167665004730225, "learning_rate": 9.954367666232072e-05, "loss": 0.3269, "step": 205000 }, { "epoch": 16.745436766623207, "grad_norm": 0.7869550585746765, "learning_rate": 9.90547588005215e-05, "loss": 0.3335, "step": 205500 }, { "epoch": 16.786179921773144, "grad_norm": 0.726779580116272, "learning_rate": 9.856584093872228e-05, "loss": 0.3257, "step": 206000 }, { "epoch": 16.826923076923077, "grad_norm": 1.0765122175216675, "learning_rate": 9.807692307692307e-05, "loss": 0.3226, "step": 206500 }, { "epoch": 16.867666232073013, "grad_norm": 1.3420048952102661, "learning_rate": 9.758800521512385e-05, "loss": 0.3432, "step": 207000 }, { "epoch": 16.908409387222946, "grad_norm": 1.4477566480636597, "learning_rate": 9.709908735332463e-05, "loss": 0.3311, "step": 207500 }, { "epoch": 16.949152542372882, "grad_norm": 0.8946223855018616, "learning_rate": 9.661016949152541e-05, "loss": 0.3261, "step": 208000 }, { "epoch": 16.989895697522815, "grad_norm": 0.7570343017578125, "learning_rate": 9.61212516297262e-05, "loss": 0.3349, "step": 208500 }, { "epoch": 17.0, "eval_accuracy": 0.8297188878059387, "eval_loss": 0.47132444381713867, "eval_runtime": 4.4001, "eval_samples_per_second": 565.899, "eval_steps_per_second": 17.727, "step": 208624 }, { "epoch": 17.03063885267275, "grad_norm": 1.1835086345672607, "learning_rate": 9.563233376792697e-05, "loss": 0.3163, "step": 209000 }, { "epoch": 17.071382007822685, "grad_norm": 0.8018409013748169, "learning_rate": 9.514341590612777e-05, "loss": 0.3243, "step": 209500 }, { "epoch": 17.11212516297262, "grad_norm": 0.5500791668891907, "learning_rate": 9.465449804432855e-05, "loss": 0.3307, "step": 210000 }, { "epoch": 17.152868318122554, "grad_norm": 0.5799237489700317, "learning_rate": 9.416558018252933e-05, "loss": 0.3219, "step": 210500 }, { "epoch": 17.19361147327249, "grad_norm": 1.325828194618225, "learning_rate": 9.367666232073012e-05, "loss": 0.3343, "step": 211000 }, { "epoch": 17.234354628422427, "grad_norm": 0.8172540068626404, "learning_rate": 9.318774445893089e-05, "loss": 0.3242, "step": 211500 }, { "epoch": 17.27509778357236, "grad_norm": 0.7740733027458191, "learning_rate": 9.269882659713167e-05, "loss": 0.3235, "step": 212000 }, { "epoch": 17.315840938722296, "grad_norm": 0.7501022219657898, "learning_rate": 9.220990873533245e-05, "loss": 0.3276, "step": 212500 }, { "epoch": 17.35658409387223, "grad_norm": 0.9696376919746399, "learning_rate": 9.172099087353324e-05, "loss": 0.3328, "step": 213000 }, { "epoch": 17.397327249022165, "grad_norm": 1.0730565786361694, "learning_rate": 9.123207301173403e-05, "loss": 0.3125, "step": 213500 }, { "epoch": 17.438070404172098, "grad_norm": 0.5296037197113037, "learning_rate": 9.07431551499348e-05, "loss": 0.3296, "step": 214000 }, { "epoch": 17.478813559322035, "grad_norm": 1.0855183601379395, "learning_rate": 9.025423728813557e-05, "loss": 0.3302, "step": 214500 }, { "epoch": 17.519556714471967, "grad_norm": 1.0939055681228638, "learning_rate": 8.976531942633637e-05, "loss": 0.3355, "step": 215000 }, { "epoch": 17.560299869621904, "grad_norm": 0.5262224078178406, "learning_rate": 8.927640156453715e-05, "loss": 0.3295, "step": 215500 }, { "epoch": 17.601043024771837, "grad_norm": 0.7847169637680054, "learning_rate": 8.878748370273793e-05, "loss": 0.3328, "step": 216000 }, { "epoch": 17.641786179921773, "grad_norm": 0.9434344172477722, "learning_rate": 8.829856584093872e-05, "loss": 0.3322, "step": 216500 }, { "epoch": 17.68252933507171, "grad_norm": 1.4078611135482788, "learning_rate": 8.78096479791395e-05, "loss": 0.3187, "step": 217000 }, { "epoch": 17.723272490221643, "grad_norm": 1.1798319816589355, "learning_rate": 8.732073011734028e-05, "loss": 0.3317, "step": 217500 }, { "epoch": 17.76401564537158, "grad_norm": 0.8744020462036133, "learning_rate": 8.683181225554106e-05, "loss": 0.3283, "step": 218000 }, { "epoch": 17.804758800521512, "grad_norm": 0.4589689075946808, "learning_rate": 8.634289439374184e-05, "loss": 0.3385, "step": 218500 }, { "epoch": 17.84550195567145, "grad_norm": 0.6915792226791382, "learning_rate": 8.585397653194262e-05, "loss": 0.3337, "step": 219000 }, { "epoch": 17.88624511082138, "grad_norm": 0.8095298409461975, "learning_rate": 8.536505867014342e-05, "loss": 0.3311, "step": 219500 }, { "epoch": 17.926988265971318, "grad_norm": 1.077532172203064, "learning_rate": 8.48761408083442e-05, "loss": 0.3244, "step": 220000 }, { "epoch": 17.96773142112125, "grad_norm": 0.9005799889564514, "learning_rate": 8.438722294654498e-05, "loss": 0.334, "step": 220500 }, { "epoch": 18.0, "eval_accuracy": 0.828514039516449, "eval_loss": 0.47592833638191223, "eval_runtime": 4.3941, "eval_samples_per_second": 566.671, "eval_steps_per_second": 17.751, "step": 220896 }, { "epoch": 18.008474576271187, "grad_norm": 0.985079824924469, "learning_rate": 8.389830508474574e-05, "loss": 0.333, "step": 221000 }, { "epoch": 18.04921773142112, "grad_norm": 0.5659169554710388, "learning_rate": 8.340938722294654e-05, "loss": 0.3264, "step": 221500 }, { "epoch": 18.089960886571056, "grad_norm": 0.9319465160369873, "learning_rate": 8.292046936114732e-05, "loss": 0.3227, "step": 222000 }, { "epoch": 18.130704041720993, "grad_norm": 0.5947886109352112, "learning_rate": 8.24315514993481e-05, "loss": 0.3338, "step": 222500 }, { "epoch": 18.171447196870925, "grad_norm": 1.0374130010604858, "learning_rate": 8.194263363754889e-05, "loss": 0.3353, "step": 223000 }, { "epoch": 18.212190352020862, "grad_norm": 0.5126400589942932, "learning_rate": 8.145371577574967e-05, "loss": 0.3276, "step": 223500 }, { "epoch": 18.252933507170795, "grad_norm": 0.9022419452667236, "learning_rate": 8.096479791395044e-05, "loss": 0.3312, "step": 224000 }, { "epoch": 18.29367666232073, "grad_norm": 0.836788535118103, "learning_rate": 8.047588005215122e-05, "loss": 0.3338, "step": 224500 }, { "epoch": 18.334419817470664, "grad_norm": 0.8851606845855713, "learning_rate": 7.998696219035201e-05, "loss": 0.325, "step": 225000 }, { "epoch": 18.3751629726206, "grad_norm": 0.8381347060203552, "learning_rate": 7.94980443285528e-05, "loss": 0.3364, "step": 225500 }, { "epoch": 18.415906127770533, "grad_norm": 0.8798679709434509, "learning_rate": 7.900912646675358e-05, "loss": 0.3379, "step": 226000 }, { "epoch": 18.45664928292047, "grad_norm": 1.894896388053894, "learning_rate": 7.852020860495437e-05, "loss": 0.3282, "step": 226500 }, { "epoch": 18.497392438070403, "grad_norm": 0.8871601819992065, "learning_rate": 7.803129074315515e-05, "loss": 0.3345, "step": 227000 }, { "epoch": 18.53813559322034, "grad_norm": 0.7834600210189819, "learning_rate": 7.754237288135592e-05, "loss": 0.3355, "step": 227500 }, { "epoch": 18.578878748370272, "grad_norm": 0.6276616454124451, "learning_rate": 7.70534550195567e-05, "loss": 0.3283, "step": 228000 }, { "epoch": 18.61962190352021, "grad_norm": 0.6137419939041138, "learning_rate": 7.656453715775749e-05, "loss": 0.339, "step": 228500 }, { "epoch": 18.660365058670145, "grad_norm": 1.0490261316299438, "learning_rate": 7.607561929595827e-05, "loss": 0.3448, "step": 229000 }, { "epoch": 18.701108213820078, "grad_norm": 1.4338812828063965, "learning_rate": 7.558670143415905e-05, "loss": 0.3292, "step": 229500 }, { "epoch": 18.741851368970014, "grad_norm": 1.796556830406189, "learning_rate": 7.509778357235985e-05, "loss": 0.3315, "step": 230000 }, { "epoch": 18.782594524119947, "grad_norm": 1.110555648803711, "learning_rate": 7.460886571056063e-05, "loss": 0.3247, "step": 230500 }, { "epoch": 18.823337679269883, "grad_norm": 1.186569094657898, "learning_rate": 7.41199478487614e-05, "loss": 0.3425, "step": 231000 }, { "epoch": 18.864080834419816, "grad_norm": 1.5170689821243286, "learning_rate": 7.363102998696219e-05, "loss": 0.3387, "step": 231500 }, { "epoch": 18.904823989569753, "grad_norm": 1.5822056531906128, "learning_rate": 7.314211212516297e-05, "loss": 0.3297, "step": 232000 }, { "epoch": 18.945567144719686, "grad_norm": 2.036545515060425, "learning_rate": 7.265319426336375e-05, "loss": 0.3214, "step": 232500 }, { "epoch": 18.986310299869622, "grad_norm": 1.2409448623657227, "learning_rate": 7.216427640156453e-05, "loss": 0.3319, "step": 233000 }, { "epoch": 19.0, "eval_accuracy": 0.8321285247802734, "eval_loss": 0.46636953949928284, "eval_runtime": 4.3952, "eval_samples_per_second": 566.521, "eval_steps_per_second": 17.746, "step": 233168 }, { "epoch": 19.02705345501956, "grad_norm": 0.7545664310455322, "learning_rate": 7.167535853976531e-05, "loss": 0.3396, "step": 233500 }, { "epoch": 19.06779661016949, "grad_norm": 0.8894630074501038, "learning_rate": 7.11864406779661e-05, "loss": 0.3408, "step": 234000 }, { "epoch": 19.108539765319428, "grad_norm": 0.8920040726661682, "learning_rate": 7.069752281616687e-05, "loss": 0.3314, "step": 234500 }, { "epoch": 19.14928292046936, "grad_norm": 1.6172934770584106, "learning_rate": 7.020860495436766e-05, "loss": 0.3239, "step": 235000 }, { "epoch": 19.190026075619297, "grad_norm": 1.1727477312088013, "learning_rate": 6.971968709256844e-05, "loss": 0.3316, "step": 235500 }, { "epoch": 19.23076923076923, "grad_norm": 0.7549859881401062, "learning_rate": 6.923076923076922e-05, "loss": 0.3374, "step": 236000 }, { "epoch": 19.271512385919166, "grad_norm": 0.7747101187705994, "learning_rate": 6.874185136897e-05, "loss": 0.3369, "step": 236500 }, { "epoch": 19.3122555410691, "grad_norm": 1.9304721355438232, "learning_rate": 6.825293350717078e-05, "loss": 0.3481, "step": 237000 }, { "epoch": 19.352998696219036, "grad_norm": 1.1737693548202515, "learning_rate": 6.776401564537158e-05, "loss": 0.3386, "step": 237500 }, { "epoch": 19.39374185136897, "grad_norm": 1.0052913427352905, "learning_rate": 6.727509778357235e-05, "loss": 0.3353, "step": 238000 }, { "epoch": 19.434485006518905, "grad_norm": 0.7874491810798645, "learning_rate": 6.678617992177314e-05, "loss": 0.3378, "step": 238500 }, { "epoch": 19.475228161668838, "grad_norm": 0.8934744596481323, "learning_rate": 6.629726205997392e-05, "loss": 0.3461, "step": 239000 }, { "epoch": 19.515971316818774, "grad_norm": 0.8448681831359863, "learning_rate": 6.58083441981747e-05, "loss": 0.3327, "step": 239500 }, { "epoch": 19.55671447196871, "grad_norm": 0.9731137752532959, "learning_rate": 6.531942633637548e-05, "loss": 0.3474, "step": 240000 }, { "epoch": 19.597457627118644, "grad_norm": 1.2357642650604248, "learning_rate": 6.483050847457627e-05, "loss": 0.339, "step": 240500 }, { "epoch": 19.63820078226858, "grad_norm": 0.9630109667778015, "learning_rate": 6.434159061277704e-05, "loss": 0.3454, "step": 241000 }, { "epoch": 19.678943937418513, "grad_norm": 0.7776056528091431, "learning_rate": 6.385267275097784e-05, "loss": 0.3345, "step": 241500 }, { "epoch": 19.71968709256845, "grad_norm": 0.7007092237472534, "learning_rate": 6.336375488917862e-05, "loss": 0.341, "step": 242000 }, { "epoch": 19.760430247718382, "grad_norm": 1.361549735069275, "learning_rate": 6.28748370273794e-05, "loss": 0.3476, "step": 242500 }, { "epoch": 19.80117340286832, "grad_norm": 1.4261330366134644, "learning_rate": 6.238591916558018e-05, "loss": 0.3372, "step": 243000 }, { "epoch": 19.84191655801825, "grad_norm": 0.9126484394073486, "learning_rate": 6.189700130378096e-05, "loss": 0.346, "step": 243500 }, { "epoch": 19.882659713168188, "grad_norm": 0.7031682729721069, "learning_rate": 6.140808344198175e-05, "loss": 0.3333, "step": 244000 }, { "epoch": 19.92340286831812, "grad_norm": 0.7560476064682007, "learning_rate": 6.0919165580182524e-05, "loss": 0.3442, "step": 244500 }, { "epoch": 19.964146023468057, "grad_norm": 1.2263003587722778, "learning_rate": 6.0430247718383304e-05, "loss": 0.3425, "step": 245000 }, { "epoch": 20.0, "eval_accuracy": 0.828514039516449, "eval_loss": 0.4760776460170746, "eval_runtime": 4.3895, "eval_samples_per_second": 567.264, "eval_steps_per_second": 17.77, "step": 245440 }, { "epoch": 20.004889178617994, "grad_norm": 1.5437818765640259, "learning_rate": 5.994132985658409e-05, "loss": 0.348, "step": 245500 }, { "epoch": 20.045632333767927, "grad_norm": 0.6132621765136719, "learning_rate": 5.9452411994784865e-05, "loss": 0.3533, "step": 246000 }, { "epoch": 20.086375488917863, "grad_norm": 1.4345595836639404, "learning_rate": 5.896349413298565e-05, "loss": 0.3493, "step": 246500 }, { "epoch": 20.127118644067796, "grad_norm": 1.0400336980819702, "learning_rate": 5.847457627118644e-05, "loss": 0.3385, "step": 247000 }, { "epoch": 20.167861799217732, "grad_norm": 0.8435044884681702, "learning_rate": 5.798565840938721e-05, "loss": 0.3399, "step": 247500 }, { "epoch": 20.208604954367665, "grad_norm": 1.181225061416626, "learning_rate": 5.7496740547588e-05, "loss": 0.3384, "step": 248000 }, { "epoch": 20.2493481095176, "grad_norm": 0.7156890034675598, "learning_rate": 5.700782268578879e-05, "loss": 0.3428, "step": 248500 }, { "epoch": 20.290091264667534, "grad_norm": 0.9579421877861023, "learning_rate": 5.651890482398956e-05, "loss": 0.3527, "step": 249000 }, { "epoch": 20.33083441981747, "grad_norm": 0.6996088624000549, "learning_rate": 5.602998696219035e-05, "loss": 0.3392, "step": 249500 }, { "epoch": 20.371577574967404, "grad_norm": 0.6116717457771301, "learning_rate": 5.554106910039113e-05, "loss": 0.3443, "step": 250000 }, { "epoch": 20.41232073011734, "grad_norm": 0.6176652908325195, "learning_rate": 5.5052151238591916e-05, "loss": 0.3496, "step": 250500 }, { "epoch": 20.453063885267277, "grad_norm": 0.5394893288612366, "learning_rate": 5.456323337679269e-05, "loss": 0.3435, "step": 251000 }, { "epoch": 20.49380704041721, "grad_norm": 0.6180593371391296, "learning_rate": 5.4074315514993477e-05, "loss": 0.3468, "step": 251500 }, { "epoch": 20.534550195567146, "grad_norm": 1.2454131841659546, "learning_rate": 5.3585397653194264e-05, "loss": 0.3512, "step": 252000 }, { "epoch": 20.57529335071708, "grad_norm": 0.6435096263885498, "learning_rate": 5.309647979139504e-05, "loss": 0.3428, "step": 252500 }, { "epoch": 20.616036505867015, "grad_norm": 1.0173094272613525, "learning_rate": 5.2607561929595825e-05, "loss": 0.3498, "step": 253000 }, { "epoch": 20.656779661016948, "grad_norm": 0.8653329014778137, "learning_rate": 5.2118644067796605e-05, "loss": 0.3438, "step": 253500 }, { "epoch": 20.697522816166884, "grad_norm": 1.1969215869903564, "learning_rate": 5.1629726205997385e-05, "loss": 0.3328, "step": 254000 }, { "epoch": 20.738265971316817, "grad_norm": 2.1010959148406982, "learning_rate": 5.114080834419817e-05, "loss": 0.3419, "step": 254500 }, { "epoch": 20.779009126466754, "grad_norm": 1.4033147096633911, "learning_rate": 5.065189048239895e-05, "loss": 0.3414, "step": 255000 }, { "epoch": 20.819752281616687, "grad_norm": 1.1995563507080078, "learning_rate": 5.016297262059973e-05, "loss": 0.3458, "step": 255500 }, { "epoch": 20.860495436766623, "grad_norm": 0.9747222661972046, "learning_rate": 4.9674054758800514e-05, "loss": 0.3439, "step": 256000 }, { "epoch": 20.90123859191656, "grad_norm": 1.1838008165359497, "learning_rate": 4.91851368970013e-05, "loss": 0.3484, "step": 256500 }, { "epoch": 20.941981747066492, "grad_norm": 1.512540340423584, "learning_rate": 4.869621903520209e-05, "loss": 0.354, "step": 257000 }, { "epoch": 20.98272490221643, "grad_norm": 1.179273247718811, "learning_rate": 4.820730117340286e-05, "loss": 0.3366, "step": 257500 }, { "epoch": 21.0, "eval_accuracy": 0.8329316973686218, "eval_loss": 0.4599085748195648, "eval_runtime": 4.3978, "eval_samples_per_second": 566.196, "eval_steps_per_second": 17.736, "step": 257712 }, { "epoch": 21.02346805736636, "grad_norm": 0.6732698678970337, "learning_rate": 4.771838331160365e-05, "loss": 0.3419, "step": 258000 }, { "epoch": 21.064211212516298, "grad_norm": 1.0182501077651978, "learning_rate": 4.722946544980443e-05, "loss": 0.3404, "step": 258500 }, { "epoch": 21.10495436766623, "grad_norm": 0.9161713719367981, "learning_rate": 4.674054758800521e-05, "loss": 0.3434, "step": 259000 }, { "epoch": 21.145697522816167, "grad_norm": 0.8656306862831116, "learning_rate": 4.6251629726206e-05, "loss": 0.3486, "step": 259500 }, { "epoch": 21.1864406779661, "grad_norm": 0.8681694269180298, "learning_rate": 4.576271186440678e-05, "loss": 0.3451, "step": 260000 }, { "epoch": 21.227183833116037, "grad_norm": 1.7462722063064575, "learning_rate": 4.527379400260756e-05, "loss": 0.3495, "step": 260500 }, { "epoch": 21.26792698826597, "grad_norm": 0.8765119314193726, "learning_rate": 4.478487614080834e-05, "loss": 0.3494, "step": 261000 }, { "epoch": 21.308670143415906, "grad_norm": 0.9434525966644287, "learning_rate": 4.4295958279009125e-05, "loss": 0.3448, "step": 261500 }, { "epoch": 21.349413298565842, "grad_norm": 0.6454566717147827, "learning_rate": 4.38070404172099e-05, "loss": 0.3451, "step": 262000 }, { "epoch": 21.390156453715775, "grad_norm": 0.8628904223442078, "learning_rate": 4.3318122555410686e-05, "loss": 0.3419, "step": 262500 }, { "epoch": 21.43089960886571, "grad_norm": 1.01779043674469, "learning_rate": 4.282920469361147e-05, "loss": 0.3417, "step": 263000 }, { "epoch": 21.471642764015645, "grad_norm": 1.6835086345672607, "learning_rate": 4.2340286831812247e-05, "loss": 0.3318, "step": 263500 }, { "epoch": 21.51238591916558, "grad_norm": 1.0586639642715454, "learning_rate": 4.1851368970013034e-05, "loss": 0.3497, "step": 264000 }, { "epoch": 21.553129074315514, "grad_norm": 0.6478777527809143, "learning_rate": 4.136245110821382e-05, "loss": 0.3384, "step": 264500 }, { "epoch": 21.59387222946545, "grad_norm": 0.8856120705604553, "learning_rate": 4.08735332464146e-05, "loss": 0.3438, "step": 265000 }, { "epoch": 21.634615384615383, "grad_norm": 1.7250220775604248, "learning_rate": 4.038461538461538e-05, "loss": 0.3565, "step": 265500 }, { "epoch": 21.67535853976532, "grad_norm": 1.4531996250152588, "learning_rate": 3.989569752281616e-05, "loss": 0.3425, "step": 266000 }, { "epoch": 21.716101694915253, "grad_norm": 0.5768513083457947, "learning_rate": 3.940677966101695e-05, "loss": 0.3444, "step": 266500 }, { "epoch": 21.75684485006519, "grad_norm": 0.8255833387374878, "learning_rate": 3.891786179921772e-05, "loss": 0.3402, "step": 267000 }, { "epoch": 21.797588005215125, "grad_norm": 0.44311073422431946, "learning_rate": 3.842894393741851e-05, "loss": 0.3483, "step": 267500 }, { "epoch": 21.83833116036506, "grad_norm": 1.0727436542510986, "learning_rate": 3.79400260756193e-05, "loss": 0.3559, "step": 268000 }, { "epoch": 21.879074315514995, "grad_norm": 0.9774116277694702, "learning_rate": 3.745110821382008e-05, "loss": 0.3452, "step": 268500 }, { "epoch": 21.919817470664928, "grad_norm": 1.0499922037124634, "learning_rate": 3.696219035202086e-05, "loss": 0.3422, "step": 269000 }, { "epoch": 21.960560625814864, "grad_norm": 0.654230535030365, "learning_rate": 3.647327249022164e-05, "loss": 0.3475, "step": 269500 }, { "epoch": 22.0, "eval_accuracy": 0.8313252925872803, "eval_loss": 0.46137547492980957, "eval_runtime": 4.3722, "eval_samples_per_second": 569.511, "eval_steps_per_second": 17.84, "step": 269984 }, { "epoch": 22.001303780964797, "grad_norm": 0.731398344039917, "learning_rate": 3.5984354628422425e-05, "loss": 0.3507, "step": 270000 }, { "epoch": 22.042046936114733, "grad_norm": 0.9840994477272034, "learning_rate": 3.5495436766623206e-05, "loss": 0.3406, "step": 270500 }, { "epoch": 22.082790091264666, "grad_norm": 1.1506659984588623, "learning_rate": 3.5006518904823986e-05, "loss": 0.3541, "step": 271000 }, { "epoch": 22.123533246414603, "grad_norm": 1.3932029008865356, "learning_rate": 3.4517601043024767e-05, "loss": 0.3436, "step": 271500 }, { "epoch": 22.164276401564535, "grad_norm": 0.7392660975456238, "learning_rate": 3.402868318122555e-05, "loss": 0.3354, "step": 272000 }, { "epoch": 22.205019556714472, "grad_norm": 1.1278698444366455, "learning_rate": 3.3539765319426334e-05, "loss": 0.3475, "step": 272500 }, { "epoch": 22.24576271186441, "grad_norm": 0.7224228978157043, "learning_rate": 3.3050847457627114e-05, "loss": 0.3493, "step": 273000 }, { "epoch": 22.28650586701434, "grad_norm": 1.3191757202148438, "learning_rate": 3.25619295958279e-05, "loss": 0.3365, "step": 273500 }, { "epoch": 22.327249022164278, "grad_norm": 0.9900050163269043, "learning_rate": 3.207301173402868e-05, "loss": 0.3418, "step": 274000 }, { "epoch": 22.36799217731421, "grad_norm": 0.7188341617584229, "learning_rate": 3.158409387222946e-05, "loss": 0.3452, "step": 274500 }, { "epoch": 22.408735332464147, "grad_norm": 0.8764260411262512, "learning_rate": 3.109517601043025e-05, "loss": 0.3381, "step": 275000 }, { "epoch": 22.44947848761408, "grad_norm": 1.7185380458831787, "learning_rate": 3.060625814863103e-05, "loss": 0.3525, "step": 275500 }, { "epoch": 22.490221642764016, "grad_norm": 0.5640430450439453, "learning_rate": 3.0117340286831807e-05, "loss": 0.3385, "step": 276000 }, { "epoch": 22.53096479791395, "grad_norm": 0.8003877401351929, "learning_rate": 2.9628422425032594e-05, "loss": 0.3516, "step": 276500 }, { "epoch": 22.571707953063886, "grad_norm": 1.3197258710861206, "learning_rate": 2.9139504563233375e-05, "loss": 0.3412, "step": 277000 }, { "epoch": 22.61245110821382, "grad_norm": 0.677592933177948, "learning_rate": 2.8650586701434158e-05, "loss": 0.3538, "step": 277500 }, { "epoch": 22.653194263363755, "grad_norm": 0.78952956199646, "learning_rate": 2.816166883963494e-05, "loss": 0.3463, "step": 278000 }, { "epoch": 22.69393741851369, "grad_norm": 1.048988938331604, "learning_rate": 2.767275097783572e-05, "loss": 0.343, "step": 278500 }, { "epoch": 22.734680573663624, "grad_norm": 1.3893002271652222, "learning_rate": 2.7183833116036506e-05, "loss": 0.3434, "step": 279000 }, { "epoch": 22.77542372881356, "grad_norm": 1.320388674736023, "learning_rate": 2.6694915254237287e-05, "loss": 0.343, "step": 279500 }, { "epoch": 22.816166883963493, "grad_norm": 0.8476357460021973, "learning_rate": 2.6205997392438067e-05, "loss": 0.3458, "step": 280000 }, { "epoch": 22.85691003911343, "grad_norm": 1.417752742767334, "learning_rate": 2.571707953063885e-05, "loss": 0.3362, "step": 280500 }, { "epoch": 22.897653194263363, "grad_norm": 1.1953123807907104, "learning_rate": 2.522816166883963e-05, "loss": 0.3462, "step": 281000 }, { "epoch": 22.9383963494133, "grad_norm": 0.9179041385650635, "learning_rate": 2.473924380704042e-05, "loss": 0.3391, "step": 281500 }, { "epoch": 22.979139504563232, "grad_norm": 1.0501340627670288, "learning_rate": 2.42503259452412e-05, "loss": 0.3416, "step": 282000 }, { "epoch": 23.0, "eval_accuracy": 0.8317269086837769, "eval_loss": 0.4590187966823578, "eval_runtime": 4.3851, "eval_samples_per_second": 567.829, "eval_steps_per_second": 17.787, "step": 282256 }, { "epoch": 23.01988265971317, "grad_norm": 1.0656371116638184, "learning_rate": 2.376140808344198e-05, "loss": 0.3524, "step": 282500 }, { "epoch": 23.0606258148631, "grad_norm": 0.45340660214424133, "learning_rate": 2.3272490221642763e-05, "loss": 0.3457, "step": 283000 }, { "epoch": 23.101368970013038, "grad_norm": 1.008518099784851, "learning_rate": 2.2783572359843543e-05, "loss": 0.3462, "step": 283500 }, { "epoch": 23.142112125162974, "grad_norm": 1.1451541185379028, "learning_rate": 2.2294654498044324e-05, "loss": 0.3351, "step": 284000 }, { "epoch": 23.182855280312907, "grad_norm": 0.6879557371139526, "learning_rate": 2.180573663624511e-05, "loss": 0.3339, "step": 284500 }, { "epoch": 23.223598435462844, "grad_norm": 0.7767355442047119, "learning_rate": 2.131681877444589e-05, "loss": 0.3454, "step": 285000 }, { "epoch": 23.264341590612776, "grad_norm": 1.1457059383392334, "learning_rate": 2.0827900912646675e-05, "loss": 0.3452, "step": 285500 }, { "epoch": 23.305084745762713, "grad_norm": 1.3198171854019165, "learning_rate": 2.0338983050847455e-05, "loss": 0.3534, "step": 286000 }, { "epoch": 23.345827900912646, "grad_norm": 1.4695395231246948, "learning_rate": 1.9850065189048236e-05, "loss": 0.3395, "step": 286500 }, { "epoch": 23.386571056062582, "grad_norm": 0.7720468044281006, "learning_rate": 1.9361147327249023e-05, "loss": 0.3474, "step": 287000 }, { "epoch": 23.427314211212515, "grad_norm": 1.1282029151916504, "learning_rate": 1.8872229465449803e-05, "loss": 0.3418, "step": 287500 }, { "epoch": 23.46805736636245, "grad_norm": 1.3943537473678589, "learning_rate": 1.8383311603650584e-05, "loss": 0.3433, "step": 288000 }, { "epoch": 23.508800521512384, "grad_norm": 0.7310503721237183, "learning_rate": 1.7894393741851367e-05, "loss": 0.3491, "step": 288500 }, { "epoch": 23.54954367666232, "grad_norm": 1.075863242149353, "learning_rate": 1.740547588005215e-05, "loss": 0.3443, "step": 289000 }, { "epoch": 23.590286831812257, "grad_norm": 1.2456250190734863, "learning_rate": 1.691655801825293e-05, "loss": 0.3525, "step": 289500 }, { "epoch": 23.63102998696219, "grad_norm": 0.8498753905296326, "learning_rate": 1.6427640156453715e-05, "loss": 0.3394, "step": 290000 }, { "epoch": 23.671773142112126, "grad_norm": 1.3148057460784912, "learning_rate": 1.5938722294654496e-05, "loss": 0.3461, "step": 290500 }, { "epoch": 23.71251629726206, "grad_norm": 0.6420727372169495, "learning_rate": 1.544980443285528e-05, "loss": 0.3442, "step": 291000 }, { "epoch": 23.753259452411996, "grad_norm": 1.5686240196228027, "learning_rate": 1.496088657105606e-05, "loss": 0.3419, "step": 291500 }, { "epoch": 23.79400260756193, "grad_norm": 1.0205481052398682, "learning_rate": 1.4471968709256844e-05, "loss": 0.3461, "step": 292000 }, { "epoch": 23.834745762711865, "grad_norm": 1.3792383670806885, "learning_rate": 1.3983050847457626e-05, "loss": 0.3369, "step": 292500 }, { "epoch": 23.875488917861798, "grad_norm": 1.1198925971984863, "learning_rate": 1.349413298565841e-05, "loss": 0.3384, "step": 293000 }, { "epoch": 23.916232073011734, "grad_norm": 1.0253269672393799, "learning_rate": 1.300521512385919e-05, "loss": 0.3363, "step": 293500 }, { "epoch": 23.956975228161667, "grad_norm": 1.0401793718338013, "learning_rate": 1.2516297262059972e-05, "loss": 0.3404, "step": 294000 }, { "epoch": 23.997718383311604, "grad_norm": 0.9865344762802124, "learning_rate": 1.2027379400260756e-05, "loss": 0.3455, "step": 294500 }, { "epoch": 24.0, "eval_accuracy": 0.8305220603942871, "eval_loss": 0.4630131721496582, "eval_runtime": 4.38, "eval_samples_per_second": 568.498, "eval_steps_per_second": 17.808, "step": 294528 }, { "epoch": 24.03846153846154, "grad_norm": 0.9915386438369751, "learning_rate": 1.1538461538461538e-05, "loss": 0.3472, "step": 295000 }, { "epoch": 24.079204693611473, "grad_norm": 1.1903886795043945, "learning_rate": 1.1049543676662318e-05, "loss": 0.3403, "step": 295500 }, { "epoch": 24.11994784876141, "grad_norm": 0.7330212593078613, "learning_rate": 1.0560625814863102e-05, "loss": 0.352, "step": 296000 }, { "epoch": 24.160691003911342, "grad_norm": 1.2172068357467651, "learning_rate": 1.0071707953063884e-05, "loss": 0.3363, "step": 296500 }, { "epoch": 24.20143415906128, "grad_norm": 0.897605836391449, "learning_rate": 9.582790091264668e-06, "loss": 0.3448, "step": 297000 }, { "epoch": 24.24217731421121, "grad_norm": 0.7951999306678772, "learning_rate": 9.093872229465448e-06, "loss": 0.3467, "step": 297500 }, { "epoch": 24.282920469361148, "grad_norm": 1.3187936544418335, "learning_rate": 8.60495436766623e-06, "loss": 0.3406, "step": 298000 }, { "epoch": 24.32366362451108, "grad_norm": 0.8119403123855591, "learning_rate": 8.116036505867014e-06, "loss": 0.3573, "step": 298500 }, { "epoch": 24.364406779661017, "grad_norm": 0.6133550405502319, "learning_rate": 7.627118644067796e-06, "loss": 0.3468, "step": 299000 }, { "epoch": 24.40514993481095, "grad_norm": 0.7481356859207153, "learning_rate": 7.138200782268578e-06, "loss": 0.3518, "step": 299500 }, { "epoch": 24.445893089960887, "grad_norm": 1.0011976957321167, "learning_rate": 6.649282920469361e-06, "loss": 0.3358, "step": 300000 }, { "epoch": 24.486636245110823, "grad_norm": 0.472748726606369, "learning_rate": 6.160365058670142e-06, "loss": 0.3427, "step": 300500 }, { "epoch": 24.527379400260756, "grad_norm": 1.0231252908706665, "learning_rate": 5.671447196870925e-06, "loss": 0.351, "step": 301000 }, { "epoch": 24.568122555410692, "grad_norm": 0.954493522644043, "learning_rate": 5.182529335071707e-06, "loss": 0.337, "step": 301500 }, { "epoch": 24.608865710560625, "grad_norm": 2.0492947101593018, "learning_rate": 4.69361147327249e-06, "loss": 0.3387, "step": 302000 }, { "epoch": 24.64960886571056, "grad_norm": 0.9681317806243896, "learning_rate": 4.2046936114732716e-06, "loss": 0.3453, "step": 302500 }, { "epoch": 24.690352020860495, "grad_norm": 0.5848226547241211, "learning_rate": 3.715775749674054e-06, "loss": 0.3431, "step": 303000 }, { "epoch": 24.73109517601043, "grad_norm": 1.2530168294906616, "learning_rate": 3.2268578878748366e-06, "loss": 0.3336, "step": 303500 }, { "epoch": 24.771838331160364, "grad_norm": 1.0169261693954468, "learning_rate": 2.7379400260756187e-06, "loss": 0.3413, "step": 304000 }, { "epoch": 24.8125814863103, "grad_norm": 1.203783631324768, "learning_rate": 2.2490221642764016e-06, "loss": 0.3395, "step": 304500 }, { "epoch": 24.853324641460233, "grad_norm": 0.48721998929977417, "learning_rate": 1.7601043024771837e-06, "loss": 0.3454, "step": 305000 }, { "epoch": 24.89406779661017, "grad_norm": 1.793961763381958, "learning_rate": 1.271186440677966e-06, "loss": 0.3377, "step": 305500 }, { "epoch": 24.934810951760106, "grad_norm": 0.7789753079414368, "learning_rate": 7.822685788787482e-07, "loss": 0.3493, "step": 306000 }, { "epoch": 24.97555410691004, "grad_norm": 1.4958150386810303, "learning_rate": 2.9335071707953065e-07, "loss": 0.3399, "step": 306500 }, { "epoch": 25.0, "eval_accuracy": 0.8305220603942871, "eval_loss": 0.4618412256240845, "eval_runtime": 4.4048, "eval_samples_per_second": 565.297, "eval_steps_per_second": 17.708, "step": 306800 }, { "epoch": 25.0, "step": 306800, "total_flos": 0.0, "train_loss": 0.3379101407325874, "train_runtime": 22888.3145, "train_samples_per_second": 428.933, "train_steps_per_second": 13.404 } ], "logging_steps": 500, "max_steps": 306800, "num_input_tokens_seen": 0, "num_train_epochs": 25, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }