{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 371900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "grad_norm": 1.067886471748352, "learning_rate": 3.125e-06, "loss": 7.5946, "step": 1000 }, { "epoch": 0.11, "grad_norm": 0.733816385269165, "learning_rate": 6.25e-06, "loss": 5.8196, "step": 2000 }, { "epoch": 0.16, "grad_norm": 0.8074101209640503, "learning_rate": 9.375000000000001e-06, "loss": 5.3841, "step": 3000 }, { "epoch": 0.22, "grad_norm": 0.9885308146476746, "learning_rate": 1.25e-05, "loss": 5.1603, "step": 4000 }, { "epoch": 0.27, "grad_norm": 0.9510473012924194, "learning_rate": 1.5625e-05, "loss": 4.9991, "step": 5000 }, { "epoch": 0.32, "grad_norm": 1.0073457956314087, "learning_rate": 1.8750000000000002e-05, "loss": 4.8623, "step": 6000 }, { "epoch": 0.38, "grad_norm": 1.0554710626602173, "learning_rate": 2.1875e-05, "loss": 4.7445, "step": 7000 }, { "epoch": 0.43, "grad_norm": 1.413175106048584, "learning_rate": 2.5e-05, "loss": 4.6506, "step": 8000 }, { "epoch": 0.48, "grad_norm": 1.050113320350647, "learning_rate": 2.8125000000000003e-05, "loss": 4.558, "step": 9000 }, { "epoch": 0.54, "grad_norm": 1.084957242012024, "learning_rate": 3.125e-05, "loss": 4.4792, "step": 10000 }, { "epoch": 0.59, "grad_norm": 1.0409754514694214, "learning_rate": 3.4375e-05, "loss": 4.4159, "step": 11000 }, { "epoch": 0.65, "grad_norm": 1.05953049659729, "learning_rate": 3.7500000000000003e-05, "loss": 4.3503, "step": 12000 }, { "epoch": 0.7, "grad_norm": 1.0629490613937378, "learning_rate": 4.061875e-05, "loss": 4.2865, "step": 13000 }, { "epoch": 0.75, "grad_norm": 1.0375587940216064, "learning_rate": 4.374375e-05, "loss": 4.2368, "step": 14000 }, { "epoch": 0.81, "grad_norm": 1.0158554315567017, "learning_rate": 4.6865625e-05, "loss": 4.1872, "step": 15000 }, { "epoch": 0.86, "grad_norm": 1.0419200658798218, "learning_rate": 4.9990625000000004e-05, "loss": 4.1392, "step": 16000 }, { "epoch": 0.91, "grad_norm": 0.9611375331878662, "learning_rate": 5.3115625000000005e-05, "loss": 4.0979, "step": 17000 }, { "epoch": 0.97, "grad_norm": 1.0175549983978271, "learning_rate": 5.6240625e-05, "loss": 4.0574, "step": 18000 }, { "epoch": 1.0, "eval_accuracy": 0.3095065014422905, "eval_loss": 4.2623162269592285, "eval_runtime": 153.0183, "eval_samples_per_second": 378.523, "eval_steps_per_second": 5.921, "step": 18595 }, { "epoch": 1.02, "grad_norm": 0.9952818155288696, "learning_rate": 5.93625e-05, "loss": 4.0057, "step": 19000 }, { "epoch": 1.08, "grad_norm": 1.0475207567214966, "learning_rate": 6.24875e-05, "loss": 3.9654, "step": 20000 }, { "epoch": 1.13, "grad_norm": 0.987922728061676, "learning_rate": 6.56125e-05, "loss": 3.9248, "step": 21000 }, { "epoch": 1.18, "grad_norm": 1.002957820892334, "learning_rate": 6.8734375e-05, "loss": 3.8951, "step": 22000 }, { "epoch": 1.24, "grad_norm": 0.9509768486022949, "learning_rate": 7.185937500000001e-05, "loss": 3.8541, "step": 23000 }, { "epoch": 1.29, "grad_norm": 1.0254398584365845, "learning_rate": 7.498125e-05, "loss": 3.8288, "step": 24000 }, { "epoch": 1.34, "grad_norm": 0.9639053344726562, "learning_rate": 7.8103125e-05, "loss": 3.7996, "step": 25000 }, { "epoch": 1.4, "grad_norm": 0.9889234900474548, "learning_rate": 8.1228125e-05, "loss": 3.7701, "step": 26000 }, { "epoch": 1.45, "grad_norm": 0.9243676662445068, "learning_rate": 8.435e-05, "loss": 3.7486, "step": 27000 }, { "epoch": 1.51, "grad_norm": 0.8727757930755615, "learning_rate": 8.747500000000001e-05, "loss": 3.7347, "step": 28000 }, { "epoch": 1.56, "grad_norm": 0.8775372505187988, "learning_rate": 9.06e-05, "loss": 3.7105, "step": 29000 }, { "epoch": 1.61, "grad_norm": 0.929603099822998, "learning_rate": 9.3721875e-05, "loss": 3.6875, "step": 30000 }, { "epoch": 1.67, "grad_norm": 0.8385038375854492, "learning_rate": 9.6846875e-05, "loss": 3.6675, "step": 31000 }, { "epoch": 1.72, "grad_norm": 0.8377123475074768, "learning_rate": 9.996875e-05, "loss": 3.6537, "step": 32000 }, { "epoch": 1.77, "grad_norm": 0.861518919467926, "learning_rate": 9.970873786407767e-05, "loss": 3.6403, "step": 33000 }, { "epoch": 1.83, "grad_norm": 0.9055641293525696, "learning_rate": 9.941482789055604e-05, "loss": 3.6144, "step": 34000 }, { "epoch": 1.88, "grad_norm": 0.8535173535346985, "learning_rate": 9.912091791703443e-05, "loss": 3.6055, "step": 35000 }, { "epoch": 1.94, "grad_norm": 0.8203129768371582, "learning_rate": 9.882671373933511e-05, "loss": 3.5867, "step": 36000 }, { "epoch": 1.99, "grad_norm": 0.797301709651947, "learning_rate": 9.853250956163578e-05, "loss": 3.5744, "step": 37000 }, { "epoch": 2.0, "eval_accuracy": 0.3629589531481126, "eval_loss": 3.7410354614257812, "eval_runtime": 154.8386, "eval_samples_per_second": 374.073, "eval_steps_per_second": 5.851, "step": 37190 }, { "epoch": 2.04, "grad_norm": 0.8069159388542175, "learning_rate": 9.823830538393646e-05, "loss": 3.5289, "step": 38000 }, { "epoch": 2.1, "grad_norm": 0.8361462354660034, "learning_rate": 9.794468961459253e-05, "loss": 3.5205, "step": 39000 }, { "epoch": 2.15, "grad_norm": 0.8214879631996155, "learning_rate": 9.765048543689321e-05, "loss": 3.5072, "step": 40000 }, { "epoch": 2.2, "grad_norm": 0.7719361186027527, "learning_rate": 9.735628125919388e-05, "loss": 3.4949, "step": 41000 }, { "epoch": 2.26, "grad_norm": 0.7845537662506104, "learning_rate": 9.706237128567226e-05, "loss": 3.4928, "step": 42000 }, { "epoch": 2.31, "grad_norm": 0.7866695523262024, "learning_rate": 9.676816710797295e-05, "loss": 3.4841, "step": 43000 }, { "epoch": 2.37, "grad_norm": 0.7807812690734863, "learning_rate": 9.647396293027361e-05, "loss": 3.4723, "step": 44000 }, { "epoch": 2.42, "grad_norm": 0.7723334431648254, "learning_rate": 9.618005295675198e-05, "loss": 3.4662, "step": 45000 }, { "epoch": 2.47, "grad_norm": 0.7724099159240723, "learning_rate": 9.588584877905267e-05, "loss": 3.4576, "step": 46000 }, { "epoch": 2.53, "grad_norm": 0.810417890548706, "learning_rate": 9.559164460135335e-05, "loss": 3.4462, "step": 47000 }, { "epoch": 2.58, "grad_norm": 0.7635090351104736, "learning_rate": 9.529744042365401e-05, "loss": 3.4373, "step": 48000 }, { "epoch": 2.64, "grad_norm": 0.763451099395752, "learning_rate": 9.500382465431009e-05, "loss": 3.4328, "step": 49000 }, { "epoch": 2.69, "grad_norm": 0.8080945611000061, "learning_rate": 9.470962047661077e-05, "loss": 3.4229, "step": 50000 }, { "epoch": 2.74, "grad_norm": 0.7701908946037292, "learning_rate": 9.441571050308915e-05, "loss": 3.4231, "step": 51000 }, { "epoch": 2.8, "grad_norm": 0.7532150745391846, "learning_rate": 9.412150632538983e-05, "loss": 3.4128, "step": 52000 }, { "epoch": 2.85, "grad_norm": 0.7926775217056274, "learning_rate": 9.38273021476905e-05, "loss": 3.4068, "step": 53000 }, { "epoch": 2.9, "grad_norm": 0.7276414036750793, "learning_rate": 9.353339217416887e-05, "loss": 3.3964, "step": 54000 }, { "epoch": 2.96, "grad_norm": 0.7727640271186829, "learning_rate": 9.323918799646955e-05, "loss": 3.3998, "step": 55000 }, { "epoch": 3.0, "eval_accuracy": 0.37906783941658895, "eval_loss": 3.5873873233795166, "eval_runtime": 155.4366, "eval_samples_per_second": 372.634, "eval_steps_per_second": 5.829, "step": 55785 }, { "epoch": 3.01, "grad_norm": 0.7553613781929016, "learning_rate": 9.294498381877022e-05, "loss": 3.3754, "step": 56000 }, { "epoch": 3.07, "grad_norm": 0.7854211926460266, "learning_rate": 9.26507796410709e-05, "loss": 3.3398, "step": 57000 }, { "epoch": 3.12, "grad_norm": 0.7394266128540039, "learning_rate": 9.235686966754929e-05, "loss": 3.3396, "step": 58000 }, { "epoch": 3.17, "grad_norm": 0.7476953864097595, "learning_rate": 9.206295969402766e-05, "loss": 3.3336, "step": 59000 }, { "epoch": 3.23, "grad_norm": 0.7504665851593018, "learning_rate": 9.176875551632832e-05, "loss": 3.3347, "step": 60000 }, { "epoch": 3.28, "grad_norm": 0.8327361941337585, "learning_rate": 9.1474551338629e-05, "loss": 3.333, "step": 61000 }, { "epoch": 3.33, "grad_norm": 0.7361587882041931, "learning_rate": 9.118064136510739e-05, "loss": 3.3276, "step": 62000 }, { "epoch": 3.39, "grad_norm": 0.7490424513816833, "learning_rate": 9.088643718740807e-05, "loss": 3.326, "step": 63000 }, { "epoch": 3.44, "grad_norm": 0.7618813514709473, "learning_rate": 9.059223300970874e-05, "loss": 3.3236, "step": 64000 }, { "epoch": 3.5, "grad_norm": 0.7655198574066162, "learning_rate": 9.029832303618711e-05, "loss": 3.3182, "step": 65000 }, { "epoch": 3.55, "grad_norm": 0.7480560541152954, "learning_rate": 9.000411885848779e-05, "loss": 3.3156, "step": 66000 }, { "epoch": 3.6, "grad_norm": 0.7304962277412415, "learning_rate": 8.971020888496617e-05, "loss": 3.3119, "step": 67000 }, { "epoch": 3.66, "grad_norm": 0.7590723633766174, "learning_rate": 8.941600470726684e-05, "loss": 3.3114, "step": 68000 }, { "epoch": 3.71, "grad_norm": 0.7528940439224243, "learning_rate": 8.912209473374523e-05, "loss": 3.3107, "step": 69000 }, { "epoch": 3.76, "grad_norm": 0.7578818202018738, "learning_rate": 8.88278905560459e-05, "loss": 3.2993, "step": 70000 }, { "epoch": 3.82, "grad_norm": 0.7450219988822937, "learning_rate": 8.853398058252428e-05, "loss": 3.3001, "step": 71000 }, { "epoch": 3.87, "grad_norm": 0.7727616429328918, "learning_rate": 8.823977640482495e-05, "loss": 3.2959, "step": 72000 }, { "epoch": 3.93, "grad_norm": 0.746644914150238, "learning_rate": 8.794557222712563e-05, "loss": 3.2941, "step": 73000 }, { "epoch": 3.98, "grad_norm": 0.7255116105079651, "learning_rate": 8.765166225360401e-05, "loss": 3.2911, "step": 74000 }, { "epoch": 4.0, "eval_accuracy": 0.38734941333281203, "eval_loss": 3.5154592990875244, "eval_runtime": 154.0535, "eval_samples_per_second": 375.98, "eval_steps_per_second": 5.881, "step": 74380 }, { "epoch": 4.03, "grad_norm": 0.7433741092681885, "learning_rate": 8.735745807590468e-05, "loss": 3.254, "step": 75000 }, { "epoch": 4.09, "grad_norm": 0.7864168882369995, "learning_rate": 8.706325389820536e-05, "loss": 3.244, "step": 76000 }, { "epoch": 4.14, "grad_norm": 0.7510408759117126, "learning_rate": 8.676904972050603e-05, "loss": 3.234, "step": 77000 }, { "epoch": 4.19, "grad_norm": 0.7614234685897827, "learning_rate": 8.647513974698441e-05, "loss": 3.2402, "step": 78000 }, { "epoch": 4.25, "grad_norm": 0.6980342268943787, "learning_rate": 8.61809355692851e-05, "loss": 3.2404, "step": 79000 }, { "epoch": 4.3, "grad_norm": 0.7419264912605286, "learning_rate": 8.588673139158576e-05, "loss": 3.2367, "step": 80000 }, { "epoch": 4.36, "grad_norm": 0.7443615198135376, "learning_rate": 8.559282141806415e-05, "loss": 3.2396, "step": 81000 }, { "epoch": 4.41, "grad_norm": 0.7406558394432068, "learning_rate": 8.529861724036483e-05, "loss": 3.2379, "step": 82000 }, { "epoch": 4.46, "grad_norm": 0.7945263385772705, "learning_rate": 8.50044130626655e-05, "loss": 3.2354, "step": 83000 }, { "epoch": 4.52, "grad_norm": 0.7609983086585999, "learning_rate": 8.471020888496618e-05, "loss": 3.2354, "step": 84000 }, { "epoch": 4.57, "grad_norm": 0.7468125224113464, "learning_rate": 8.441688731979994e-05, "loss": 3.2372, "step": 85000 }, { "epoch": 4.62, "grad_norm": 0.7358154058456421, "learning_rate": 8.412268314210062e-05, "loss": 3.2329, "step": 86000 }, { "epoch": 4.68, "grad_norm": 0.7325968742370605, "learning_rate": 8.38284789644013e-05, "loss": 3.2331, "step": 87000 }, { "epoch": 4.73, "grad_norm": 0.7466831803321838, "learning_rate": 8.353427478670197e-05, "loss": 3.231, "step": 88000 }, { "epoch": 4.79, "grad_norm": 0.7216522693634033, "learning_rate": 8.324007060900265e-05, "loss": 3.2278, "step": 89000 }, { "epoch": 4.84, "grad_norm": 0.7092546820640564, "learning_rate": 8.294616063548103e-05, "loss": 3.2238, "step": 90000 }, { "epoch": 4.89, "grad_norm": 0.7463696599006653, "learning_rate": 8.26522506619594e-05, "loss": 3.2299, "step": 91000 }, { "epoch": 4.95, "grad_norm": 0.7190863490104675, "learning_rate": 8.235804648426007e-05, "loss": 3.2246, "step": 92000 }, { "epoch": 5.0, "eval_accuracy": 0.39185985043226396, "eval_loss": 3.4781594276428223, "eval_runtime": 155.9159, "eval_samples_per_second": 371.489, "eval_steps_per_second": 5.811, "step": 92975 }, { "epoch": 5.0, "grad_norm": 0.7185128331184387, "learning_rate": 8.206384230656075e-05, "loss": 3.2244, "step": 93000 }, { "epoch": 5.06, "grad_norm": 0.7451280951499939, "learning_rate": 8.176993233303914e-05, "loss": 3.1681, "step": 94000 }, { "epoch": 5.11, "grad_norm": 0.7803074717521667, "learning_rate": 8.147572815533982e-05, "loss": 3.1673, "step": 95000 }, { "epoch": 5.16, "grad_norm": 0.767760694026947, "learning_rate": 8.118152397764049e-05, "loss": 3.1734, "step": 96000 }, { "epoch": 5.22, "grad_norm": 0.7362794280052185, "learning_rate": 8.088731979994117e-05, "loss": 3.1718, "step": 97000 }, { "epoch": 5.27, "grad_norm": 0.7212786078453064, "learning_rate": 8.059311562224185e-05, "loss": 3.1739, "step": 98000 }, { "epoch": 5.32, "grad_norm": 0.7708650231361389, "learning_rate": 8.02992056487202e-05, "loss": 3.1771, "step": 99000 }, { "epoch": 5.38, "grad_norm": 0.7468061447143555, "learning_rate": 8.000500147102089e-05, "loss": 3.1772, "step": 100000 }, { "epoch": 5.43, "grad_norm": 0.7191327810287476, "learning_rate": 7.971079729332157e-05, "loss": 3.1756, "step": 101000 }, { "epoch": 5.49, "grad_norm": 0.7532328367233276, "learning_rate": 7.941659311562225e-05, "loss": 3.1763, "step": 102000 }, { "epoch": 5.54, "grad_norm": 0.8367959260940552, "learning_rate": 7.912238893792292e-05, "loss": 3.1821, "step": 103000 }, { "epoch": 5.59, "grad_norm": 0.7252469062805176, "learning_rate": 7.882877316857899e-05, "loss": 3.1748, "step": 104000 }, { "epoch": 5.65, "grad_norm": 0.7490761876106262, "learning_rate": 7.853456899087967e-05, "loss": 3.1699, "step": 105000 }, { "epoch": 5.7, "grad_norm": 0.7434331774711609, "learning_rate": 7.824065901735806e-05, "loss": 3.1779, "step": 106000 }, { "epoch": 5.75, "grad_norm": 0.7366090416908264, "learning_rate": 7.794645483965872e-05, "loss": 3.1804, "step": 107000 }, { "epoch": 5.81, "grad_norm": 0.7210941314697266, "learning_rate": 7.76522506619594e-05, "loss": 3.1764, "step": 108000 }, { "epoch": 5.86, "grad_norm": 0.7178290486335754, "learning_rate": 7.735834068843777e-05, "loss": 3.1733, "step": 109000 }, { "epoch": 5.92, "grad_norm": 0.7524459362030029, "learning_rate": 7.706413651073846e-05, "loss": 3.1741, "step": 110000 }, { "epoch": 5.97, "grad_norm": 0.8132150769233704, "learning_rate": 7.676993233303912e-05, "loss": 3.1723, "step": 111000 }, { "epoch": 6.0, "eval_accuracy": 0.3962266597257757, "eval_loss": 3.444021701812744, "eval_runtime": 154.4966, "eval_samples_per_second": 374.901, "eval_steps_per_second": 5.864, "step": 111570 }, { "epoch": 6.02, "grad_norm": 0.7259339690208435, "learning_rate": 7.64757281553398e-05, "loss": 3.1478, "step": 112000 }, { "epoch": 6.08, "grad_norm": 0.7428982853889465, "learning_rate": 7.618181818181819e-05, "loss": 3.1146, "step": 113000 }, { "epoch": 6.13, "grad_norm": 0.738284707069397, "learning_rate": 7.588790820829656e-05, "loss": 3.1139, "step": 114000 }, { "epoch": 6.18, "grad_norm": 0.7554523348808289, "learning_rate": 7.559370403059724e-05, "loss": 3.1289, "step": 115000 }, { "epoch": 6.24, "grad_norm": 0.7846411466598511, "learning_rate": 7.529949985289791e-05, "loss": 3.1262, "step": 116000 }, { "epoch": 6.29, "grad_norm": 0.7883805632591248, "learning_rate": 7.500558987937629e-05, "loss": 3.1223, "step": 117000 }, { "epoch": 6.35, "grad_norm": 0.7601564526557922, "learning_rate": 7.471138570167697e-05, "loss": 3.1278, "step": 118000 }, { "epoch": 6.4, "grad_norm": 0.7268677353858948, "learning_rate": 7.441718152397764e-05, "loss": 3.1267, "step": 119000 }, { "epoch": 6.45, "grad_norm": 0.7423516511917114, "learning_rate": 7.412297734627832e-05, "loss": 3.1345, "step": 120000 }, { "epoch": 6.51, "grad_norm": 0.7322275042533875, "learning_rate": 7.38290673727567e-05, "loss": 3.1299, "step": 121000 }, { "epoch": 6.56, "grad_norm": 0.7511858344078064, "learning_rate": 7.353515739923508e-05, "loss": 3.1309, "step": 122000 }, { "epoch": 6.61, "grad_norm": 0.7296398282051086, "learning_rate": 7.324124742571345e-05, "loss": 3.1366, "step": 123000 }, { "epoch": 6.67, "grad_norm": 0.7411390542984009, "learning_rate": 7.294704324801413e-05, "loss": 3.1333, "step": 124000 }, { "epoch": 6.72, "grad_norm": 0.7201886773109436, "learning_rate": 7.265283907031481e-05, "loss": 3.1341, "step": 125000 }, { "epoch": 6.78, "grad_norm": 0.727634847164154, "learning_rate": 7.235863489261548e-05, "loss": 3.1322, "step": 126000 }, { "epoch": 6.83, "grad_norm": 0.7570481300354004, "learning_rate": 7.206443071491616e-05, "loss": 3.1331, "step": 127000 }, { "epoch": 6.88, "grad_norm": 0.7223334908485413, "learning_rate": 7.177052074139453e-05, "loss": 3.1318, "step": 128000 }, { "epoch": 6.94, "grad_norm": 0.7305715680122375, "learning_rate": 7.147631656369521e-05, "loss": 3.1312, "step": 129000 }, { "epoch": 6.99, "grad_norm": 0.7004303932189941, "learning_rate": 7.118211238599588e-05, "loss": 3.1287, "step": 130000 }, { "epoch": 7.0, "eval_accuracy": 0.3987064226821223, "eval_loss": 3.427098512649536, "eval_runtime": 154.492, "eval_samples_per_second": 374.913, "eval_steps_per_second": 5.864, "step": 130165 }, { "epoch": 7.04, "grad_norm": 0.7484861612319946, "learning_rate": 7.088790820829656e-05, "loss": 3.0853, "step": 131000 }, { "epoch": 7.1, "grad_norm": 0.7987297773361206, "learning_rate": 7.059399823477494e-05, "loss": 3.0767, "step": 132000 }, { "epoch": 7.15, "grad_norm": 0.8101202845573425, "learning_rate": 7.029979405707561e-05, "loss": 3.0826, "step": 133000 }, { "epoch": 7.21, "grad_norm": 0.7572445869445801, "learning_rate": 7.000558987937629e-05, "loss": 3.0805, "step": 134000 }, { "epoch": 7.26, "grad_norm": 0.7851737141609192, "learning_rate": 6.971167990585466e-05, "loss": 3.0868, "step": 135000 }, { "epoch": 7.31, "grad_norm": 0.7266308069229126, "learning_rate": 6.941747572815534e-05, "loss": 3.0907, "step": 136000 }, { "epoch": 7.37, "grad_norm": 0.7868114709854126, "learning_rate": 6.912327155045601e-05, "loss": 3.09, "step": 137000 }, { "epoch": 7.42, "grad_norm": 0.7553048729896545, "learning_rate": 6.882965578111209e-05, "loss": 3.0894, "step": 138000 }, { "epoch": 7.48, "grad_norm": 0.7621979117393494, "learning_rate": 6.853545160341277e-05, "loss": 3.0981, "step": 139000 }, { "epoch": 7.53, "grad_norm": 0.7651191353797913, "learning_rate": 6.824124742571345e-05, "loss": 3.0907, "step": 140000 }, { "epoch": 7.58, "grad_norm": 0.7562927007675171, "learning_rate": 6.794704324801413e-05, "loss": 3.0916, "step": 141000 }, { "epoch": 7.64, "grad_norm": 0.7490217685699463, "learning_rate": 6.76528390703148e-05, "loss": 3.0941, "step": 142000 }, { "epoch": 7.69, "grad_norm": 0.7593701481819153, "learning_rate": 6.735892909679318e-05, "loss": 3.0938, "step": 143000 }, { "epoch": 7.74, "grad_norm": 0.7405507564544678, "learning_rate": 6.706472491909386e-05, "loss": 3.0936, "step": 144000 }, { "epoch": 7.8, "grad_norm": 0.7147114276885986, "learning_rate": 6.677081494557223e-05, "loss": 3.0941, "step": 145000 }, { "epoch": 7.85, "grad_norm": 0.7247936129570007, "learning_rate": 6.64766107678729e-05, "loss": 3.0956, "step": 146000 }, { "epoch": 7.91, "grad_norm": 0.7443585395812988, "learning_rate": 6.618240659017358e-05, "loss": 3.094, "step": 147000 }, { "epoch": 7.96, "grad_norm": 0.7586407661437988, "learning_rate": 6.588849661665197e-05, "loss": 3.0994, "step": 148000 }, { "epoch": 8.0, "eval_accuracy": 0.4007113002501663, "eval_loss": 3.398989677429199, "eval_runtime": 154.5692, "eval_samples_per_second": 374.725, "eval_steps_per_second": 5.861, "step": 148760 }, { "epoch": 8.01, "grad_norm": 0.7934736013412476, "learning_rate": 6.559429243895263e-05, "loss": 3.078, "step": 149000 }, { "epoch": 8.07, "grad_norm": 0.7650701403617859, "learning_rate": 6.530008826125332e-05, "loss": 3.0392, "step": 150000 }, { "epoch": 8.12, "grad_norm": 0.7813672423362732, "learning_rate": 6.5005884083554e-05, "loss": 3.0466, "step": 151000 }, { "epoch": 8.17, "grad_norm": 0.7664616107940674, "learning_rate": 6.471167990585466e-05, "loss": 3.0515, "step": 152000 }, { "epoch": 8.23, "grad_norm": 0.7861508727073669, "learning_rate": 6.441776993233303e-05, "loss": 3.0481, "step": 153000 }, { "epoch": 8.28, "grad_norm": 0.7521619200706482, "learning_rate": 6.412385995881142e-05, "loss": 3.0537, "step": 154000 }, { "epoch": 8.34, "grad_norm": 0.7624778151512146, "learning_rate": 6.38296557811121e-05, "loss": 3.0574, "step": 155000 }, { "epoch": 8.39, "grad_norm": 0.7616772651672363, "learning_rate": 6.353545160341278e-05, "loss": 3.055, "step": 156000 }, { "epoch": 8.44, "grad_norm": 0.7683927416801453, "learning_rate": 6.324124742571345e-05, "loss": 3.0581, "step": 157000 }, { "epoch": 8.5, "grad_norm": 0.7723093628883362, "learning_rate": 6.294733745219182e-05, "loss": 3.0604, "step": 158000 }, { "epoch": 8.55, "grad_norm": 0.7625917792320251, "learning_rate": 6.26534274786702e-05, "loss": 3.0581, "step": 159000 }, { "epoch": 8.6, "grad_norm": 0.7637842297554016, "learning_rate": 6.235922330097088e-05, "loss": 3.0607, "step": 160000 }, { "epoch": 8.66, "grad_norm": 0.7364535927772522, "learning_rate": 6.206501912327155e-05, "loss": 3.0628, "step": 161000 }, { "epoch": 8.71, "grad_norm": 0.7330908179283142, "learning_rate": 6.177081494557223e-05, "loss": 3.0622, "step": 162000 }, { "epoch": 8.77, "grad_norm": 0.7949569821357727, "learning_rate": 6.14769049720506e-05, "loss": 3.0639, "step": 163000 }, { "epoch": 8.82, "grad_norm": 0.7422482967376709, "learning_rate": 6.118270079435127e-05, "loss": 3.0639, "step": 164000 }, { "epoch": 8.87, "grad_norm": 0.7859163284301758, "learning_rate": 6.088849661665196e-05, "loss": 3.0644, "step": 165000 }, { "epoch": 8.93, "grad_norm": 0.7574995756149292, "learning_rate": 6.059458664313034e-05, "loss": 3.0676, "step": 166000 }, { "epoch": 8.98, "grad_norm": 0.7532913684844971, "learning_rate": 6.030038246543102e-05, "loss": 3.0668, "step": 167000 }, { "epoch": 9.0, "eval_accuracy": 0.40181262590525657, "eval_loss": 3.4112191200256348, "eval_runtime": 154.3029, "eval_samples_per_second": 375.372, "eval_steps_per_second": 5.872, "step": 167355 }, { "epoch": 9.03, "grad_norm": 0.7874590158462524, "learning_rate": 6.0006178287731686e-05, "loss": 3.0282, "step": 168000 }, { "epoch": 9.09, "grad_norm": 0.7673851847648621, "learning_rate": 5.9712268314210063e-05, "loss": 3.0102, "step": 169000 }, { "epoch": 9.14, "grad_norm": 0.7693243026733398, "learning_rate": 5.9418064136510745e-05, "loss": 3.0168, "step": 170000 }, { "epoch": 9.2, "grad_norm": 0.7992390990257263, "learning_rate": 5.9123859958811426e-05, "loss": 3.0208, "step": 171000 }, { "epoch": 9.25, "grad_norm": 0.8022863864898682, "learning_rate": 5.8829655781112094e-05, "loss": 3.0259, "step": 172000 }, { "epoch": 9.3, "grad_norm": 0.7715474367141724, "learning_rate": 5.853574580759047e-05, "loss": 3.0219, "step": 173000 }, { "epoch": 9.36, "grad_norm": 0.7709059715270996, "learning_rate": 5.824183583406885e-05, "loss": 3.0224, "step": 174000 }, { "epoch": 9.41, "grad_norm": 0.7430821657180786, "learning_rate": 5.794763165636953e-05, "loss": 3.031, "step": 175000 }, { "epoch": 9.46, "grad_norm": 0.7953349351882935, "learning_rate": 5.76534274786702e-05, "loss": 3.0324, "step": 176000 }, { "epoch": 9.52, "grad_norm": 0.7622591853141785, "learning_rate": 5.735922330097088e-05, "loss": 3.0324, "step": 177000 }, { "epoch": 9.57, "grad_norm": 0.7647625207901001, "learning_rate": 5.7065313327449256e-05, "loss": 3.0297, "step": 178000 }, { "epoch": 9.63, "grad_norm": 0.7950288653373718, "learning_rate": 5.677110914974992e-05, "loss": 3.0326, "step": 179000 }, { "epoch": 9.68, "grad_norm": 0.7566654682159424, "learning_rate": 5.6476904972050605e-05, "loss": 3.0352, "step": 180000 }, { "epoch": 9.73, "grad_norm": 0.7347955107688904, "learning_rate": 5.618299499852898e-05, "loss": 3.0393, "step": 181000 }, { "epoch": 9.79, "grad_norm": 0.7836588621139526, "learning_rate": 5.588879082082966e-05, "loss": 3.0357, "step": 182000 }, { "epoch": 9.84, "grad_norm": 0.7765457630157471, "learning_rate": 5.559458664313033e-05, "loss": 3.0367, "step": 183000 }, { "epoch": 9.9, "grad_norm": 0.787699818611145, "learning_rate": 5.530038246543101e-05, "loss": 3.0396, "step": 184000 }, { "epoch": 9.95, "grad_norm": 0.7399518489837646, "learning_rate": 5.5006766696087085e-05, "loss": 3.0398, "step": 185000 }, { "epoch": 10.0, "eval_accuracy": 0.4033287792973349, "eval_loss": 3.3915066719055176, "eval_runtime": 155.1071, "eval_samples_per_second": 373.426, "eval_steps_per_second": 5.841, "step": 185950 }, { "epoch": 10.0, "grad_norm": 0.7820441126823425, "learning_rate": 5.471256251838777e-05, "loss": 3.0358, "step": 186000 }, { "epoch": 10.06, "grad_norm": 0.7809981107711792, "learning_rate": 5.4418358340688434e-05, "loss": 2.9828, "step": 187000 }, { "epoch": 10.11, "grad_norm": 0.7865327000617981, "learning_rate": 5.4124154162989116e-05, "loss": 2.9846, "step": 188000 }, { "epoch": 10.16, "grad_norm": 0.7754176259040833, "learning_rate": 5.38299499852898e-05, "loss": 2.9876, "step": 189000 }, { "epoch": 10.22, "grad_norm": 0.7762061953544617, "learning_rate": 5.3536040011768174e-05, "loss": 2.9944, "step": 190000 }, { "epoch": 10.27, "grad_norm": 0.805346667766571, "learning_rate": 5.324183583406884e-05, "loss": 2.9967, "step": 191000 }, { "epoch": 10.33, "grad_norm": 0.7976134419441223, "learning_rate": 5.2948220064724915e-05, "loss": 3.0013, "step": 192000 }, { "epoch": 10.38, "grad_norm": 0.7839481830596924, "learning_rate": 5.2654015887025596e-05, "loss": 3.002, "step": 193000 }, { "epoch": 10.43, "grad_norm": 0.7919566631317139, "learning_rate": 5.235981170932628e-05, "loss": 3.0033, "step": 194000 }, { "epoch": 10.49, "grad_norm": 0.7859071493148804, "learning_rate": 5.2065901735804655e-05, "loss": 3.0082, "step": 195000 }, { "epoch": 10.54, "grad_norm": 0.7803195118904114, "learning_rate": 5.177169755810532e-05, "loss": 3.0073, "step": 196000 }, { "epoch": 10.59, "grad_norm": 0.7861395478248596, "learning_rate": 5.14777875845837e-05, "loss": 3.0101, "step": 197000 }, { "epoch": 10.65, "grad_norm": 0.7760633230209351, "learning_rate": 5.118358340688438e-05, "loss": 3.0098, "step": 198000 }, { "epoch": 10.7, "grad_norm": 0.7999356985092163, "learning_rate": 5.0889379229185056e-05, "loss": 3.0116, "step": 199000 }, { "epoch": 10.76, "grad_norm": 0.8239379525184631, "learning_rate": 5.059546925566343e-05, "loss": 3.0126, "step": 200000 }, { "epoch": 10.81, "grad_norm": 0.7515137791633606, "learning_rate": 5.030126507796411e-05, "loss": 3.013, "step": 201000 }, { "epoch": 10.86, "grad_norm": 0.7992883920669556, "learning_rate": 5.000735510444249e-05, "loss": 3.0113, "step": 202000 }, { "epoch": 10.92, "grad_norm": 0.8197943568229675, "learning_rate": 4.9713150926743166e-05, "loss": 3.0124, "step": 203000 }, { "epoch": 10.97, "grad_norm": 0.8052383065223694, "learning_rate": 4.941924095322154e-05, "loss": 3.0097, "step": 204000 }, { "epoch": 11.0, "eval_accuracy": 0.40369517127750215, "eval_loss": 3.4066834449768066, "eval_runtime": 154.5201, "eval_samples_per_second": 374.844, "eval_steps_per_second": 5.863, "step": 204545 }, { "epoch": 11.02, "grad_norm": 0.7899711728096008, "learning_rate": 4.912503677552222e-05, "loss": 2.9904, "step": 205000 }, { "epoch": 11.08, "grad_norm": 0.825548529624939, "learning_rate": 4.883083259782289e-05, "loss": 2.9576, "step": 206000 }, { "epoch": 11.13, "grad_norm": 0.7997110486030579, "learning_rate": 4.853662842012357e-05, "loss": 2.9651, "step": 207000 }, { "epoch": 11.19, "grad_norm": 0.8206402063369751, "learning_rate": 4.824242424242425e-05, "loss": 2.9689, "step": 208000 }, { "epoch": 11.24, "grad_norm": 0.8368428945541382, "learning_rate": 4.794822006472492e-05, "loss": 2.9713, "step": 209000 }, { "epoch": 11.29, "grad_norm": 0.807217001914978, "learning_rate": 4.76543100912033e-05, "loss": 2.977, "step": 210000 }, { "epoch": 11.35, "grad_norm": 0.8276655673980713, "learning_rate": 4.7360105913503974e-05, "loss": 2.9769, "step": 211000 }, { "epoch": 11.4, "grad_norm": 0.8292602896690369, "learning_rate": 4.706619593998235e-05, "loss": 2.9825, "step": 212000 }, { "epoch": 11.45, "grad_norm": 0.8385449647903442, "learning_rate": 4.6771991762283026e-05, "loss": 2.978, "step": 213000 }, { "epoch": 11.51, "grad_norm": 0.8204603791236877, "learning_rate": 4.64780817887614e-05, "loss": 2.9859, "step": 214000 }, { "epoch": 11.56, "grad_norm": 0.8087953329086304, "learning_rate": 4.6183877611062084e-05, "loss": 2.9844, "step": 215000 }, { "epoch": 11.62, "grad_norm": 0.8039466142654419, "learning_rate": 4.588967343336276e-05, "loss": 2.9838, "step": 216000 }, { "epoch": 11.67, "grad_norm": 0.822203516960144, "learning_rate": 4.559546925566343e-05, "loss": 2.9861, "step": 217000 }, { "epoch": 11.72, "grad_norm": 0.8148356080055237, "learning_rate": 4.530155928214181e-05, "loss": 2.9891, "step": 218000 }, { "epoch": 11.78, "grad_norm": 0.834892988204956, "learning_rate": 4.5007355104442485e-05, "loss": 2.9857, "step": 219000 }, { "epoch": 11.83, "grad_norm": 0.8096596598625183, "learning_rate": 4.4713150926743166e-05, "loss": 2.9922, "step": 220000 }, { "epoch": 11.88, "grad_norm": 0.8051721453666687, "learning_rate": 4.441894674904384e-05, "loss": 2.9882, "step": 221000 }, { "epoch": 11.94, "grad_norm": 0.8202148675918579, "learning_rate": 4.412503677552222e-05, "loss": 2.9942, "step": 222000 }, { "epoch": 11.99, "grad_norm": 0.8127633929252625, "learning_rate": 4.383083259782289e-05, "loss": 2.9924, "step": 223000 }, { "epoch": 12.0, "eval_accuracy": 0.40389375116260934, "eval_loss": 3.411677598953247, "eval_runtime": 154.9294, "eval_samples_per_second": 373.854, "eval_steps_per_second": 5.848, "step": 223140 }, { "epoch": 12.05, "grad_norm": 0.8064407110214233, "learning_rate": 4.353692262430127e-05, "loss": 2.9498, "step": 224000 }, { "epoch": 12.1, "grad_norm": 0.8146172165870667, "learning_rate": 4.324301265077964e-05, "loss": 2.9449, "step": 225000 }, { "epoch": 12.15, "grad_norm": 0.8390582203865051, "learning_rate": 4.294880847308032e-05, "loss": 2.9472, "step": 226000 }, { "epoch": 12.21, "grad_norm": 0.8286164402961731, "learning_rate": 4.2654604295380996e-05, "loss": 2.9479, "step": 227000 }, { "epoch": 12.26, "grad_norm": 0.8598948121070862, "learning_rate": 4.2360694321859373e-05, "loss": 2.9519, "step": 228000 }, { "epoch": 12.32, "grad_norm": 0.8387620449066162, "learning_rate": 4.206649014416005e-05, "loss": 2.9544, "step": 229000 }, { "epoch": 12.37, "grad_norm": 0.8272960186004639, "learning_rate": 4.1772580170638425e-05, "loss": 2.9589, "step": 230000 }, { "epoch": 12.42, "grad_norm": 0.8225711584091187, "learning_rate": 4.14783759929391e-05, "loss": 2.9627, "step": 231000 }, { "epoch": 12.48, "grad_norm": 0.8449904918670654, "learning_rate": 4.118417181523978e-05, "loss": 2.9605, "step": 232000 }, { "epoch": 12.53, "grad_norm": 0.8118672966957092, "learning_rate": 4.0889967637540455e-05, "loss": 2.9599, "step": 233000 }, { "epoch": 12.58, "grad_norm": 0.8042457699775696, "learning_rate": 4.059605766401883e-05, "loss": 2.9606, "step": 234000 }, { "epoch": 12.64, "grad_norm": 0.8121609091758728, "learning_rate": 4.030185348631951e-05, "loss": 2.9625, "step": 235000 }, { "epoch": 12.69, "grad_norm": 0.8004827499389648, "learning_rate": 4.000764930862018e-05, "loss": 2.9615, "step": 236000 }, { "epoch": 12.75, "grad_norm": 0.8061490654945374, "learning_rate": 3.971344513092086e-05, "loss": 2.9628, "step": 237000 }, { "epoch": 12.8, "grad_norm": 0.8228151202201843, "learning_rate": 3.941953515739923e-05, "loss": 2.9676, "step": 238000 }, { "epoch": 12.85, "grad_norm": 0.8485589623451233, "learning_rate": 3.912562518387761e-05, "loss": 2.9685, "step": 239000 }, { "epoch": 12.91, "grad_norm": 0.8636566400527954, "learning_rate": 3.8831421006178285e-05, "loss": 2.9675, "step": 240000 }, { "epoch": 12.96, "grad_norm": 0.8194534182548523, "learning_rate": 3.8537216828478966e-05, "loss": 2.9702, "step": 241000 }, { "epoch": 13.0, "eval_accuracy": 0.40538538437266325, "eval_loss": 3.392601728439331, "eval_runtime": 154.6837, "eval_samples_per_second": 374.448, "eval_steps_per_second": 5.857, "step": 241735 }, { "epoch": 13.01, "grad_norm": 0.8235858082771301, "learning_rate": 3.8243306854957344e-05, "loss": 2.9554, "step": 242000 }, { "epoch": 13.07, "grad_norm": 0.8525816798210144, "learning_rate": 3.794910267725802e-05, "loss": 2.9226, "step": 243000 }, { "epoch": 13.12, "grad_norm": 0.8664265871047974, "learning_rate": 3.765489849955869e-05, "loss": 2.9269, "step": 244000 }, { "epoch": 13.18, "grad_norm": 0.816015362739563, "learning_rate": 3.736098852603707e-05, "loss": 2.9324, "step": 245000 }, { "epoch": 13.23, "grad_norm": 0.8406867980957031, "learning_rate": 3.7066784348337744e-05, "loss": 2.9303, "step": 246000 }, { "epoch": 13.28, "grad_norm": 0.8618828058242798, "learning_rate": 3.6772580170638426e-05, "loss": 2.9388, "step": 247000 }, { "epoch": 13.34, "grad_norm": 0.8584656715393066, "learning_rate": 3.64783759929391e-05, "loss": 2.9345, "step": 248000 }, { "epoch": 13.39, "grad_norm": 0.8428023457527161, "learning_rate": 3.618446601941748e-05, "loss": 2.9364, "step": 249000 }, { "epoch": 13.44, "grad_norm": 0.8423276543617249, "learning_rate": 3.589055604589585e-05, "loss": 2.9406, "step": 250000 }, { "epoch": 13.5, "grad_norm": 0.8599599003791809, "learning_rate": 3.559635186819653e-05, "loss": 2.9417, "step": 251000 }, { "epoch": 13.55, "grad_norm": 0.859804630279541, "learning_rate": 3.5302147690497204e-05, "loss": 2.9413, "step": 252000 }, { "epoch": 13.61, "grad_norm": 0.8606885671615601, "learning_rate": 3.500823771697558e-05, "loss": 2.9451, "step": 253000 }, { "epoch": 13.66, "grad_norm": 0.8308110237121582, "learning_rate": 3.4714033539276255e-05, "loss": 2.9439, "step": 254000 }, { "epoch": 13.71, "grad_norm": 0.8358808159828186, "learning_rate": 3.441982936157694e-05, "loss": 2.9456, "step": 255000 }, { "epoch": 13.77, "grad_norm": 0.7978032827377319, "learning_rate": 3.4125919388055314e-05, "loss": 2.9455, "step": 256000 }, { "epoch": 13.82, "grad_norm": 0.8252397179603577, "learning_rate": 3.383171521035599e-05, "loss": 2.9427, "step": 257000 }, { "epoch": 13.87, "grad_norm": 0.8591821789741516, "learning_rate": 3.3537805236834366e-05, "loss": 2.946, "step": 258000 }, { "epoch": 13.93, "grad_norm": 0.8396506309509277, "learning_rate": 3.324360105913504e-05, "loss": 2.9508, "step": 259000 }, { "epoch": 13.98, "grad_norm": 0.8087401986122131, "learning_rate": 3.2949396881435715e-05, "loss": 2.9486, "step": 260000 }, { "epoch": 14.0, "eval_accuracy": 0.4053156530057007, "eval_loss": 3.403454065322876, "eval_runtime": 154.7962, "eval_samples_per_second": 374.176, "eval_steps_per_second": 5.853, "step": 260330 }, { "epoch": 14.04, "grad_norm": 0.8584620952606201, "learning_rate": 3.26554869079141e-05, "loss": 2.9161, "step": 261000 }, { "epoch": 14.09, "grad_norm": 0.8663614988327026, "learning_rate": 3.236128273021477e-05, "loss": 2.9065, "step": 262000 }, { "epoch": 14.14, "grad_norm": 0.8541947603225708, "learning_rate": 3.206737275669315e-05, "loss": 2.9096, "step": 263000 }, { "epoch": 14.2, "grad_norm": 0.8692386746406555, "learning_rate": 3.1773168578993825e-05, "loss": 2.9083, "step": 264000 }, { "epoch": 14.25, "grad_norm": 0.8886841535568237, "learning_rate": 3.14789644012945e-05, "loss": 2.9165, "step": 265000 }, { "epoch": 14.3, "grad_norm": 0.812725305557251, "learning_rate": 3.118476022359518e-05, "loss": 2.9143, "step": 266000 }, { "epoch": 14.36, "grad_norm": 0.869747519493103, "learning_rate": 3.089085025007356e-05, "loss": 2.9188, "step": 267000 }, { "epoch": 14.41, "grad_norm": 0.8489671349525452, "learning_rate": 3.059664607237423e-05, "loss": 2.9206, "step": 268000 }, { "epoch": 14.47, "grad_norm": 0.8796459436416626, "learning_rate": 3.0302441894674903e-05, "loss": 2.9219, "step": 269000 }, { "epoch": 14.52, "grad_norm": 0.8939893841743469, "learning_rate": 3.0008237716975585e-05, "loss": 2.9264, "step": 270000 }, { "epoch": 14.57, "grad_norm": 0.8177582621574402, "learning_rate": 2.9714327743453955e-05, "loss": 2.9238, "step": 271000 }, { "epoch": 14.63, "grad_norm": 0.8382042646408081, "learning_rate": 2.9420123565754636e-05, "loss": 2.925, "step": 272000 }, { "epoch": 14.68, "grad_norm": 0.8616077303886414, "learning_rate": 2.912621359223301e-05, "loss": 2.927, "step": 273000 }, { "epoch": 14.74, "grad_norm": 0.8544275164604187, "learning_rate": 2.8832009414533688e-05, "loss": 2.9285, "step": 274000 }, { "epoch": 14.79, "grad_norm": 0.8977128267288208, "learning_rate": 2.8538099441012062e-05, "loss": 2.9256, "step": 275000 }, { "epoch": 14.84, "grad_norm": 0.8525545597076416, "learning_rate": 2.8243895263312743e-05, "loss": 2.9269, "step": 276000 }, { "epoch": 14.9, "grad_norm": 0.8997774124145508, "learning_rate": 2.7949691085613418e-05, "loss": 2.9319, "step": 277000 }, { "epoch": 14.95, "grad_norm": 0.8549333214759827, "learning_rate": 2.7655486907914092e-05, "loss": 2.9284, "step": 278000 }, { "epoch": 15.0, "eval_accuracy": 0.40559088365160195, "eval_loss": 3.4107415676116943, "eval_runtime": 154.2194, "eval_samples_per_second": 375.575, "eval_steps_per_second": 5.875, "step": 278925 }, { "epoch": 15.0, "grad_norm": 0.8630433082580566, "learning_rate": 2.736157693439247e-05, "loss": 2.925, "step": 279000 }, { "epoch": 15.06, "grad_norm": 0.8847051858901978, "learning_rate": 2.7067666960870847e-05, "loss": 2.8897, "step": 280000 }, { "epoch": 15.11, "grad_norm": 0.9147212505340576, "learning_rate": 2.677346278317152e-05, "loss": 2.89, "step": 281000 }, { "epoch": 15.17, "grad_norm": 0.8927858471870422, "learning_rate": 2.6479258605472203e-05, "loss": 2.8969, "step": 282000 }, { "epoch": 15.22, "grad_norm": 0.9007779955863953, "learning_rate": 2.6185054427772877e-05, "loss": 2.8942, "step": 283000 }, { "epoch": 15.27, "grad_norm": 0.9066419005393982, "learning_rate": 2.5891144454251254e-05, "loss": 2.8991, "step": 284000 }, { "epoch": 15.33, "grad_norm": 0.8653886318206787, "learning_rate": 2.5597234480729625e-05, "loss": 2.8948, "step": 285000 }, { "epoch": 15.38, "grad_norm": 0.8762679696083069, "learning_rate": 2.5303030303030306e-05, "loss": 2.9002, "step": 286000 }, { "epoch": 15.43, "grad_norm": 0.8709083199501038, "learning_rate": 2.500882612533098e-05, "loss": 2.9028, "step": 287000 }, { "epoch": 15.49, "grad_norm": 0.8785495758056641, "learning_rate": 2.4714916151809355e-05, "loss": 2.9096, "step": 288000 }, { "epoch": 15.54, "grad_norm": 0.86436927318573, "learning_rate": 2.4420711974110032e-05, "loss": 2.9034, "step": 289000 }, { "epoch": 15.6, "grad_norm": 0.9002882838249207, "learning_rate": 2.412650779641071e-05, "loss": 2.9057, "step": 290000 }, { "epoch": 15.65, "grad_norm": 0.9001891016960144, "learning_rate": 2.3832597822889084e-05, "loss": 2.9091, "step": 291000 }, { "epoch": 15.7, "grad_norm": 0.8884735107421875, "learning_rate": 2.3538393645189762e-05, "loss": 2.9057, "step": 292000 }, { "epoch": 15.76, "grad_norm": 0.9057520031929016, "learning_rate": 2.324418946749044e-05, "loss": 2.9103, "step": 293000 }, { "epoch": 15.81, "grad_norm": 0.868931233882904, "learning_rate": 2.2950279493968814e-05, "loss": 2.9129, "step": 294000 }, { "epoch": 15.86, "grad_norm": 0.8856936693191528, "learning_rate": 2.265607531626949e-05, "loss": 2.9105, "step": 295000 }, { "epoch": 15.92, "grad_norm": 0.8503423929214478, "learning_rate": 2.236187113857017e-05, "loss": 2.9097, "step": 296000 }, { "epoch": 15.97, "grad_norm": 0.8861287832260132, "learning_rate": 2.2067666960870844e-05, "loss": 2.9143, "step": 297000 }, { "epoch": 16.0, "eval_accuracy": 0.406079406291825, "eval_loss": 3.4056761264801025, "eval_runtime": 154.0737, "eval_samples_per_second": 375.931, "eval_steps_per_second": 5.88, "step": 297520 }, { "epoch": 16.03, "grad_norm": 0.8781143426895142, "learning_rate": 2.177375698734922e-05, "loss": 2.8954, "step": 298000 }, { "epoch": 16.08, "grad_norm": 0.8801673650741577, "learning_rate": 2.1479552809649896e-05, "loss": 2.8745, "step": 299000 }, { "epoch": 16.13, "grad_norm": 0.9237924814224243, "learning_rate": 2.1185642836128276e-05, "loss": 2.8779, "step": 300000 }, { "epoch": 16.19, "grad_norm": 0.9134716391563416, "learning_rate": 2.089143865842895e-05, "loss": 2.8788, "step": 301000 }, { "epoch": 16.24, "grad_norm": 0.9117680191993713, "learning_rate": 2.0597528684907328e-05, "loss": 2.8831, "step": 302000 }, { "epoch": 16.29, "grad_norm": 0.9149305820465088, "learning_rate": 2.0303324507208003e-05, "loss": 2.8851, "step": 303000 }, { "epoch": 16.35, "grad_norm": 0.8993018865585327, "learning_rate": 2.000912032950868e-05, "loss": 2.8821, "step": 304000 }, { "epoch": 16.4, "grad_norm": 0.9155817031860352, "learning_rate": 1.9715210355987058e-05, "loss": 2.885, "step": 305000 }, { "epoch": 16.46, "grad_norm": 0.8847028613090515, "learning_rate": 1.9421006178287732e-05, "loss": 2.887, "step": 306000 }, { "epoch": 16.51, "grad_norm": 0.9059274792671204, "learning_rate": 1.912680200058841e-05, "loss": 2.89, "step": 307000 }, { "epoch": 16.56, "grad_norm": 0.9086124897003174, "learning_rate": 1.8832597822889088e-05, "loss": 2.8899, "step": 308000 }, { "epoch": 16.62, "grad_norm": 0.9034336805343628, "learning_rate": 1.8538687849367462e-05, "loss": 2.8892, "step": 309000 }, { "epoch": 16.67, "grad_norm": 0.9163500666618347, "learning_rate": 1.8244777875845836e-05, "loss": 2.8877, "step": 310000 }, { "epoch": 16.72, "grad_norm": 0.9036159515380859, "learning_rate": 1.7950867902324213e-05, "loss": 2.8946, "step": 311000 }, { "epoch": 16.78, "grad_norm": 0.891083836555481, "learning_rate": 1.765666372462489e-05, "loss": 2.8943, "step": 312000 }, { "epoch": 16.83, "grad_norm": 0.889603853225708, "learning_rate": 1.7362459546925565e-05, "loss": 2.8922, "step": 313000 }, { "epoch": 16.89, "grad_norm": 0.8617958426475525, "learning_rate": 1.7068255369226243e-05, "loss": 2.896, "step": 314000 }, { "epoch": 16.94, "grad_norm": 0.8788795471191406, "learning_rate": 1.677405119152692e-05, "loss": 2.8914, "step": 315000 }, { "epoch": 16.99, "grad_norm": 0.8557174801826477, "learning_rate": 1.64798470138276e-05, "loss": 2.8931, "step": 316000 }, { "epoch": 17.0, "eval_accuracy": 0.4057667571763687, "eval_loss": 3.416048526763916, "eval_runtime": 154.7761, "eval_samples_per_second": 374.225, "eval_steps_per_second": 5.854, "step": 316115 }, { "epoch": 17.05, "grad_norm": 0.8975817561149597, "learning_rate": 1.6185937040305973e-05, "loss": 2.8679, "step": 317000 }, { "epoch": 17.1, "grad_norm": 0.8618032336235046, "learning_rate": 1.589173286260665e-05, "loss": 2.8632, "step": 318000 }, { "epoch": 17.16, "grad_norm": 0.9247516393661499, "learning_rate": 1.5597528684907325e-05, "loss": 2.8628, "step": 319000 }, { "epoch": 17.21, "grad_norm": 0.9377091526985168, "learning_rate": 1.5303618711385702e-05, "loss": 2.8678, "step": 320000 }, { "epoch": 17.26, "grad_norm": 0.8855466842651367, "learning_rate": 1.5009708737864078e-05, "loss": 2.8718, "step": 321000 }, { "epoch": 17.32, "grad_norm": 0.9599144458770752, "learning_rate": 1.4715504560164756e-05, "loss": 2.868, "step": 322000 }, { "epoch": 17.37, "grad_norm": 0.9311389923095703, "learning_rate": 1.4421300382465434e-05, "loss": 2.8744, "step": 323000 }, { "epoch": 17.42, "grad_norm": 0.915178120136261, "learning_rate": 1.4127096204766108e-05, "loss": 2.8702, "step": 324000 }, { "epoch": 17.48, "grad_norm": 0.899299144744873, "learning_rate": 1.3833186231244486e-05, "loss": 2.8738, "step": 325000 }, { "epoch": 17.53, "grad_norm": 0.8956254720687866, "learning_rate": 1.353898205354516e-05, "loss": 2.8688, "step": 326000 }, { "epoch": 17.59, "grad_norm": 0.885826587677002, "learning_rate": 1.3244777875845838e-05, "loss": 2.8747, "step": 327000 }, { "epoch": 17.64, "grad_norm": 0.9065870642662048, "learning_rate": 1.2950573698146514e-05, "loss": 2.8743, "step": 328000 }, { "epoch": 17.69, "grad_norm": 0.9024778604507446, "learning_rate": 1.265666372462489e-05, "loss": 2.877, "step": 329000 }, { "epoch": 17.75, "grad_norm": 0.8808609843254089, "learning_rate": 1.2362459546925568e-05, "loss": 2.8727, "step": 330000 }, { "epoch": 17.8, "grad_norm": 0.9356994032859802, "learning_rate": 1.2068549573403943e-05, "loss": 2.8764, "step": 331000 }, { "epoch": 17.85, "grad_norm": 0.907799243927002, "learning_rate": 1.177434539570462e-05, "loss": 2.8779, "step": 332000 }, { "epoch": 17.91, "grad_norm": 0.9151359796524048, "learning_rate": 1.1480141218005295e-05, "loss": 2.8767, "step": 333000 }, { "epoch": 17.96, "grad_norm": 0.9413455128669739, "learning_rate": 1.118652544866137e-05, "loss": 2.8785, "step": 334000 }, { "epoch": 18.0, "eval_accuracy": 0.4062673719611517, "eval_loss": 3.413921594619751, "eval_runtime": 154.9599, "eval_samples_per_second": 373.78, "eval_steps_per_second": 5.847, "step": 334710 }, { "epoch": 18.02, "grad_norm": 0.926497220993042, "learning_rate": 1.0892321270962048e-05, "loss": 2.8691, "step": 335000 }, { "epoch": 18.07, "grad_norm": 0.9234093427658081, "learning_rate": 1.0598117093262724e-05, "loss": 2.8486, "step": 336000 }, { "epoch": 18.12, "grad_norm": 0.902645468711853, "learning_rate": 1.03042071197411e-05, "loss": 2.852, "step": 337000 }, { "epoch": 18.18, "grad_norm": 0.9143151640892029, "learning_rate": 1.0010002942041776e-05, "loss": 2.854, "step": 338000 }, { "epoch": 18.23, "grad_norm": 0.9151068925857544, "learning_rate": 9.716092968520154e-06, "loss": 2.8546, "step": 339000 }, { "epoch": 18.28, "grad_norm": 0.9283619523048401, "learning_rate": 9.421888790820831e-06, "loss": 2.8563, "step": 340000 }, { "epoch": 18.34, "grad_norm": 0.9281036257743835, "learning_rate": 9.127684613121506e-06, "loss": 2.8524, "step": 341000 }, { "epoch": 18.39, "grad_norm": 0.9401130080223083, "learning_rate": 8.833774639599883e-06, "loss": 2.8571, "step": 342000 }, { "epoch": 18.45, "grad_norm": 0.9209133386611938, "learning_rate": 8.53957046190056e-06, "loss": 2.8612, "step": 343000 }, { "epoch": 18.5, "grad_norm": 0.9208481907844543, "learning_rate": 8.245366284201237e-06, "loss": 2.8564, "step": 344000 }, { "epoch": 18.55, "grad_norm": 0.9125988483428955, "learning_rate": 7.951162106501912e-06, "loss": 2.8604, "step": 345000 }, { "epoch": 18.61, "grad_norm": 0.9358850121498108, "learning_rate": 7.657252132980289e-06, "loss": 2.8611, "step": 346000 }, { "epoch": 18.66, "grad_norm": 0.9063226580619812, "learning_rate": 7.363342159458665e-06, "loss": 2.8621, "step": 347000 }, { "epoch": 18.71, "grad_norm": 0.9140480160713196, "learning_rate": 7.0691379817593416e-06, "loss": 2.8606, "step": 348000 }, { "epoch": 18.77, "grad_norm": 0.9280020594596863, "learning_rate": 6.774933804060019e-06, "loss": 2.8631, "step": 349000 }, { "epoch": 18.82, "grad_norm": 0.9321267604827881, "learning_rate": 6.480729626360695e-06, "loss": 2.8611, "step": 350000 }, { "epoch": 18.88, "grad_norm": 0.9073074460029602, "learning_rate": 6.186819652839071e-06, "loss": 2.8619, "step": 351000 }, { "epoch": 18.93, "grad_norm": 0.9515230059623718, "learning_rate": 5.892615475139747e-06, "loss": 2.8616, "step": 352000 }, { "epoch": 18.98, "grad_norm": 0.9491100311279297, "learning_rate": 5.598705501618123e-06, "loss": 2.8611, "step": 353000 }, { "epoch": 19.0, "eval_accuracy": 0.40620073080891006, "eval_loss": 3.419053316116333, "eval_runtime": 154.2987, "eval_samples_per_second": 375.382, "eval_steps_per_second": 5.872, "step": 353305 }, { "epoch": 19.04, "grad_norm": 0.9301890730857849, "learning_rate": 5.3045013239188e-06, "loss": 2.8451, "step": 354000 }, { "epoch": 19.09, "grad_norm": 0.9505791068077087, "learning_rate": 5.010297146219477e-06, "loss": 2.846, "step": 355000 }, { "epoch": 19.14, "grad_norm": 0.9676011800765991, "learning_rate": 4.716092968520153e-06, "loss": 2.8403, "step": 356000 }, { "epoch": 19.2, "grad_norm": 0.9345271587371826, "learning_rate": 4.422182994998529e-06, "loss": 2.8453, "step": 357000 }, { "epoch": 19.25, "grad_norm": 0.9560821652412415, "learning_rate": 4.128273021476905e-06, "loss": 2.8451, "step": 358000 }, { "epoch": 19.31, "grad_norm": 0.9539549350738525, "learning_rate": 3.834068843777581e-06, "loss": 2.8468, "step": 359000 }, { "epoch": 19.36, "grad_norm": 0.9361310005187988, "learning_rate": 3.5398646660782584e-06, "loss": 2.8483, "step": 360000 }, { "epoch": 19.41, "grad_norm": 0.9409059286117554, "learning_rate": 3.245954692556635e-06, "loss": 2.8457, "step": 361000 }, { "epoch": 19.47, "grad_norm": 0.951979398727417, "learning_rate": 2.951750514857311e-06, "loss": 2.848, "step": 362000 }, { "epoch": 19.52, "grad_norm": 0.9415243864059448, "learning_rate": 2.6575463371579876e-06, "loss": 2.8434, "step": 363000 }, { "epoch": 19.58, "grad_norm": 0.9147111177444458, "learning_rate": 2.363342159458664e-06, "loss": 2.8483, "step": 364000 }, { "epoch": 19.63, "grad_norm": 0.90256667137146, "learning_rate": 2.0694321859370406e-06, "loss": 2.8465, "step": 365000 }, { "epoch": 19.68, "grad_norm": 0.9336329698562622, "learning_rate": 1.7752280082377172e-06, "loss": 2.8446, "step": 366000 }, { "epoch": 19.74, "grad_norm": 0.9325823783874512, "learning_rate": 1.481318034716093e-06, "loss": 2.8465, "step": 367000 }, { "epoch": 19.79, "grad_norm": 0.9516511559486389, "learning_rate": 1.1871138570167696e-06, "loss": 2.8487, "step": 368000 }, { "epoch": 19.84, "grad_norm": 0.9199581742286682, "learning_rate": 8.929096793174464e-07, "loss": 2.8455, "step": 369000 }, { "epoch": 19.9, "grad_norm": 0.9448347687721252, "learning_rate": 5.989997057958223e-07, "loss": 2.8482, "step": 370000 }, { "epoch": 19.95, "grad_norm": 0.9607127904891968, "learning_rate": 3.04795528096499e-07, "loss": 2.8443, "step": 371000 }, { "epoch": 20.0, "eval_accuracy": 0.4062615946031953, "eval_loss": 3.4226667881011963, "eval_runtime": 154.8159, "eval_samples_per_second": 374.128, "eval_steps_per_second": 5.852, "step": 371900 }, { "epoch": 20.0, "step": 371900, "total_flos": 1.5669257538816e+18, "train_loss": 3.1573533160585843, "train_runtime": 81388.648, "train_samples_per_second": 146.221, "train_steps_per_second": 4.569 } ], "logging_steps": 1000, "max_steps": 371900, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 5000, "total_flos": 1.5669257538816e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }