{ "best_metric": 5.700723648071289, "best_model_checkpoint": "./results/models/checkpoint-101465", "epoch": 13.0, "eval_steps": 500, "global_step": 101465, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06, "learning_rate": 0.001997437540038437, "loss": 5.9174, "step": 500 }, { "epoch": 0.13, "learning_rate": 0.0019948750800768736, "loss": 5.8554, "step": 1000 }, { "epoch": 0.19, "learning_rate": 0.0019923126201153107, "loss": 5.8681, "step": 1500 }, { "epoch": 0.26, "learning_rate": 0.0019897501601537477, "loss": 5.8742, "step": 2000 }, { "epoch": 0.32, "learning_rate": 0.0019871877001921847, "loss": 5.8751, "step": 2500 }, { "epoch": 0.38, "learning_rate": 0.0019846252402306217, "loss": 5.8581, "step": 3000 }, { "epoch": 0.45, "learning_rate": 0.0019820627802690583, "loss": 5.8585, "step": 3500 }, { "epoch": 0.51, "learning_rate": 0.0019795003203074953, "loss": 5.851, "step": 4000 }, { "epoch": 0.58, "learning_rate": 0.001976937860345932, "loss": 5.8471, "step": 4500 }, { "epoch": 0.64, "learning_rate": 0.001974375400384369, "loss": 5.8427, "step": 5000 }, { "epoch": 0.7, "learning_rate": 0.001971812940422806, "loss": 5.8416, "step": 5500 }, { "epoch": 0.77, "learning_rate": 0.001969250480461243, "loss": 5.8367, "step": 6000 }, { "epoch": 0.83, "learning_rate": 0.00196668802049968, "loss": 5.839, "step": 6500 }, { "epoch": 0.9, "learning_rate": 0.0019641255605381165, "loss": 5.8303, "step": 7000 }, { "epoch": 0.96, "learning_rate": 0.0019615631005765535, "loss": 5.8259, "step": 7500 }, { "epoch": 1.0, "eval_loss": 5.826081275939941, "eval_runtime": 2.7402, "eval_samples_per_second": 364.938, "eval_steps_per_second": 2.92, "step": 7805 }, { "epoch": 1.02, "learning_rate": 0.0019590006406149905, "loss": 5.8257, "step": 8000 }, { "epoch": 1.09, "learning_rate": 0.001956438180653427, "loss": 5.8125, "step": 8500 }, { "epoch": 1.15, "learning_rate": 0.001953875720691864, "loss": 5.8103, "step": 9000 }, { "epoch": 1.22, "learning_rate": 0.001951313260730301, "loss": 5.8115, "step": 9500 }, { "epoch": 1.28, "learning_rate": 0.001948750800768738, "loss": 5.8042, "step": 10000 }, { "epoch": 1.35, "learning_rate": 0.001946188340807175, "loss": 5.8003, "step": 10500 }, { "epoch": 1.41, "learning_rate": 0.0019436258808456118, "loss": 5.7984, "step": 11000 }, { "epoch": 1.47, "learning_rate": 0.0019410634208840488, "loss": 5.7964, "step": 11500 }, { "epoch": 1.54, "learning_rate": 0.0019385009609224858, "loss": 5.7938, "step": 12000 }, { "epoch": 1.6, "learning_rate": 0.0019359385009609226, "loss": 5.789, "step": 12500 }, { "epoch": 1.67, "learning_rate": 0.0019333760409993594, "loss": 5.785, "step": 13000 }, { "epoch": 1.73, "learning_rate": 0.0019308135810377962, "loss": 5.7901, "step": 13500 }, { "epoch": 1.79, "learning_rate": 0.0019282511210762332, "loss": 5.7844, "step": 14000 }, { "epoch": 1.86, "learning_rate": 0.0019256886611146702, "loss": 5.7802, "step": 14500 }, { "epoch": 1.92, "learning_rate": 0.001923126201153107, "loss": 5.7819, "step": 15000 }, { "epoch": 1.99, "learning_rate": 0.001920563741191544, "loss": 5.7801, "step": 15500 }, { "epoch": 2.0, "eval_loss": 5.780016899108887, "eval_runtime": 2.7486, "eval_samples_per_second": 363.821, "eval_steps_per_second": 2.911, "step": 15610 }, { "epoch": 2.05, "learning_rate": 0.0019180012812299808, "loss": 5.7782, "step": 16000 }, { "epoch": 2.11, "learning_rate": 0.0019154388212684176, "loss": 5.7768, "step": 16500 }, { "epoch": 2.18, "learning_rate": 0.0019128763613068546, "loss": 5.7762, "step": 17000 }, { "epoch": 2.24, "learning_rate": 0.0019103139013452914, "loss": 5.7759, "step": 17500 }, { "epoch": 2.31, "learning_rate": 0.0019077514413837285, "loss": 5.7745, "step": 18000 }, { "epoch": 2.37, "learning_rate": 0.0019051889814221653, "loss": 5.7711, "step": 18500 }, { "epoch": 2.43, "learning_rate": 0.0019026265214606023, "loss": 5.7691, "step": 19000 }, { "epoch": 2.5, "learning_rate": 0.0019000640614990393, "loss": 5.7619, "step": 19500 }, { "epoch": 2.56, "learning_rate": 0.0018975016015374759, "loss": 5.7638, "step": 20000 }, { "epoch": 2.63, "learning_rate": 0.0018949391415759129, "loss": 5.7628, "step": 20500 }, { "epoch": 2.69, "learning_rate": 0.0018923766816143497, "loss": 5.7595, "step": 21000 }, { "epoch": 2.75, "learning_rate": 0.0018898142216527867, "loss": 5.762, "step": 21500 }, { "epoch": 2.82, "learning_rate": 0.0018872517616912237, "loss": 5.7616, "step": 22000 }, { "epoch": 2.88, "learning_rate": 0.0018846893017296605, "loss": 5.769, "step": 22500 }, { "epoch": 2.95, "learning_rate": 0.0018821268417680975, "loss": 5.7605, "step": 23000 }, { "epoch": 3.0, "eval_loss": 5.762243270874023, "eval_runtime": 2.7574, "eval_samples_per_second": 362.657, "eval_steps_per_second": 2.901, "step": 23415 }, { "epoch": 3.01, "learning_rate": 0.0018795643818065343, "loss": 5.7585, "step": 23500 }, { "epoch": 3.07, "learning_rate": 0.0018770019218449711, "loss": 5.7596, "step": 24000 }, { "epoch": 3.14, "learning_rate": 0.0018744394618834081, "loss": 5.7545, "step": 24500 }, { "epoch": 3.2, "learning_rate": 0.001871877001921845, "loss": 5.7538, "step": 25000 }, { "epoch": 3.27, "learning_rate": 0.001869314541960282, "loss": 5.7587, "step": 25500 }, { "epoch": 3.33, "learning_rate": 0.0018667520819987187, "loss": 5.7575, "step": 26000 }, { "epoch": 3.4, "learning_rate": 0.0018641896220371558, "loss": 5.7585, "step": 26500 }, { "epoch": 3.46, "learning_rate": 0.0018616271620755928, "loss": 5.7565, "step": 27000 }, { "epoch": 3.52, "learning_rate": 0.0018590647021140294, "loss": 5.7532, "step": 27500 }, { "epoch": 3.59, "learning_rate": 0.0018565022421524664, "loss": 5.7532, "step": 28000 }, { "epoch": 3.65, "learning_rate": 0.0018539397821909032, "loss": 5.7537, "step": 28500 }, { "epoch": 3.72, "learning_rate": 0.0018513773222293402, "loss": 5.753, "step": 29000 }, { "epoch": 3.78, "learning_rate": 0.0018488148622677772, "loss": 5.7499, "step": 29500 }, { "epoch": 3.84, "learning_rate": 0.001846252402306214, "loss": 5.748, "step": 30000 }, { "epoch": 3.91, "learning_rate": 0.001843689942344651, "loss": 5.7494, "step": 30500 }, { "epoch": 3.97, "learning_rate": 0.0018411274823830876, "loss": 5.7497, "step": 31000 }, { "epoch": 4.0, "eval_loss": 5.748880863189697, "eval_runtime": 2.7417, "eval_samples_per_second": 364.734, "eval_steps_per_second": 2.918, "step": 31220 }, { "epoch": 4.04, "learning_rate": 0.0018385650224215246, "loss": 5.746, "step": 31500 }, { "epoch": 4.1, "learning_rate": 0.0018360025624599616, "loss": 5.744, "step": 32000 }, { "epoch": 4.16, "learning_rate": 0.0018334401024983984, "loss": 5.7402, "step": 32500 }, { "epoch": 4.23, "learning_rate": 0.0018308776425368354, "loss": 5.7429, "step": 33000 }, { "epoch": 4.29, "learning_rate": 0.0018283151825752725, "loss": 5.7466, "step": 33500 }, { "epoch": 4.36, "learning_rate": 0.0018257527226137093, "loss": 5.7444, "step": 34000 }, { "epoch": 4.42, "learning_rate": 0.0018231902626521463, "loss": 5.744, "step": 34500 }, { "epoch": 4.48, "learning_rate": 0.0018206278026905828, "loss": 5.7411, "step": 35000 }, { "epoch": 4.55, "learning_rate": 0.0018180653427290199, "loss": 5.741, "step": 35500 }, { "epoch": 4.61, "learning_rate": 0.0018155028827674569, "loss": 5.7425, "step": 36000 }, { "epoch": 4.68, "learning_rate": 0.0018129404228058937, "loss": 5.7388, "step": 36500 }, { "epoch": 4.74, "learning_rate": 0.0018103779628443307, "loss": 5.7413, "step": 37000 }, { "epoch": 4.8, "learning_rate": 0.0018078155028827675, "loss": 5.7428, "step": 37500 }, { "epoch": 4.87, "learning_rate": 0.0018052530429212045, "loss": 5.7419, "step": 38000 }, { "epoch": 4.93, "learning_rate": 0.0018026905829596413, "loss": 5.7407, "step": 38500 }, { "epoch": 5.0, "learning_rate": 0.001800128122998078, "loss": 5.7419, "step": 39000 }, { "epoch": 5.0, "eval_loss": 5.745668411254883, "eval_runtime": 2.7372, "eval_samples_per_second": 365.338, "eval_steps_per_second": 2.923, "step": 39025 }, { "epoch": 5.06, "learning_rate": 0.0017975656630365151, "loss": 5.7408, "step": 39500 }, { "epoch": 5.12, "learning_rate": 0.001795003203074952, "loss": 5.738, "step": 40000 }, { "epoch": 5.19, "learning_rate": 0.001792440743113389, "loss": 5.7374, "step": 40500 }, { "epoch": 5.25, "learning_rate": 0.001789878283151826, "loss": 5.7356, "step": 41000 }, { "epoch": 5.32, "learning_rate": 0.0017873158231902627, "loss": 5.7342, "step": 41500 }, { "epoch": 5.38, "learning_rate": 0.0017847533632286995, "loss": 5.7325, "step": 42000 }, { "epoch": 5.45, "learning_rate": 0.0017821909032671363, "loss": 5.733, "step": 42500 }, { "epoch": 5.51, "learning_rate": 0.0017796284433055734, "loss": 5.7308, "step": 43000 }, { "epoch": 5.57, "learning_rate": 0.0017770659833440104, "loss": 5.7293, "step": 43500 }, { "epoch": 5.64, "learning_rate": 0.0017745035233824472, "loss": 5.7285, "step": 44000 }, { "epoch": 5.7, "learning_rate": 0.0017719410634208842, "loss": 5.7295, "step": 44500 }, { "epoch": 5.77, "learning_rate": 0.001769378603459321, "loss": 5.7277, "step": 45000 }, { "epoch": 5.83, "learning_rate": 0.0017668161434977578, "loss": 5.729, "step": 45500 }, { "epoch": 5.89, "learning_rate": 0.0017642536835361948, "loss": 5.7304, "step": 46000 }, { "epoch": 5.96, "learning_rate": 0.0017616912235746316, "loss": 5.7266, "step": 46500 }, { "epoch": 6.0, "eval_loss": 5.728877067565918, "eval_runtime": 2.7253, "eval_samples_per_second": 366.932, "eval_steps_per_second": 2.935, "step": 46830 }, { "epoch": 6.02, "learning_rate": 0.0017591287636130686, "loss": 5.726, "step": 47000 }, { "epoch": 6.09, "learning_rate": 0.0017565663036515054, "loss": 5.7246, "step": 47500 }, { "epoch": 6.15, "learning_rate": 0.0017540038436899424, "loss": 5.7244, "step": 48000 }, { "epoch": 6.21, "learning_rate": 0.0017514413837283794, "loss": 5.7297, "step": 48500 }, { "epoch": 6.28, "learning_rate": 0.001748878923766816, "loss": 5.7275, "step": 49000 }, { "epoch": 6.34, "learning_rate": 0.001746316463805253, "loss": 5.7255, "step": 49500 }, { "epoch": 6.41, "learning_rate": 0.0017437540038436898, "loss": 5.7255, "step": 50000 }, { "epoch": 6.47, "learning_rate": 0.0017411915438821268, "loss": 5.7233, "step": 50500 }, { "epoch": 6.53, "learning_rate": 0.0017386290839205639, "loss": 5.7238, "step": 51000 }, { "epoch": 6.6, "learning_rate": 0.0017360666239590007, "loss": 5.7246, "step": 51500 }, { "epoch": 6.66, "learning_rate": 0.0017335041639974377, "loss": 5.7229, "step": 52000 }, { "epoch": 6.73, "learning_rate": 0.0017309417040358745, "loss": 5.7221, "step": 52500 }, { "epoch": 6.79, "learning_rate": 0.0017283792440743113, "loss": 5.7184, "step": 53000 }, { "epoch": 6.85, "learning_rate": 0.0017258167841127483, "loss": 5.7186, "step": 53500 }, { "epoch": 6.92, "learning_rate": 0.001723254324151185, "loss": 5.7164, "step": 54000 }, { "epoch": 6.98, "learning_rate": 0.001720691864189622, "loss": 5.7184, "step": 54500 }, { "epoch": 7.0, "eval_loss": 5.722599029541016, "eval_runtime": 2.7479, "eval_samples_per_second": 363.909, "eval_steps_per_second": 2.911, "step": 54635 }, { "epoch": 7.05, "learning_rate": 0.0017181294042280591, "loss": 5.7188, "step": 55000 }, { "epoch": 7.11, "learning_rate": 0.001715566944266496, "loss": 5.7174, "step": 55500 }, { "epoch": 7.17, "learning_rate": 0.001713004484304933, "loss": 5.719, "step": 56000 }, { "epoch": 7.24, "learning_rate": 0.0017104420243433695, "loss": 5.7205, "step": 56500 }, { "epoch": 7.3, "learning_rate": 0.0017078795643818065, "loss": 5.7193, "step": 57000 }, { "epoch": 7.37, "learning_rate": 0.0017053171044202435, "loss": 5.7191, "step": 57500 }, { "epoch": 7.43, "learning_rate": 0.0017027546444586803, "loss": 5.72, "step": 58000 }, { "epoch": 7.5, "learning_rate": 0.0017001921844971173, "loss": 5.7217, "step": 58500 }, { "epoch": 7.56, "learning_rate": 0.0016976297245355541, "loss": 5.7185, "step": 59000 }, { "epoch": 7.62, "learning_rate": 0.0016950672645739912, "loss": 5.7162, "step": 59500 }, { "epoch": 7.69, "learning_rate": 0.001692504804612428, "loss": 5.7192, "step": 60000 }, { "epoch": 7.75, "learning_rate": 0.0016899423446508648, "loss": 5.7198, "step": 60500 }, { "epoch": 7.82, "learning_rate": 0.0016873798846893018, "loss": 5.7178, "step": 61000 }, { "epoch": 7.88, "learning_rate": 0.0016848174247277386, "loss": 5.7165, "step": 61500 }, { "epoch": 7.94, "learning_rate": 0.0016822549647661756, "loss": 5.7171, "step": 62000 }, { "epoch": 8.0, "eval_loss": 5.719506740570068, "eval_runtime": 2.7211, "eval_samples_per_second": 367.5, "eval_steps_per_second": 2.94, "step": 62440 }, { "epoch": 8.01, "learning_rate": 0.0016796925048046126, "loss": 5.7152, "step": 62500 }, { "epoch": 8.07, "learning_rate": 0.0016771300448430494, "loss": 5.7136, "step": 63000 }, { "epoch": 8.14, "learning_rate": 0.0016745675848814864, "loss": 5.7135, "step": 63500 }, { "epoch": 8.2, "learning_rate": 0.001672005124919923, "loss": 5.7165, "step": 64000 }, { "epoch": 8.26, "learning_rate": 0.00166944266495836, "loss": 5.7167, "step": 64500 }, { "epoch": 8.33, "learning_rate": 0.001666880204996797, "loss": 5.7127, "step": 65000 }, { "epoch": 8.39, "learning_rate": 0.0016643177450352338, "loss": 5.7157, "step": 65500 }, { "epoch": 8.46, "learning_rate": 0.0016617552850736708, "loss": 5.7133, "step": 66000 }, { "epoch": 8.52, "learning_rate": 0.0016591928251121076, "loss": 5.7132, "step": 66500 }, { "epoch": 8.58, "learning_rate": 0.0016566303651505446, "loss": 5.713, "step": 67000 }, { "epoch": 8.65, "learning_rate": 0.0016540679051889814, "loss": 5.7163, "step": 67500 }, { "epoch": 8.71, "learning_rate": 0.0016515054452274182, "loss": 5.7174, "step": 68000 }, { "epoch": 8.78, "learning_rate": 0.0016489429852658553, "loss": 5.7185, "step": 68500 }, { "epoch": 8.84, "learning_rate": 0.001646380525304292, "loss": 5.7133, "step": 69000 }, { "epoch": 8.9, "learning_rate": 0.001643818065342729, "loss": 5.7148, "step": 69500 }, { "epoch": 8.97, "learning_rate": 0.001641255605381166, "loss": 5.7123, "step": 70000 }, { "epoch": 9.0, "eval_loss": 5.7155256271362305, "eval_runtime": 2.7307, "eval_samples_per_second": 366.201, "eval_steps_per_second": 2.93, "step": 70245 }, { "epoch": 9.03, "learning_rate": 0.0016386931454196029, "loss": 5.7097, "step": 70500 }, { "epoch": 9.1, "learning_rate": 0.0016361306854580397, "loss": 5.7112, "step": 71000 }, { "epoch": 9.16, "learning_rate": 0.0016335682254964765, "loss": 5.7103, "step": 71500 }, { "epoch": 9.22, "learning_rate": 0.0016310057655349135, "loss": 5.7085, "step": 72000 }, { "epoch": 9.29, "learning_rate": 0.0016284433055733505, "loss": 5.7077, "step": 72500 }, { "epoch": 9.35, "learning_rate": 0.0016258808456117873, "loss": 5.7101, "step": 73000 }, { "epoch": 9.42, "learning_rate": 0.0016233183856502243, "loss": 5.7085, "step": 73500 }, { "epoch": 9.48, "learning_rate": 0.0016207559256886613, "loss": 5.7091, "step": 74000 }, { "epoch": 9.55, "learning_rate": 0.001618193465727098, "loss": 5.7098, "step": 74500 }, { "epoch": 9.61, "learning_rate": 0.001615631005765535, "loss": 5.7136, "step": 75000 }, { "epoch": 9.67, "learning_rate": 0.0016130685458039717, "loss": 5.7154, "step": 75500 }, { "epoch": 9.74, "learning_rate": 0.0016105060858424087, "loss": 5.7117, "step": 76000 }, { "epoch": 9.8, "learning_rate": 0.0016079436258808458, "loss": 5.713, "step": 76500 }, { "epoch": 9.87, "learning_rate": 0.0016053811659192826, "loss": 5.7138, "step": 77000 }, { "epoch": 9.93, "learning_rate": 0.0016028187059577196, "loss": 5.7177, "step": 77500 }, { "epoch": 9.99, "learning_rate": 0.0016002562459961564, "loss": 5.7123, "step": 78000 }, { "epoch": 10.0, "eval_loss": 5.717950344085693, "eval_runtime": 2.833, "eval_samples_per_second": 352.98, "eval_steps_per_second": 2.824, "step": 78050 }, { "epoch": 10.06, "learning_rate": 0.0015976937860345932, "loss": 5.7121, "step": 78500 }, { "epoch": 10.12, "learning_rate": 0.0015951313260730302, "loss": 5.7098, "step": 79000 }, { "epoch": 10.19, "learning_rate": 0.001592568866111467, "loss": 5.7113, "step": 79500 }, { "epoch": 10.25, "learning_rate": 0.001590006406149904, "loss": 5.7108, "step": 80000 }, { "epoch": 10.31, "learning_rate": 0.0015874439461883408, "loss": 5.7136, "step": 80500 }, { "epoch": 10.38, "learning_rate": 0.0015848814862267778, "loss": 5.711, "step": 81000 }, { "epoch": 10.44, "learning_rate": 0.0015823190262652148, "loss": 5.7088, "step": 81500 }, { "epoch": 10.51, "learning_rate": 0.0015797565663036514, "loss": 5.7096, "step": 82000 }, { "epoch": 10.57, "learning_rate": 0.0015771941063420884, "loss": 5.7069, "step": 82500 }, { "epoch": 10.63, "learning_rate": 0.0015746316463805252, "loss": 5.7056, "step": 83000 }, { "epoch": 10.7, "learning_rate": 0.0015720691864189622, "loss": 5.7049, "step": 83500 }, { "epoch": 10.76, "learning_rate": 0.0015695067264573993, "loss": 5.7059, "step": 84000 }, { "epoch": 10.83, "learning_rate": 0.001566944266495836, "loss": 5.7057, "step": 84500 }, { "epoch": 10.89, "learning_rate": 0.001564381806534273, "loss": 5.7048, "step": 85000 }, { "epoch": 10.95, "learning_rate": 0.0015618193465727096, "loss": 5.7071, "step": 85500 }, { "epoch": 11.0, "eval_loss": 5.710795879364014, "eval_runtime": 2.7348, "eval_samples_per_second": 365.653, "eval_steps_per_second": 2.925, "step": 85855 }, { "epoch": 11.02, "learning_rate": 0.0015592568866111467, "loss": 5.7076, "step": 86000 }, { "epoch": 11.08, "learning_rate": 0.0015566944266495837, "loss": 5.7062, "step": 86500 }, { "epoch": 11.15, "learning_rate": 0.0015541319666880205, "loss": 5.7041, "step": 87000 }, { "epoch": 11.21, "learning_rate": 0.0015515695067264575, "loss": 5.7024, "step": 87500 }, { "epoch": 11.27, "learning_rate": 0.0015490070467648943, "loss": 5.7024, "step": 88000 }, { "epoch": 11.34, "learning_rate": 0.0015464445868033313, "loss": 5.7031, "step": 88500 }, { "epoch": 11.4, "learning_rate": 0.001543882126841768, "loss": 5.7031, "step": 89000 }, { "epoch": 11.47, "learning_rate": 0.001541319666880205, "loss": 5.7028, "step": 89500 }, { "epoch": 11.53, "learning_rate": 0.001538757206918642, "loss": 5.6996, "step": 90000 }, { "epoch": 11.6, "learning_rate": 0.0015361947469570787, "loss": 5.7012, "step": 90500 }, { "epoch": 11.66, "learning_rate": 0.0015336322869955157, "loss": 5.7018, "step": 91000 }, { "epoch": 11.72, "learning_rate": 0.0015310698270339527, "loss": 5.7001, "step": 91500 }, { "epoch": 11.79, "learning_rate": 0.0015285073670723895, "loss": 5.7005, "step": 92000 }, { "epoch": 11.85, "learning_rate": 0.0015259449071108266, "loss": 5.7001, "step": 92500 }, { "epoch": 11.92, "learning_rate": 0.0015233824471492631, "loss": 5.6995, "step": 93000 }, { "epoch": 11.98, "learning_rate": 0.0015208199871877002, "loss": 5.7, "step": 93500 }, { "epoch": 12.0, "eval_loss": 5.703713893890381, "eval_runtime": 2.754, "eval_samples_per_second": 363.115, "eval_steps_per_second": 2.905, "step": 93660 }, { "epoch": 12.04, "learning_rate": 0.0015182575272261372, "loss": 5.7018, "step": 94000 }, { "epoch": 12.11, "learning_rate": 0.001515695067264574, "loss": 5.6988, "step": 94500 }, { "epoch": 12.17, "learning_rate": 0.001513132607303011, "loss": 5.6973, "step": 95000 }, { "epoch": 12.24, "learning_rate": 0.001510570147341448, "loss": 5.6993, "step": 95500 }, { "epoch": 12.3, "learning_rate": 0.0015080076873798848, "loss": 5.6985, "step": 96000 }, { "epoch": 12.36, "learning_rate": 0.0015054452274183216, "loss": 5.6973, "step": 96500 }, { "epoch": 12.43, "learning_rate": 0.0015028827674567584, "loss": 5.6965, "step": 97000 }, { "epoch": 12.49, "learning_rate": 0.0015003203074951954, "loss": 5.6981, "step": 97500 }, { "epoch": 12.56, "learning_rate": 0.0014977578475336324, "loss": 5.6958, "step": 98000 }, { "epoch": 12.62, "learning_rate": 0.0014951953875720692, "loss": 5.6971, "step": 98500 }, { "epoch": 12.68, "learning_rate": 0.0014926329276105062, "loss": 5.697, "step": 99000 }, { "epoch": 12.75, "learning_rate": 0.001490070467648943, "loss": 5.6977, "step": 99500 }, { "epoch": 12.81, "learning_rate": 0.0014875080076873798, "loss": 5.699, "step": 100000 }, { "epoch": 12.88, "learning_rate": 0.0014849455477258168, "loss": 5.6974, "step": 100500 }, { "epoch": 12.94, "learning_rate": 0.0014823830877642536, "loss": 5.6984, "step": 101000 }, { "epoch": 13.0, "eval_loss": 5.700723648071289, "eval_runtime": 2.8976, "eval_samples_per_second": 345.111, "eval_steps_per_second": 2.761, "step": 101465 } ], "logging_steps": 500, "max_steps": 390250, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "total_flos": 2.652474175782912e+18, "train_batch_size": 128, "trial_name": null, "trial_params": null }