{ "best_metric": 0.6671983599662781, "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-belle30b/checkpoint-13800", "epoch": 2.9773462783171523, "global_step": 13800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.9999999999999995e-05, "loss": 1.6143, "step": 20 }, { "epoch": 0.01, "learning_rate": 0.00011999999999999999, "loss": 1.2447, "step": 40 }, { "epoch": 0.01, "learning_rate": 0.00017699999999999997, "loss": 0.9529, "step": 60 }, { "epoch": 0.02, "learning_rate": 0.000237, "loss": 0.8899, "step": 80 }, { "epoch": 0.02, "learning_rate": 0.00029699999999999996, "loss": 0.8614, "step": 100 }, { "epoch": 0.03, "learning_rate": 0.00029958710612097066, "loss": 0.8402, "step": 120 }, { "epoch": 0.03, "learning_rate": 0.00029915248098515027, "loss": 0.8335, "step": 140 }, { "epoch": 0.03, "learning_rate": 0.00029871785584932993, "loss": 0.8303, "step": 160 }, { "epoch": 0.04, "learning_rate": 0.0002982832307135096, "loss": 0.8261, "step": 180 }, { "epoch": 0.04, "learning_rate": 0.0002978486055776892, "loss": 0.807, "step": 200 }, { "epoch": 0.04, "eval_loss": 0.8271128535270691, "eval_runtime": 49.877, "eval_samples_per_second": 40.099, "eval_steps_per_second": 0.642, "step": 200 }, { "epoch": 0.05, "learning_rate": 0.00029741398044186887, "loss": 0.808, "step": 220 }, { "epoch": 0.05, "learning_rate": 0.00029697935530604853, "loss": 0.8092, "step": 240 }, { "epoch": 0.06, "learning_rate": 0.00029654473017022814, "loss": 0.8045, "step": 260 }, { "epoch": 0.06, "learning_rate": 0.0002961101050344078, "loss": 0.8007, "step": 280 }, { "epoch": 0.06, "learning_rate": 0.00029567547989858747, "loss": 0.793, "step": 300 }, { "epoch": 0.07, "learning_rate": 0.0002952408547627671, "loss": 0.7886, "step": 320 }, { "epoch": 0.07, "learning_rate": 0.00029480622962694674, "loss": 0.7854, "step": 340 }, { "epoch": 0.08, "learning_rate": 0.0002943716044911264, "loss": 0.783, "step": 360 }, { "epoch": 0.08, "learning_rate": 0.000293936979355306, "loss": 0.7797, "step": 380 }, { "epoch": 0.09, "learning_rate": 0.0002935023542194857, "loss": 0.7801, "step": 400 }, { "epoch": 0.09, "eval_loss": 0.793747067451477, "eval_runtime": 49.8962, "eval_samples_per_second": 40.083, "eval_steps_per_second": 0.641, "step": 400 }, { "epoch": 0.09, "learning_rate": 0.00029306772908366534, "loss": 0.7879, "step": 420 }, { "epoch": 0.09, "learning_rate": 0.00029263310394784495, "loss": 0.7745, "step": 440 }, { "epoch": 0.1, "learning_rate": 0.0002921984788120246, "loss": 0.7725, "step": 460 }, { "epoch": 0.1, "learning_rate": 0.0002917638536762043, "loss": 0.7659, "step": 480 }, { "epoch": 0.11, "learning_rate": 0.0002913292285403839, "loss": 0.7658, "step": 500 }, { "epoch": 0.11, "learning_rate": 0.00029089460340456355, "loss": 0.7722, "step": 520 }, { "epoch": 0.12, "learning_rate": 0.0002904599782687432, "loss": 0.773, "step": 540 }, { "epoch": 0.12, "learning_rate": 0.0002900253531329228, "loss": 0.7749, "step": 560 }, { "epoch": 0.13, "learning_rate": 0.0002895907279971025, "loss": 0.7734, "step": 580 }, { "epoch": 0.13, "learning_rate": 0.00028915610286128215, "loss": 0.7607, "step": 600 }, { "epoch": 0.13, "eval_loss": 0.7771433591842651, "eval_runtime": 49.9486, "eval_samples_per_second": 40.041, "eval_steps_per_second": 0.641, "step": 600 }, { "epoch": 0.13, "learning_rate": 0.00028872147772546176, "loss": 0.7657, "step": 620 }, { "epoch": 0.14, "learning_rate": 0.00028828685258964137, "loss": 0.7602, "step": 640 }, { "epoch": 0.14, "learning_rate": 0.00028785222745382103, "loss": 0.7619, "step": 660 }, { "epoch": 0.15, "learning_rate": 0.0002874176023180007, "loss": 0.7587, "step": 680 }, { "epoch": 0.15, "learning_rate": 0.0002869829771821803, "loss": 0.7553, "step": 700 }, { "epoch": 0.16, "learning_rate": 0.00028654835204635997, "loss": 0.7565, "step": 720 }, { "epoch": 0.16, "learning_rate": 0.00028611372691053963, "loss": 0.7586, "step": 740 }, { "epoch": 0.16, "learning_rate": 0.00028567910177471924, "loss": 0.7556, "step": 760 }, { "epoch": 0.17, "learning_rate": 0.0002852444766388989, "loss": 0.7487, "step": 780 }, { "epoch": 0.17, "learning_rate": 0.00028480985150307857, "loss": 0.7516, "step": 800 }, { "epoch": 0.17, "eval_loss": 0.7632888555526733, "eval_runtime": 49.913, "eval_samples_per_second": 40.07, "eval_steps_per_second": 0.641, "step": 800 }, { "epoch": 0.18, "learning_rate": 0.0002843752263672582, "loss": 0.7527, "step": 820 }, { "epoch": 0.18, "learning_rate": 0.00028394060123143784, "loss": 0.7407, "step": 840 }, { "epoch": 0.19, "learning_rate": 0.0002835059760956175, "loss": 0.744, "step": 860 }, { "epoch": 0.19, "learning_rate": 0.0002830713509597971, "loss": 0.7456, "step": 880 }, { "epoch": 0.19, "learning_rate": 0.0002826367258239768, "loss": 0.7429, "step": 900 }, { "epoch": 0.2, "learning_rate": 0.00028220210068815644, "loss": 0.7516, "step": 920 }, { "epoch": 0.2, "learning_rate": 0.00028176747555233605, "loss": 0.7381, "step": 940 }, { "epoch": 0.21, "learning_rate": 0.0002813328504165157, "loss": 0.7256, "step": 960 }, { "epoch": 0.21, "learning_rate": 0.0002808982252806954, "loss": 0.7443, "step": 980 }, { "epoch": 0.22, "learning_rate": 0.000280463600144875, "loss": 0.7389, "step": 1000 }, { "epoch": 0.22, "eval_loss": 0.7532852292060852, "eval_runtime": 49.9829, "eval_samples_per_second": 40.014, "eval_steps_per_second": 0.64, "step": 1000 }, { "epoch": 0.22, "learning_rate": 0.00028002897500905465, "loss": 0.7374, "step": 1020 }, { "epoch": 0.22, "learning_rate": 0.0002795943498732343, "loss": 0.7296, "step": 1040 }, { "epoch": 0.23, "learning_rate": 0.0002791597247374139, "loss": 0.7424, "step": 1060 }, { "epoch": 0.23, "learning_rate": 0.0002787250996015936, "loss": 0.7328, "step": 1080 }, { "epoch": 0.24, "learning_rate": 0.00027829047446577325, "loss": 0.7367, "step": 1100 }, { "epoch": 0.24, "learning_rate": 0.00027785584932995286, "loss": 0.7419, "step": 1120 }, { "epoch": 0.25, "learning_rate": 0.0002774212241941325, "loss": 0.7347, "step": 1140 }, { "epoch": 0.25, "learning_rate": 0.0002769865990583122, "loss": 0.7292, "step": 1160 }, { "epoch": 0.25, "learning_rate": 0.0002765519739224918, "loss": 0.7394, "step": 1180 }, { "epoch": 0.26, "learning_rate": 0.00027611734878667146, "loss": 0.7358, "step": 1200 }, { "epoch": 0.26, "eval_loss": 0.7463639974594116, "eval_runtime": 49.9963, "eval_samples_per_second": 40.003, "eval_steps_per_second": 0.64, "step": 1200 }, { "epoch": 0.26, "learning_rate": 0.0002756827236508511, "loss": 0.7266, "step": 1220 }, { "epoch": 0.27, "learning_rate": 0.00027524809851503073, "loss": 0.7336, "step": 1240 }, { "epoch": 0.27, "learning_rate": 0.0002748134733792104, "loss": 0.7296, "step": 1260 }, { "epoch": 0.28, "learning_rate": 0.00027437884824339006, "loss": 0.73, "step": 1280 }, { "epoch": 0.28, "learning_rate": 0.00027394422310756967, "loss": 0.7312, "step": 1300 }, { "epoch": 0.28, "learning_rate": 0.00027350959797174933, "loss": 0.7307, "step": 1320 }, { "epoch": 0.29, "learning_rate": 0.000273074972835929, "loss": 0.7246, "step": 1340 }, { "epoch": 0.29, "learning_rate": 0.0002726403477001086, "loss": 0.7299, "step": 1360 }, { "epoch": 0.3, "learning_rate": 0.00027220572256428827, "loss": 0.7251, "step": 1380 }, { "epoch": 0.3, "learning_rate": 0.00027177109742846793, "loss": 0.7286, "step": 1400 }, { "epoch": 0.3, "eval_loss": 0.7393819093704224, "eval_runtime": 49.9896, "eval_samples_per_second": 40.008, "eval_steps_per_second": 0.64, "step": 1400 }, { "epoch": 0.31, "learning_rate": 0.00027133647229264754, "loss": 0.7186, "step": 1420 }, { "epoch": 0.31, "learning_rate": 0.0002709018471568272, "loss": 0.7215, "step": 1440 }, { "epoch": 0.31, "learning_rate": 0.00027046722202100687, "loss": 0.7295, "step": 1460 }, { "epoch": 0.32, "learning_rate": 0.0002700325968851865, "loss": 0.7198, "step": 1480 }, { "epoch": 0.32, "learning_rate": 0.00026959797174936614, "loss": 0.7184, "step": 1500 }, { "epoch": 0.33, "learning_rate": 0.0002691633466135458, "loss": 0.7283, "step": 1520 }, { "epoch": 0.33, "learning_rate": 0.0002687287214777254, "loss": 0.7378, "step": 1540 }, { "epoch": 0.34, "learning_rate": 0.0002682940963419051, "loss": 0.7196, "step": 1560 }, { "epoch": 0.34, "learning_rate": 0.00026785947120608474, "loss": 0.7152, "step": 1580 }, { "epoch": 0.35, "learning_rate": 0.00026742484607026435, "loss": 0.7184, "step": 1600 }, { "epoch": 0.35, "eval_loss": 0.7342154383659363, "eval_runtime": 49.9957, "eval_samples_per_second": 40.003, "eval_steps_per_second": 0.64, "step": 1600 }, { "epoch": 0.35, "learning_rate": 0.000266990220934444, "loss": 0.7164, "step": 1620 }, { "epoch": 0.35, "learning_rate": 0.0002665555957986237, "loss": 0.7136, "step": 1640 }, { "epoch": 0.36, "learning_rate": 0.0002661209706628033, "loss": 0.7203, "step": 1660 }, { "epoch": 0.36, "learning_rate": 0.00026568634552698295, "loss": 0.7158, "step": 1680 }, { "epoch": 0.37, "learning_rate": 0.0002652517203911626, "loss": 0.7145, "step": 1700 }, { "epoch": 0.37, "learning_rate": 0.0002648170952553422, "loss": 0.7111, "step": 1720 }, { "epoch": 0.38, "learning_rate": 0.0002643824701195219, "loss": 0.7155, "step": 1740 }, { "epoch": 0.38, "learning_rate": 0.00026394784498370155, "loss": 0.718, "step": 1760 }, { "epoch": 0.38, "learning_rate": 0.00026351321984788116, "loss": 0.7125, "step": 1780 }, { "epoch": 0.39, "learning_rate": 0.0002630785947120608, "loss": 0.7163, "step": 1800 }, { "epoch": 0.39, "eval_loss": 0.7301950454711914, "eval_runtime": 49.9689, "eval_samples_per_second": 40.025, "eval_steps_per_second": 0.64, "step": 1800 }, { "epoch": 0.39, "learning_rate": 0.0002626439695762405, "loss": 0.7121, "step": 1820 }, { "epoch": 0.4, "learning_rate": 0.0002622093444404201, "loss": 0.7092, "step": 1840 }, { "epoch": 0.4, "learning_rate": 0.00026177471930459976, "loss": 0.7133, "step": 1860 }, { "epoch": 0.41, "learning_rate": 0.0002613400941687794, "loss": 0.7171, "step": 1880 }, { "epoch": 0.41, "learning_rate": 0.00026090546903295903, "loss": 0.7235, "step": 1900 }, { "epoch": 0.41, "learning_rate": 0.0002604708438971387, "loss": 0.7086, "step": 1920 }, { "epoch": 0.42, "learning_rate": 0.00026003621876131836, "loss": 0.7136, "step": 1940 }, { "epoch": 0.42, "learning_rate": 0.00025960159362549797, "loss": 0.7031, "step": 1960 }, { "epoch": 0.43, "learning_rate": 0.00025916696848967763, "loss": 0.7084, "step": 1980 }, { "epoch": 0.43, "learning_rate": 0.0002587323433538573, "loss": 0.7091, "step": 2000 }, { "epoch": 0.43, "eval_loss": 0.726446270942688, "eval_runtime": 50.0519, "eval_samples_per_second": 39.959, "eval_steps_per_second": 0.639, "step": 2000 }, { "epoch": 0.44, "learning_rate": 0.0002582977182180369, "loss": 0.7119, "step": 2020 }, { "epoch": 0.44, "learning_rate": 0.00025786309308221657, "loss": 0.7186, "step": 2040 }, { "epoch": 0.44, "learning_rate": 0.00025742846794639623, "loss": 0.703, "step": 2060 }, { "epoch": 0.45, "learning_rate": 0.00025699384281057584, "loss": 0.7078, "step": 2080 }, { "epoch": 0.45, "learning_rate": 0.0002565592176747555, "loss": 0.7084, "step": 2100 }, { "epoch": 0.46, "learning_rate": 0.00025612459253893517, "loss": 0.7014, "step": 2120 }, { "epoch": 0.46, "learning_rate": 0.0002556899674031148, "loss": 0.7076, "step": 2140 }, { "epoch": 0.47, "learning_rate": 0.00025525534226729444, "loss": 0.7103, "step": 2160 }, { "epoch": 0.47, "learning_rate": 0.0002548207171314741, "loss": 0.7118, "step": 2180 }, { "epoch": 0.47, "learning_rate": 0.0002543860919956537, "loss": 0.7028, "step": 2200 }, { "epoch": 0.47, "eval_loss": 0.7220268845558167, "eval_runtime": 49.9937, "eval_samples_per_second": 40.005, "eval_steps_per_second": 0.64, "step": 2200 }, { "epoch": 0.48, "learning_rate": 0.0002539514668598334, "loss": 0.707, "step": 2220 }, { "epoch": 0.48, "learning_rate": 0.00025351684172401304, "loss": 0.7045, "step": 2240 }, { "epoch": 0.49, "learning_rate": 0.00025308221658819265, "loss": 0.6905, "step": 2260 }, { "epoch": 0.49, "learning_rate": 0.0002526475914523723, "loss": 0.6982, "step": 2280 }, { "epoch": 0.5, "learning_rate": 0.000252212966316552, "loss": 0.706, "step": 2300 }, { "epoch": 0.5, "learning_rate": 0.0002517783411807316, "loss": 0.6992, "step": 2320 }, { "epoch": 0.5, "learning_rate": 0.00025134371604491125, "loss": 0.6939, "step": 2340 }, { "epoch": 0.51, "learning_rate": 0.00025090909090909086, "loss": 0.7037, "step": 2360 }, { "epoch": 0.51, "learning_rate": 0.0002504744657732705, "loss": 0.7127, "step": 2380 }, { "epoch": 0.52, "learning_rate": 0.00025003984063745014, "loss": 0.702, "step": 2400 }, { "epoch": 0.52, "eval_loss": 0.7191869020462036, "eval_runtime": 50.0038, "eval_samples_per_second": 39.997, "eval_steps_per_second": 0.64, "step": 2400 }, { "epoch": 0.52, "learning_rate": 0.0002496052155016298, "loss": 0.7033, "step": 2420 }, { "epoch": 0.53, "learning_rate": 0.00024917059036580946, "loss": 0.7028, "step": 2440 }, { "epoch": 0.53, "learning_rate": 0.00024873596522998907, "loss": 0.6967, "step": 2460 }, { "epoch": 0.54, "learning_rate": 0.00024830134009416874, "loss": 0.7068, "step": 2480 }, { "epoch": 0.54, "learning_rate": 0.0002478667149583484, "loss": 0.7105, "step": 2500 }, { "epoch": 0.54, "learning_rate": 0.000247432089822528, "loss": 0.6968, "step": 2520 }, { "epoch": 0.55, "learning_rate": 0.00024699746468670767, "loss": 0.7025, "step": 2540 }, { "epoch": 0.55, "learning_rate": 0.00024656283955088734, "loss": 0.6942, "step": 2560 }, { "epoch": 0.56, "learning_rate": 0.00024612821441506694, "loss": 0.6948, "step": 2580 }, { "epoch": 0.56, "learning_rate": 0.0002456935892792466, "loss": 0.6979, "step": 2600 }, { "epoch": 0.56, "eval_loss": 0.715853750705719, "eval_runtime": 50.0426, "eval_samples_per_second": 39.966, "eval_steps_per_second": 0.639, "step": 2600 }, { "epoch": 0.57, "learning_rate": 0.00024525896414342627, "loss": 0.6967, "step": 2620 }, { "epoch": 0.57, "learning_rate": 0.0002448243390076059, "loss": 0.7012, "step": 2640 }, { "epoch": 0.57, "learning_rate": 0.00024438971387178554, "loss": 0.697, "step": 2660 }, { "epoch": 0.58, "learning_rate": 0.0002439550887359652, "loss": 0.6931, "step": 2680 }, { "epoch": 0.58, "learning_rate": 0.00024352046360014485, "loss": 0.6856, "step": 2700 }, { "epoch": 0.59, "learning_rate": 0.00024308583846432448, "loss": 0.697, "step": 2720 }, { "epoch": 0.59, "learning_rate": 0.00024265121332850415, "loss": 0.6996, "step": 2740 }, { "epoch": 0.6, "learning_rate": 0.00024221658819268378, "loss": 0.698, "step": 2760 }, { "epoch": 0.6, "learning_rate": 0.00024178196305686342, "loss": 0.6952, "step": 2780 }, { "epoch": 0.6, "learning_rate": 0.00024134733792104308, "loss": 0.7049, "step": 2800 }, { "epoch": 0.6, "eval_loss": 0.7124837040901184, "eval_runtime": 50.0654, "eval_samples_per_second": 39.948, "eval_steps_per_second": 0.639, "step": 2800 }, { "epoch": 0.61, "learning_rate": 0.00024091271278522272, "loss": 0.6927, "step": 2820 }, { "epoch": 0.61, "learning_rate": 0.00024047808764940235, "loss": 0.6996, "step": 2840 }, { "epoch": 0.62, "learning_rate": 0.00024004346251358202, "loss": 0.6921, "step": 2860 }, { "epoch": 0.62, "learning_rate": 0.00023960883737776165, "loss": 0.695, "step": 2880 }, { "epoch": 0.63, "learning_rate": 0.0002391742122419413, "loss": 0.6887, "step": 2900 }, { "epoch": 0.63, "learning_rate": 0.00023873958710612095, "loss": 0.6915, "step": 2920 }, { "epoch": 0.63, "learning_rate": 0.0002383049619703006, "loss": 0.6915, "step": 2940 }, { "epoch": 0.64, "learning_rate": 0.00023787033683448023, "loss": 0.6916, "step": 2960 }, { "epoch": 0.64, "learning_rate": 0.0002374357116986599, "loss": 0.687, "step": 2980 }, { "epoch": 0.65, "learning_rate": 0.00023700108656283953, "loss": 0.6997, "step": 3000 }, { "epoch": 0.65, "eval_loss": 0.7098860144615173, "eval_runtime": 50.0652, "eval_samples_per_second": 39.948, "eval_steps_per_second": 0.639, "step": 3000 }, { "epoch": 0.65, "learning_rate": 0.00023656646142701916, "loss": 0.6895, "step": 3020 }, { "epoch": 0.66, "learning_rate": 0.00023613183629119883, "loss": 0.6861, "step": 3040 }, { "epoch": 0.66, "learning_rate": 0.00023569721115537846, "loss": 0.6988, "step": 3060 }, { "epoch": 0.66, "learning_rate": 0.0002352625860195581, "loss": 0.6852, "step": 3080 }, { "epoch": 0.67, "learning_rate": 0.00023482796088373776, "loss": 0.6863, "step": 3100 }, { "epoch": 0.67, "learning_rate": 0.0002343933357479174, "loss": 0.6943, "step": 3120 }, { "epoch": 0.68, "learning_rate": 0.00023395871061209704, "loss": 0.686, "step": 3140 }, { "epoch": 0.68, "learning_rate": 0.0002335240854762767, "loss": 0.684, "step": 3160 }, { "epoch": 0.69, "learning_rate": 0.00023308946034045634, "loss": 0.6866, "step": 3180 }, { "epoch": 0.69, "learning_rate": 0.00023265483520463597, "loss": 0.6859, "step": 3200 }, { "epoch": 0.69, "eval_loss": 0.7077216506004333, "eval_runtime": 50.0526, "eval_samples_per_second": 39.958, "eval_steps_per_second": 0.639, "step": 3200 }, { "epoch": 0.69, "learning_rate": 0.00023222021006881564, "loss": 0.6845, "step": 3220 }, { "epoch": 0.7, "learning_rate": 0.00023178558493299527, "loss": 0.7011, "step": 3240 }, { "epoch": 0.7, "learning_rate": 0.0002313509597971749, "loss": 0.69, "step": 3260 }, { "epoch": 0.71, "learning_rate": 0.00023091633466135457, "loss": 0.6931, "step": 3280 }, { "epoch": 0.71, "learning_rate": 0.0002304817095255342, "loss": 0.6998, "step": 3300 }, { "epoch": 0.72, "learning_rate": 0.00023004708438971385, "loss": 0.6933, "step": 3320 }, { "epoch": 0.72, "learning_rate": 0.0002296124592538935, "loss": 0.6859, "step": 3340 }, { "epoch": 0.72, "learning_rate": 0.00022917783411807315, "loss": 0.6972, "step": 3360 }, { "epoch": 0.73, "learning_rate": 0.00022874320898225278, "loss": 0.6868, "step": 3380 }, { "epoch": 0.73, "learning_rate": 0.00022830858384643245, "loss": 0.6902, "step": 3400 }, { "epoch": 0.73, "eval_loss": 0.7059928178787231, "eval_runtime": 50.0118, "eval_samples_per_second": 39.991, "eval_steps_per_second": 0.64, "step": 3400 }, { "epoch": 0.74, "learning_rate": 0.00022787395871061208, "loss": 0.6819, "step": 3420 }, { "epoch": 0.74, "learning_rate": 0.00022743933357479172, "loss": 0.6833, "step": 3440 }, { "epoch": 0.75, "learning_rate": 0.00022700470843897138, "loss": 0.6826, "step": 3460 }, { "epoch": 0.75, "learning_rate": 0.00022657008330315102, "loss": 0.694, "step": 3480 }, { "epoch": 0.76, "learning_rate": 0.00022613545816733066, "loss": 0.6827, "step": 3500 }, { "epoch": 0.76, "learning_rate": 0.00022570083303151032, "loss": 0.6844, "step": 3520 }, { "epoch": 0.76, "learning_rate": 0.00022526620789568996, "loss": 0.6893, "step": 3540 }, { "epoch": 0.77, "learning_rate": 0.0002248315827598696, "loss": 0.6843, "step": 3560 }, { "epoch": 0.77, "learning_rate": 0.00022439695762404926, "loss": 0.6843, "step": 3580 }, { "epoch": 0.78, "learning_rate": 0.0002239623324882289, "loss": 0.691, "step": 3600 }, { "epoch": 0.78, "eval_loss": 0.7041522264480591, "eval_runtime": 50.0554, "eval_samples_per_second": 39.956, "eval_steps_per_second": 0.639, "step": 3600 }, { "epoch": 0.78, "learning_rate": 0.00022352770735240853, "loss": 0.6846, "step": 3620 }, { "epoch": 0.79, "learning_rate": 0.0002230930822165882, "loss": 0.689, "step": 3640 }, { "epoch": 0.79, "learning_rate": 0.00022265845708076783, "loss": 0.6777, "step": 3660 }, { "epoch": 0.79, "learning_rate": 0.00022222383194494747, "loss": 0.6903, "step": 3680 }, { "epoch": 0.8, "learning_rate": 0.00022178920680912713, "loss": 0.684, "step": 3700 }, { "epoch": 0.8, "learning_rate": 0.00022135458167330677, "loss": 0.6867, "step": 3720 }, { "epoch": 0.81, "learning_rate": 0.0002209199565374864, "loss": 0.6697, "step": 3740 }, { "epoch": 0.81, "learning_rate": 0.00022048533140166607, "loss": 0.6864, "step": 3760 }, { "epoch": 0.82, "learning_rate": 0.0002200507062658457, "loss": 0.6813, "step": 3780 }, { "epoch": 0.82, "learning_rate": 0.00021961608113002534, "loss": 0.6807, "step": 3800 }, { "epoch": 0.82, "eval_loss": 0.7024796009063721, "eval_runtime": 50.022, "eval_samples_per_second": 39.982, "eval_steps_per_second": 0.64, "step": 3800 }, { "epoch": 0.82, "learning_rate": 0.000219181455994205, "loss": 0.6824, "step": 3820 }, { "epoch": 0.83, "learning_rate": 0.00021874683085838464, "loss": 0.6814, "step": 3840 }, { "epoch": 0.83, "learning_rate": 0.00021831220572256427, "loss": 0.6789, "step": 3860 }, { "epoch": 0.84, "learning_rate": 0.00021787758058674394, "loss": 0.6752, "step": 3880 }, { "epoch": 0.84, "learning_rate": 0.00021744295545092358, "loss": 0.6826, "step": 3900 }, { "epoch": 0.85, "learning_rate": 0.0002170083303151032, "loss": 0.6874, "step": 3920 }, { "epoch": 0.85, "learning_rate": 0.00021657370517928288, "loss": 0.6761, "step": 3940 }, { "epoch": 0.85, "learning_rate": 0.0002161390800434625, "loss": 0.6795, "step": 3960 }, { "epoch": 0.86, "learning_rate": 0.00021570445490764215, "loss": 0.6781, "step": 3980 }, { "epoch": 0.86, "learning_rate": 0.0002152698297718218, "loss": 0.6754, "step": 4000 }, { "epoch": 0.86, "eval_loss": 0.7004331350326538, "eval_runtime": 50.0568, "eval_samples_per_second": 39.955, "eval_steps_per_second": 0.639, "step": 4000 }, { "epoch": 0.87, "learning_rate": 0.00021483520463600145, "loss": 0.6791, "step": 4020 }, { "epoch": 0.87, "learning_rate": 0.00021440057950018108, "loss": 0.6863, "step": 4040 }, { "epoch": 0.88, "learning_rate": 0.00021396595436436075, "loss": 0.6846, "step": 4060 }, { "epoch": 0.88, "learning_rate": 0.00021353132922854036, "loss": 0.6814, "step": 4080 }, { "epoch": 0.88, "learning_rate": 0.00021309670409272, "loss": 0.6825, "step": 4100 }, { "epoch": 0.89, "learning_rate": 0.00021266207895689963, "loss": 0.6827, "step": 4120 }, { "epoch": 0.89, "learning_rate": 0.0002122274538210793, "loss": 0.6769, "step": 4140 }, { "epoch": 0.9, "learning_rate": 0.00021179282868525893, "loss": 0.6869, "step": 4160 }, { "epoch": 0.9, "learning_rate": 0.00021135820354943857, "loss": 0.6815, "step": 4180 }, { "epoch": 0.91, "learning_rate": 0.00021092357841361823, "loss": 0.6725, "step": 4200 }, { "epoch": 0.91, "eval_loss": 0.6981337666511536, "eval_runtime": 50.0559, "eval_samples_per_second": 39.955, "eval_steps_per_second": 0.639, "step": 4200 }, { "epoch": 0.91, "learning_rate": 0.00021051068453458889, "loss": 0.6731, "step": 4220 }, { "epoch": 0.91, "learning_rate": 0.00021007605939876855, "loss": 0.6792, "step": 4240 }, { "epoch": 0.92, "learning_rate": 0.00020964143426294819, "loss": 0.6755, "step": 4260 }, { "epoch": 0.92, "learning_rate": 0.00020920680912712782, "loss": 0.6833, "step": 4280 }, { "epoch": 0.93, "learning_rate": 0.0002087721839913075, "loss": 0.6693, "step": 4300 }, { "epoch": 0.93, "learning_rate": 0.00020833755885548712, "loss": 0.6728, "step": 4320 }, { "epoch": 0.94, "learning_rate": 0.00020790293371966676, "loss": 0.6812, "step": 4340 }, { "epoch": 0.94, "learning_rate": 0.00020746830858384642, "loss": 0.6734, "step": 4360 }, { "epoch": 0.94, "learning_rate": 0.00020703368344802606, "loss": 0.6813, "step": 4380 }, { "epoch": 0.95, "learning_rate": 0.0002065990583122057, "loss": 0.6779, "step": 4400 }, { "epoch": 0.95, "eval_loss": 0.6968498826026917, "eval_runtime": 50.0697, "eval_samples_per_second": 39.944, "eval_steps_per_second": 0.639, "step": 4400 }, { "epoch": 0.95, "learning_rate": 0.00020616443317638536, "loss": 0.6712, "step": 4420 }, { "epoch": 0.96, "learning_rate": 0.000205729808040565, "loss": 0.6846, "step": 4440 }, { "epoch": 0.96, "learning_rate": 0.00020529518290474463, "loss": 0.6694, "step": 4460 }, { "epoch": 0.97, "learning_rate": 0.0002048605577689243, "loss": 0.6753, "step": 4480 }, { "epoch": 0.97, "learning_rate": 0.00020442593263310393, "loss": 0.6792, "step": 4500 }, { "epoch": 0.98, "learning_rate": 0.00020399130749728357, "loss": 0.6738, "step": 4520 }, { "epoch": 0.98, "learning_rate": 0.00020355668236146323, "loss": 0.6699, "step": 4540 }, { "epoch": 0.98, "learning_rate": 0.00020312205722564287, "loss": 0.6737, "step": 4560 }, { "epoch": 0.99, "learning_rate": 0.0002026874320898225, "loss": 0.6837, "step": 4580 }, { "epoch": 0.99, "learning_rate": 0.00020225280695400217, "loss": 0.6701, "step": 4600 }, { "epoch": 0.99, "eval_loss": 0.6954157948493958, "eval_runtime": 50.0724, "eval_samples_per_second": 39.942, "eval_steps_per_second": 0.639, "step": 4600 }, { "epoch": 1.0, "learning_rate": 0.0002018181818181818, "loss": 0.6677, "step": 4620 }, { "epoch": 1.0, "learning_rate": 0.00020138355668236144, "loss": 0.6706, "step": 4640 }, { "epoch": 1.01, "learning_rate": 0.0002009489315465411, "loss": 0.6741, "step": 4660 }, { "epoch": 1.01, "learning_rate": 0.00020051430641072074, "loss": 0.6757, "step": 4680 }, { "epoch": 1.01, "learning_rate": 0.00020007968127490038, "loss": 0.6773, "step": 4700 }, { "epoch": 1.02, "learning_rate": 0.00019964505613908004, "loss": 0.6728, "step": 4720 }, { "epoch": 1.02, "learning_rate": 0.00019921043100325968, "loss": 0.6715, "step": 4740 }, { "epoch": 1.03, "learning_rate": 0.00019877580586743931, "loss": 0.6679, "step": 4760 }, { "epoch": 1.03, "learning_rate": 0.00019834118073161898, "loss": 0.6729, "step": 4780 }, { "epoch": 1.04, "learning_rate": 0.00019790655559579861, "loss": 0.6749, "step": 4800 }, { "epoch": 1.04, "eval_loss": 0.6941403746604919, "eval_runtime": 50.0645, "eval_samples_per_second": 39.948, "eval_steps_per_second": 0.639, "step": 4800 }, { "epoch": 1.04, "learning_rate": 0.00019747193045997825, "loss": 0.6661, "step": 4820 }, { "epoch": 1.04, "learning_rate": 0.0001970373053241579, "loss": 0.6638, "step": 4840 }, { "epoch": 1.05, "learning_rate": 0.00019660268018833755, "loss": 0.6715, "step": 4860 }, { "epoch": 1.05, "learning_rate": 0.0001961680550525172, "loss": 0.6721, "step": 4880 }, { "epoch": 1.06, "learning_rate": 0.00019573342991669682, "loss": 0.6695, "step": 4900 }, { "epoch": 1.06, "learning_rate": 0.0001952988047808765, "loss": 0.6809, "step": 4920 }, { "epoch": 1.07, "learning_rate": 0.00019486417964505612, "loss": 0.6701, "step": 4940 }, { "epoch": 1.07, "learning_rate": 0.00019442955450923576, "loss": 0.6747, "step": 4960 }, { "epoch": 1.07, "learning_rate": 0.00019399492937341542, "loss": 0.6713, "step": 4980 }, { "epoch": 1.08, "learning_rate": 0.00019356030423759506, "loss": 0.6746, "step": 5000 }, { "epoch": 1.08, "eval_loss": 0.6935788989067078, "eval_runtime": 50.0137, "eval_samples_per_second": 39.989, "eval_steps_per_second": 0.64, "step": 5000 }, { "epoch": 1.08, "learning_rate": 0.0001931256791017747, "loss": 0.672, "step": 5020 }, { "epoch": 1.09, "learning_rate": 0.00019269105396595436, "loss": 0.6673, "step": 5040 }, { "epoch": 1.09, "learning_rate": 0.000192256428830134, "loss": 0.6706, "step": 5060 }, { "epoch": 1.1, "learning_rate": 0.00019182180369431363, "loss": 0.6677, "step": 5080 }, { "epoch": 1.1, "learning_rate": 0.0001913871785584933, "loss": 0.67, "step": 5100 }, { "epoch": 1.1, "learning_rate": 0.00019095255342267293, "loss": 0.6693, "step": 5120 }, { "epoch": 1.11, "learning_rate": 0.00019051792828685257, "loss": 0.671, "step": 5140 }, { "epoch": 1.11, "learning_rate": 0.00019008330315103223, "loss": 0.6748, "step": 5160 }, { "epoch": 1.12, "learning_rate": 0.00018964867801521187, "loss": 0.6698, "step": 5180 }, { "epoch": 1.12, "learning_rate": 0.0001892140528793915, "loss": 0.662, "step": 5200 }, { "epoch": 1.12, "eval_loss": 0.6918168663978577, "eval_runtime": 50.0897, "eval_samples_per_second": 39.928, "eval_steps_per_second": 0.639, "step": 5200 }, { "epoch": 1.13, "learning_rate": 0.00018877942774357117, "loss": 0.66, "step": 5220 }, { "epoch": 1.13, "learning_rate": 0.0001883448026077508, "loss": 0.6705, "step": 5240 }, { "epoch": 1.13, "learning_rate": 0.00018791017747193044, "loss": 0.6693, "step": 5260 }, { "epoch": 1.14, "learning_rate": 0.0001874755523361101, "loss": 0.6546, "step": 5280 }, { "epoch": 1.14, "learning_rate": 0.00018704092720028974, "loss": 0.6673, "step": 5300 }, { "epoch": 1.15, "learning_rate": 0.00018660630206446938, "loss": 0.671, "step": 5320 }, { "epoch": 1.15, "learning_rate": 0.00018617167692864904, "loss": 0.675, "step": 5340 }, { "epoch": 1.16, "learning_rate": 0.00018573705179282868, "loss": 0.6744, "step": 5360 }, { "epoch": 1.16, "learning_rate": 0.00018530242665700832, "loss": 0.6643, "step": 5380 }, { "epoch": 1.17, "learning_rate": 0.00018486780152118798, "loss": 0.6686, "step": 5400 }, { "epoch": 1.17, "eval_loss": 0.6908227801322937, "eval_runtime": 50.0742, "eval_samples_per_second": 39.941, "eval_steps_per_second": 0.639, "step": 5400 }, { "epoch": 1.17, "learning_rate": 0.00018443317638536762, "loss": 0.6666, "step": 5420 }, { "epoch": 1.17, "learning_rate": 0.00018399855124954725, "loss": 0.6658, "step": 5440 }, { "epoch": 1.18, "learning_rate": 0.0001835639261137269, "loss": 0.671, "step": 5460 }, { "epoch": 1.18, "learning_rate": 0.00018312930097790653, "loss": 0.6736, "step": 5480 }, { "epoch": 1.19, "learning_rate": 0.00018269467584208616, "loss": 0.6697, "step": 5500 }, { "epoch": 1.19, "learning_rate": 0.00018226005070626583, "loss": 0.6718, "step": 5520 }, { "epoch": 1.2, "learning_rate": 0.00018182542557044546, "loss": 0.6701, "step": 5540 }, { "epoch": 1.2, "learning_rate": 0.0001813908004346251, "loss": 0.6696, "step": 5560 }, { "epoch": 1.2, "learning_rate": 0.00018095617529880476, "loss": 0.6611, "step": 5580 }, { "epoch": 1.21, "learning_rate": 0.0001805215501629844, "loss": 0.6638, "step": 5600 }, { "epoch": 1.21, "eval_loss": 0.689289927482605, "eval_runtime": 50.1304, "eval_samples_per_second": 39.896, "eval_steps_per_second": 0.638, "step": 5600 }, { "epoch": 1.21, "learning_rate": 0.00018008692502716404, "loss": 0.6646, "step": 5620 }, { "epoch": 1.22, "learning_rate": 0.0001796522998913437, "loss": 0.6717, "step": 5640 }, { "epoch": 1.22, "learning_rate": 0.00017921767475552334, "loss": 0.6647, "step": 5660 }, { "epoch": 1.23, "learning_rate": 0.00017878304961970297, "loss": 0.672, "step": 5680 }, { "epoch": 1.23, "learning_rate": 0.00017834842448388264, "loss": 0.6645, "step": 5700 }, { "epoch": 1.23, "learning_rate": 0.00017791379934806227, "loss": 0.6768, "step": 5720 }, { "epoch": 1.24, "learning_rate": 0.0001774791742122419, "loss": 0.6748, "step": 5740 }, { "epoch": 1.24, "learning_rate": 0.00017704454907642157, "loss": 0.6722, "step": 5760 }, { "epoch": 1.25, "learning_rate": 0.0001766099239406012, "loss": 0.6631, "step": 5780 }, { "epoch": 1.25, "learning_rate": 0.00017617529880478084, "loss": 0.6647, "step": 5800 }, { "epoch": 1.25, "eval_loss": 0.688850462436676, "eval_runtime": 50.0542, "eval_samples_per_second": 39.957, "eval_steps_per_second": 0.639, "step": 5800 }, { "epoch": 1.26, "learning_rate": 0.0001757406736689605, "loss": 0.66, "step": 5820 }, { "epoch": 1.26, "learning_rate": 0.00017530604853314014, "loss": 0.6682, "step": 5840 }, { "epoch": 1.26, "learning_rate": 0.00017487142339731978, "loss": 0.6589, "step": 5860 }, { "epoch": 1.27, "learning_rate": 0.00017443679826149944, "loss": 0.6691, "step": 5880 }, { "epoch": 1.27, "learning_rate": 0.00017400217312567908, "loss": 0.6726, "step": 5900 }, { "epoch": 1.28, "learning_rate": 0.00017356754798985872, "loss": 0.6628, "step": 5920 }, { "epoch": 1.28, "learning_rate": 0.00017313292285403838, "loss": 0.6719, "step": 5940 }, { "epoch": 1.29, "learning_rate": 0.00017269829771821802, "loss": 0.6648, "step": 5960 }, { "epoch": 1.29, "learning_rate": 0.00017226367258239765, "loss": 0.6594, "step": 5980 }, { "epoch": 1.29, "learning_rate": 0.00017182904744657732, "loss": 0.6717, "step": 6000 }, { "epoch": 1.29, "eval_loss": 0.6876093745231628, "eval_runtime": 50.1763, "eval_samples_per_second": 39.859, "eval_steps_per_second": 0.638, "step": 6000 }, { "epoch": 1.3, "learning_rate": 0.00017139442231075695, "loss": 0.6632, "step": 6020 }, { "epoch": 1.3, "learning_rate": 0.0001709597971749366, "loss": 0.6619, "step": 6040 }, { "epoch": 1.31, "learning_rate": 0.00017052517203911625, "loss": 0.667, "step": 6060 }, { "epoch": 1.31, "learning_rate": 0.0001700905469032959, "loss": 0.6625, "step": 6080 }, { "epoch": 1.32, "learning_rate": 0.00016965592176747553, "loss": 0.6661, "step": 6100 }, { "epoch": 1.32, "learning_rate": 0.0001692212966316552, "loss": 0.656, "step": 6120 }, { "epoch": 1.32, "learning_rate": 0.00016878667149583483, "loss": 0.6668, "step": 6140 }, { "epoch": 1.33, "learning_rate": 0.00016835204636001446, "loss": 0.6669, "step": 6160 }, { "epoch": 1.33, "learning_rate": 0.00016791742122419413, "loss": 0.6662, "step": 6180 }, { "epoch": 1.34, "learning_rate": 0.00016748279608837376, "loss": 0.6692, "step": 6200 }, { "epoch": 1.34, "eval_loss": 0.6869744658470154, "eval_runtime": 50.1517, "eval_samples_per_second": 39.879, "eval_steps_per_second": 0.638, "step": 6200 }, { "epoch": 1.34, "learning_rate": 0.0001670481709525534, "loss": 0.6571, "step": 6220 }, { "epoch": 1.35, "learning_rate": 0.00016661354581673306, "loss": 0.6659, "step": 6240 }, { "epoch": 1.35, "learning_rate": 0.0001661789206809127, "loss": 0.6622, "step": 6260 }, { "epoch": 1.35, "learning_rate": 0.00016574429554509234, "loss": 0.6522, "step": 6280 }, { "epoch": 1.36, "learning_rate": 0.000165309670409272, "loss": 0.667, "step": 6300 }, { "epoch": 1.36, "learning_rate": 0.00016487504527345164, "loss": 0.6644, "step": 6320 }, { "epoch": 1.37, "learning_rate": 0.00016444042013763127, "loss": 0.6625, "step": 6340 }, { "epoch": 1.37, "learning_rate": 0.00016400579500181094, "loss": 0.6686, "step": 6360 }, { "epoch": 1.38, "learning_rate": 0.00016357116986599057, "loss": 0.6562, "step": 6380 }, { "epoch": 1.38, "learning_rate": 0.0001631365447301702, "loss": 0.6595, "step": 6400 }, { "epoch": 1.38, "eval_loss": 0.685205340385437, "eval_runtime": 50.162, "eval_samples_per_second": 39.871, "eval_steps_per_second": 0.638, "step": 6400 }, { "epoch": 1.39, "learning_rate": 0.00016270191959434987, "loss": 0.6595, "step": 6420 }, { "epoch": 1.39, "learning_rate": 0.0001622672944585295, "loss": 0.6644, "step": 6440 }, { "epoch": 1.39, "learning_rate": 0.00016183266932270915, "loss": 0.6647, "step": 6460 }, { "epoch": 1.4, "learning_rate": 0.0001613980441868888, "loss": 0.6655, "step": 6480 }, { "epoch": 1.4, "learning_rate": 0.00016096341905106845, "loss": 0.6564, "step": 6500 }, { "epoch": 1.41, "learning_rate": 0.00016052879391524808, "loss": 0.6578, "step": 6520 }, { "epoch": 1.41, "learning_rate": 0.00016009416877942775, "loss": 0.6624, "step": 6540 }, { "epoch": 1.42, "learning_rate": 0.00015965954364360738, "loss": 0.6633, "step": 6560 }, { "epoch": 1.42, "learning_rate": 0.00015922491850778702, "loss": 0.6616, "step": 6580 }, { "epoch": 1.42, "learning_rate": 0.00015879029337196668, "loss": 0.6607, "step": 6600 }, { "epoch": 1.42, "eval_loss": 0.6847727298736572, "eval_runtime": 50.1562, "eval_samples_per_second": 39.875, "eval_steps_per_second": 0.638, "step": 6600 }, { "epoch": 1.43, "learning_rate": 0.00015835566823614632, "loss": 0.6564, "step": 6620 }, { "epoch": 1.43, "learning_rate": 0.00015792104310032596, "loss": 0.66, "step": 6640 }, { "epoch": 1.44, "learning_rate": 0.00015748641796450562, "loss": 0.6589, "step": 6660 }, { "epoch": 1.44, "learning_rate": 0.00015705179282868526, "loss": 0.6596, "step": 6680 }, { "epoch": 1.45, "learning_rate": 0.0001566171676928649, "loss": 0.6663, "step": 6700 }, { "epoch": 1.45, "learning_rate": 0.00015618254255704456, "loss": 0.6603, "step": 6720 }, { "epoch": 1.45, "learning_rate": 0.0001557479174212242, "loss": 0.6674, "step": 6740 }, { "epoch": 1.46, "learning_rate": 0.00015531329228540383, "loss": 0.6603, "step": 6760 }, { "epoch": 1.46, "learning_rate": 0.0001548786671495835, "loss": 0.6612, "step": 6780 }, { "epoch": 1.47, "learning_rate": 0.00015444404201376313, "loss": 0.6609, "step": 6800 }, { "epoch": 1.47, "eval_loss": 0.683903694152832, "eval_runtime": 50.079, "eval_samples_per_second": 39.937, "eval_steps_per_second": 0.639, "step": 6800 }, { "epoch": 1.47, "learning_rate": 0.00015400941687794277, "loss": 0.6557, "step": 6820 }, { "epoch": 1.48, "learning_rate": 0.00015357479174212243, "loss": 0.6627, "step": 6840 }, { "epoch": 1.48, "learning_rate": 0.00015314016660630207, "loss": 0.6667, "step": 6860 }, { "epoch": 1.48, "learning_rate": 0.0001527055414704817, "loss": 0.6633, "step": 6880 }, { "epoch": 1.49, "learning_rate": 0.00015227091633466137, "loss": 0.6565, "step": 6900 }, { "epoch": 1.49, "learning_rate": 0.000151836291198841, "loss": 0.6588, "step": 6920 }, { "epoch": 1.5, "learning_rate": 0.00015140166606302064, "loss": 0.6687, "step": 6940 }, { "epoch": 1.5, "learning_rate": 0.0001509670409272003, "loss": 0.6611, "step": 6960 }, { "epoch": 1.51, "learning_rate": 0.00015053241579137994, "loss": 0.6576, "step": 6980 }, { "epoch": 1.51, "learning_rate": 0.00015009779065555957, "loss": 0.6576, "step": 7000 }, { "epoch": 1.51, "eval_loss": 0.6830142736434937, "eval_runtime": 50.1233, "eval_samples_per_second": 39.902, "eval_steps_per_second": 0.638, "step": 7000 }, { "epoch": 1.51, "learning_rate": 0.0001496631655197392, "loss": 0.6617, "step": 7020 }, { "epoch": 1.52, "learning_rate": 0.00014922854038391885, "loss": 0.6533, "step": 7040 }, { "epoch": 1.52, "learning_rate": 0.0001487939152480985, "loss": 0.6524, "step": 7060 }, { "epoch": 1.53, "learning_rate": 0.00014835929011227815, "loss": 0.6597, "step": 7080 }, { "epoch": 1.53, "learning_rate": 0.00014792466497645778, "loss": 0.656, "step": 7100 }, { "epoch": 1.54, "learning_rate": 0.00014749003984063745, "loss": 0.6501, "step": 7120 }, { "epoch": 1.54, "learning_rate": 0.00014705541470481708, "loss": 0.6563, "step": 7140 }, { "epoch": 1.54, "learning_rate": 0.00014662078956899672, "loss": 0.6496, "step": 7160 }, { "epoch": 1.55, "learning_rate": 0.00014618616443317638, "loss": 0.6602, "step": 7180 }, { "epoch": 1.55, "learning_rate": 0.00014575153929735602, "loss": 0.6617, "step": 7200 }, { "epoch": 1.55, "eval_loss": 0.6818540096282959, "eval_runtime": 50.1175, "eval_samples_per_second": 39.906, "eval_steps_per_second": 0.639, "step": 7200 }, { "epoch": 1.56, "learning_rate": 0.00014531691416153566, "loss": 0.6655, "step": 7220 }, { "epoch": 1.56, "learning_rate": 0.00014488228902571532, "loss": 0.6544, "step": 7240 }, { "epoch": 1.57, "learning_rate": 0.00014444766388989496, "loss": 0.655, "step": 7260 }, { "epoch": 1.57, "learning_rate": 0.0001440130387540746, "loss": 0.6535, "step": 7280 }, { "epoch": 1.57, "learning_rate": 0.00014357841361825426, "loss": 0.6584, "step": 7300 }, { "epoch": 1.58, "learning_rate": 0.0001431437884824339, "loss": 0.6602, "step": 7320 }, { "epoch": 1.58, "learning_rate": 0.00014270916334661353, "loss": 0.6689, "step": 7340 }, { "epoch": 1.59, "learning_rate": 0.0001422745382107932, "loss": 0.6613, "step": 7360 }, { "epoch": 1.59, "learning_rate": 0.00014183991307497283, "loss": 0.659, "step": 7380 }, { "epoch": 1.6, "learning_rate": 0.00014140528793915247, "loss": 0.6463, "step": 7400 }, { "epoch": 1.6, "eval_loss": 0.681868851184845, "eval_runtime": 50.1388, "eval_samples_per_second": 39.889, "eval_steps_per_second": 0.638, "step": 7400 }, { "epoch": 1.6, "learning_rate": 0.00014097066280333213, "loss": 0.6617, "step": 7420 }, { "epoch": 1.61, "learning_rate": 0.00014053603766751177, "loss": 0.6648, "step": 7440 }, { "epoch": 1.61, "learning_rate": 0.0001401014125316914, "loss": 0.6528, "step": 7460 }, { "epoch": 1.61, "learning_rate": 0.00013966678739587107, "loss": 0.6655, "step": 7480 }, { "epoch": 1.62, "learning_rate": 0.0001392321622600507, "loss": 0.6609, "step": 7500 }, { "epoch": 1.62, "learning_rate": 0.00013879753712423034, "loss": 0.6528, "step": 7520 }, { "epoch": 1.63, "learning_rate": 0.00013836291198841, "loss": 0.6561, "step": 7540 }, { "epoch": 1.63, "learning_rate": 0.00013792828685258964, "loss": 0.6682, "step": 7560 }, { "epoch": 1.64, "learning_rate": 0.00013749366171676928, "loss": 0.6677, "step": 7580 }, { "epoch": 1.64, "learning_rate": 0.00013705903658094894, "loss": 0.6599, "step": 7600 }, { "epoch": 1.64, "eval_loss": 0.6807426810264587, "eval_runtime": 50.3308, "eval_samples_per_second": 39.737, "eval_steps_per_second": 0.636, "step": 7600 }, { "epoch": 1.64, "learning_rate": 0.00013662441144512855, "loss": 0.6525, "step": 7620 }, { "epoch": 1.65, "learning_rate": 0.0001361897863093082, "loss": 0.6574, "step": 7640 }, { "epoch": 1.65, "learning_rate": 0.00013575516117348785, "loss": 0.6516, "step": 7660 }, { "epoch": 1.66, "learning_rate": 0.00013532053603766749, "loss": 0.6533, "step": 7680 }, { "epoch": 1.66, "learning_rate": 0.00013488591090184715, "loss": 0.6577, "step": 7700 }, { "epoch": 1.67, "learning_rate": 0.00013445128576602679, "loss": 0.6592, "step": 7720 }, { "epoch": 1.67, "learning_rate": 0.00013401666063020642, "loss": 0.6585, "step": 7740 }, { "epoch": 1.67, "learning_rate": 0.00013358203549438609, "loss": 0.6607, "step": 7760 }, { "epoch": 1.68, "learning_rate": 0.00013314741035856572, "loss": 0.6617, "step": 7780 }, { "epoch": 1.68, "learning_rate": 0.00013271278522274536, "loss": 0.6443, "step": 7800 }, { "epoch": 1.68, "eval_loss": 0.6800745725631714, "eval_runtime": 50.165, "eval_samples_per_second": 39.868, "eval_steps_per_second": 0.638, "step": 7800 }, { "epoch": 1.69, "learning_rate": 0.00013227816008692502, "loss": 0.6587, "step": 7820 }, { "epoch": 1.69, "learning_rate": 0.00013184353495110466, "loss": 0.6613, "step": 7840 }, { "epoch": 1.7, "learning_rate": 0.0001314089098152843, "loss": 0.654, "step": 7860 }, { "epoch": 1.7, "learning_rate": 0.00013097428467946396, "loss": 0.6523, "step": 7880 }, { "epoch": 1.7, "learning_rate": 0.0001305396595436436, "loss": 0.6563, "step": 7900 }, { "epoch": 1.71, "learning_rate": 0.00013010503440782323, "loss": 0.6524, "step": 7920 }, { "epoch": 1.71, "learning_rate": 0.0001296704092720029, "loss": 0.6523, "step": 7940 }, { "epoch": 1.72, "learning_rate": 0.00012923578413618253, "loss": 0.6493, "step": 7960 }, { "epoch": 1.72, "learning_rate": 0.00012880115900036217, "loss": 0.6538, "step": 7980 }, { "epoch": 1.73, "learning_rate": 0.00012836653386454183, "loss": 0.6512, "step": 8000 }, { "epoch": 1.73, "eval_loss": 0.6790341734886169, "eval_runtime": 50.1317, "eval_samples_per_second": 39.895, "eval_steps_per_second": 0.638, "step": 8000 }, { "epoch": 1.73, "learning_rate": 0.00012793190872872147, "loss": 0.6562, "step": 8020 }, { "epoch": 1.73, "learning_rate": 0.0001274972835929011, "loss": 0.6556, "step": 8040 }, { "epoch": 1.74, "learning_rate": 0.00012706265845708077, "loss": 0.65, "step": 8060 }, { "epoch": 1.74, "learning_rate": 0.0001266280333212604, "loss": 0.661, "step": 8080 }, { "epoch": 1.75, "learning_rate": 0.00012619340818544004, "loss": 0.655, "step": 8100 }, { "epoch": 1.75, "learning_rate": 0.0001257587830496197, "loss": 0.6534, "step": 8120 }, { "epoch": 1.76, "learning_rate": 0.00012532415791379934, "loss": 0.6517, "step": 8140 }, { "epoch": 1.76, "learning_rate": 0.00012488953277797898, "loss": 0.6605, "step": 8160 }, { "epoch": 1.76, "learning_rate": 0.00012445490764215864, "loss": 0.6556, "step": 8180 }, { "epoch": 1.77, "learning_rate": 0.00012402028250633828, "loss": 0.6492, "step": 8200 }, { "epoch": 1.77, "eval_loss": 0.6781870126724243, "eval_runtime": 50.0809, "eval_samples_per_second": 39.935, "eval_steps_per_second": 0.639, "step": 8200 }, { "epoch": 1.77, "learning_rate": 0.00012358565737051791, "loss": 0.6541, "step": 8220 }, { "epoch": 1.78, "learning_rate": 0.00012315103223469758, "loss": 0.6517, "step": 8240 }, { "epoch": 1.78, "learning_rate": 0.00012271640709887721, "loss": 0.6483, "step": 8260 }, { "epoch": 1.79, "learning_rate": 0.00012228178196305685, "loss": 0.6619, "step": 8280 }, { "epoch": 1.79, "learning_rate": 0.0001218471568272365, "loss": 0.6556, "step": 8300 }, { "epoch": 1.8, "learning_rate": 0.00012141253169141615, "loss": 0.6471, "step": 8320 }, { "epoch": 1.8, "learning_rate": 0.00012097790655559579, "loss": 0.6611, "step": 8340 }, { "epoch": 1.8, "learning_rate": 0.00012054328141977544, "loss": 0.6506, "step": 8360 }, { "epoch": 1.81, "learning_rate": 0.00012010865628395509, "loss": 0.6611, "step": 8380 }, { "epoch": 1.81, "learning_rate": 0.00011967403114813472, "loss": 0.6557, "step": 8400 }, { "epoch": 1.81, "eval_loss": 0.6776989102363586, "eval_runtime": 50.1344, "eval_samples_per_second": 39.893, "eval_steps_per_second": 0.638, "step": 8400 }, { "epoch": 1.82, "learning_rate": 0.00011923940601231437, "loss": 0.6504, "step": 8420 }, { "epoch": 1.82, "learning_rate": 0.00011880478087649402, "loss": 0.6552, "step": 8440 }, { "epoch": 1.83, "learning_rate": 0.00011839188699746468, "loss": 0.641, "step": 8460 }, { "epoch": 1.83, "learning_rate": 0.00011795726186164432, "loss": 0.6535, "step": 8480 }, { "epoch": 1.83, "learning_rate": 0.00011752263672582397, "loss": 0.6568, "step": 8500 }, { "epoch": 1.84, "learning_rate": 0.00011708801159000362, "loss": 0.6621, "step": 8520 }, { "epoch": 1.84, "learning_rate": 0.00011665338645418325, "loss": 0.6607, "step": 8540 }, { "epoch": 1.85, "learning_rate": 0.0001162187613183629, "loss": 0.6516, "step": 8560 }, { "epoch": 1.85, "learning_rate": 0.00011578413618254255, "loss": 0.6497, "step": 8580 }, { "epoch": 1.86, "learning_rate": 0.00011534951104672219, "loss": 0.6559, "step": 8600 }, { "epoch": 1.86, "eval_loss": 0.6773191094398499, "eval_runtime": 50.1605, "eval_samples_per_second": 39.872, "eval_steps_per_second": 0.638, "step": 8600 }, { "epoch": 1.86, "learning_rate": 0.00011491488591090184, "loss": 0.6595, "step": 8620 }, { "epoch": 1.86, "learning_rate": 0.00011448026077508149, "loss": 0.6495, "step": 8640 }, { "epoch": 1.87, "learning_rate": 0.00011404563563926113, "loss": 0.6518, "step": 8660 }, { "epoch": 1.87, "learning_rate": 0.00011361101050344078, "loss": 0.6511, "step": 8680 }, { "epoch": 1.88, "learning_rate": 0.00011317638536762043, "loss": 0.6495, "step": 8700 }, { "epoch": 1.88, "learning_rate": 0.00011274176023180006, "loss": 0.6485, "step": 8720 }, { "epoch": 1.89, "learning_rate": 0.00011230713509597971, "loss": 0.6543, "step": 8740 }, { "epoch": 1.89, "learning_rate": 0.00011187250996015936, "loss": 0.6509, "step": 8760 }, { "epoch": 1.89, "learning_rate": 0.000111437884824339, "loss": 0.656, "step": 8780 }, { "epoch": 1.9, "learning_rate": 0.00011100325968851865, "loss": 0.6557, "step": 8800 }, { "epoch": 1.9, "eval_loss": 0.6773696541786194, "eval_runtime": 50.1296, "eval_samples_per_second": 39.897, "eval_steps_per_second": 0.638, "step": 8800 }, { "epoch": 1.9, "learning_rate": 0.0001105686345526983, "loss": 0.6509, "step": 8820 }, { "epoch": 1.91, "learning_rate": 0.00011013400941687794, "loss": 0.65, "step": 8840 }, { "epoch": 1.91, "learning_rate": 0.00010969938428105759, "loss": 0.6447, "step": 8860 }, { "epoch": 1.92, "learning_rate": 0.00010926475914523724, "loss": 0.6563, "step": 8880 }, { "epoch": 1.92, "learning_rate": 0.00010883013400941687, "loss": 0.6545, "step": 8900 }, { "epoch": 1.92, "learning_rate": 0.00010839550887359652, "loss": 0.6509, "step": 8920 }, { "epoch": 1.93, "learning_rate": 0.00010796088373777617, "loss": 0.6434, "step": 8940 }, { "epoch": 1.93, "learning_rate": 0.00010752625860195581, "loss": 0.6412, "step": 8960 }, { "epoch": 1.94, "learning_rate": 0.00010709163346613546, "loss": 0.6512, "step": 8980 }, { "epoch": 1.94, "learning_rate": 0.00010665700833031508, "loss": 0.6478, "step": 9000 }, { "epoch": 1.94, "eval_loss": 0.6760911345481873, "eval_runtime": 50.1795, "eval_samples_per_second": 39.857, "eval_steps_per_second": 0.638, "step": 9000 }, { "epoch": 1.95, "learning_rate": 0.00010622238319449473, "loss": 0.6545, "step": 9020 }, { "epoch": 1.95, "learning_rate": 0.00010578775805867438, "loss": 0.6468, "step": 9040 }, { "epoch": 1.95, "learning_rate": 0.00010535313292285402, "loss": 0.6527, "step": 9060 }, { "epoch": 1.96, "learning_rate": 0.00010491850778703367, "loss": 0.6621, "step": 9080 }, { "epoch": 1.96, "learning_rate": 0.00010448388265121332, "loss": 0.6496, "step": 9100 }, { "epoch": 1.97, "learning_rate": 0.00010404925751539295, "loss": 0.6512, "step": 9120 }, { "epoch": 1.97, "learning_rate": 0.0001036146323795726, "loss": 0.6491, "step": 9140 }, { "epoch": 1.98, "learning_rate": 0.00010318000724375225, "loss": 0.6482, "step": 9160 }, { "epoch": 1.98, "learning_rate": 0.00010274538210793189, "loss": 0.6456, "step": 9180 }, { "epoch": 1.98, "learning_rate": 0.00010231075697211154, "loss": 0.6458, "step": 9200 }, { "epoch": 1.98, "eval_loss": 0.6748936772346497, "eval_runtime": 50.1856, "eval_samples_per_second": 39.852, "eval_steps_per_second": 0.638, "step": 9200 }, { "epoch": 1.99, "learning_rate": 0.00010187613183629119, "loss": 0.6473, "step": 9220 }, { "epoch": 1.99, "learning_rate": 0.00010144150670047083, "loss": 0.6496, "step": 9240 }, { "epoch": 2.0, "learning_rate": 0.00010100688156465048, "loss": 0.6566, "step": 9260 }, { "epoch": 2.0, "learning_rate": 0.00010057225642883013, "loss": 0.6475, "step": 9280 }, { "epoch": 2.01, "learning_rate": 0.00010013763129300976, "loss": 0.6536, "step": 9300 }, { "epoch": 2.01, "learning_rate": 9.970300615718941e-05, "loss": 0.646, "step": 9320 }, { "epoch": 2.02, "learning_rate": 9.926838102136906e-05, "loss": 0.6503, "step": 9340 }, { "epoch": 2.02, "learning_rate": 9.88337558855487e-05, "loss": 0.6527, "step": 9360 }, { "epoch": 2.02, "learning_rate": 9.839913074972835e-05, "loss": 0.6514, "step": 9380 }, { "epoch": 2.03, "learning_rate": 9.7964505613908e-05, "loss": 0.6548, "step": 9400 }, { "epoch": 2.03, "eval_loss": 0.6744834780693054, "eval_runtime": 50.1696, "eval_samples_per_second": 39.865, "eval_steps_per_second": 0.638, "step": 9400 }, { "epoch": 2.03, "learning_rate": 9.752988047808764e-05, "loss": 0.6483, "step": 9420 }, { "epoch": 2.04, "learning_rate": 9.709525534226729e-05, "loss": 0.6522, "step": 9440 }, { "epoch": 2.04, "learning_rate": 9.666063020644694e-05, "loss": 0.6538, "step": 9460 }, { "epoch": 2.05, "learning_rate": 9.622600507062657e-05, "loss": 0.6449, "step": 9480 }, { "epoch": 2.05, "learning_rate": 9.579137993480622e-05, "loss": 0.6451, "step": 9500 }, { "epoch": 2.05, "learning_rate": 9.535675479898587e-05, "loss": 0.6355, "step": 9520 }, { "epoch": 2.06, "learning_rate": 9.492212966316551e-05, "loss": 0.6494, "step": 9540 }, { "epoch": 2.06, "learning_rate": 9.448750452734516e-05, "loss": 0.6435, "step": 9560 }, { "epoch": 2.07, "learning_rate": 9.405287939152481e-05, "loss": 0.651, "step": 9580 }, { "epoch": 2.07, "learning_rate": 9.361825425570445e-05, "loss": 0.6493, "step": 9600 }, { "epoch": 2.07, "eval_loss": 0.674017071723938, "eval_runtime": 50.1402, "eval_samples_per_second": 39.888, "eval_steps_per_second": 0.638, "step": 9600 }, { "epoch": 2.08, "learning_rate": 9.31836291198841e-05, "loss": 0.6469, "step": 9620 }, { "epoch": 2.08, "learning_rate": 9.274900398406375e-05, "loss": 0.65, "step": 9640 }, { "epoch": 2.08, "learning_rate": 9.231437884824338e-05, "loss": 0.6536, "step": 9660 }, { "epoch": 2.09, "learning_rate": 9.187975371242303e-05, "loss": 0.6488, "step": 9680 }, { "epoch": 2.09, "learning_rate": 9.144512857660268e-05, "loss": 0.6391, "step": 9700 }, { "epoch": 2.1, "learning_rate": 9.101050344078232e-05, "loss": 0.644, "step": 9720 }, { "epoch": 2.1, "learning_rate": 9.057587830496197e-05, "loss": 0.6507, "step": 9740 }, { "epoch": 2.11, "learning_rate": 9.014125316914162e-05, "loss": 0.6404, "step": 9760 }, { "epoch": 2.11, "learning_rate": 8.970662803332126e-05, "loss": 0.6509, "step": 9780 }, { "epoch": 2.11, "learning_rate": 8.92720028975009e-05, "loss": 0.6435, "step": 9800 }, { "epoch": 2.11, "eval_loss": 0.6735255122184753, "eval_runtime": 50.1703, "eval_samples_per_second": 39.864, "eval_steps_per_second": 0.638, "step": 9800 }, { "epoch": 2.12, "learning_rate": 8.883737776168056e-05, "loss": 0.6374, "step": 9820 }, { "epoch": 2.12, "learning_rate": 8.840275262586019e-05, "loss": 0.6445, "step": 9840 }, { "epoch": 2.13, "learning_rate": 8.796812749003983e-05, "loss": 0.6495, "step": 9860 }, { "epoch": 2.13, "learning_rate": 8.753350235421946e-05, "loss": 0.6482, "step": 9880 }, { "epoch": 2.14, "learning_rate": 8.709887721839911e-05, "loss": 0.6441, "step": 9900 }, { "epoch": 2.14, "learning_rate": 8.666425208257877e-05, "loss": 0.6525, "step": 9920 }, { "epoch": 2.14, "learning_rate": 8.62296269467584e-05, "loss": 0.6453, "step": 9940 }, { "epoch": 2.15, "learning_rate": 8.579500181093805e-05, "loss": 0.6498, "step": 9960 }, { "epoch": 2.15, "learning_rate": 8.53603766751177e-05, "loss": 0.6471, "step": 9980 }, { "epoch": 2.16, "learning_rate": 8.492575153929734e-05, "loss": 0.6419, "step": 10000 }, { "epoch": 2.16, "eval_loss": 0.6730753779411316, "eval_runtime": 50.1885, "eval_samples_per_second": 39.85, "eval_steps_per_second": 0.638, "step": 10000 }, { "epoch": 2.16, "learning_rate": 8.449112640347699e-05, "loss": 0.6447, "step": 10020 }, { "epoch": 2.17, "learning_rate": 8.405650126765664e-05, "loss": 0.6444, "step": 10040 }, { "epoch": 2.17, "learning_rate": 8.362187613183627e-05, "loss": 0.6393, "step": 10060 }, { "epoch": 2.17, "learning_rate": 8.318725099601592e-05, "loss": 0.6464, "step": 10080 }, { "epoch": 2.18, "learning_rate": 8.275262586019557e-05, "loss": 0.6458, "step": 10100 }, { "epoch": 2.18, "learning_rate": 8.231800072437521e-05, "loss": 0.6402, "step": 10120 }, { "epoch": 2.19, "learning_rate": 8.188337558855486e-05, "loss": 0.6409, "step": 10140 }, { "epoch": 2.19, "learning_rate": 8.144875045273451e-05, "loss": 0.6512, "step": 10160 }, { "epoch": 2.2, "learning_rate": 8.101412531691415e-05, "loss": 0.6498, "step": 10180 }, { "epoch": 2.2, "learning_rate": 8.05795001810938e-05, "loss": 0.6393, "step": 10200 }, { "epoch": 2.2, "eval_loss": 0.6726437211036682, "eval_runtime": 50.1492, "eval_samples_per_second": 39.881, "eval_steps_per_second": 0.638, "step": 10200 }, { "epoch": 2.2, "learning_rate": 8.014487504527345e-05, "loss": 0.6458, "step": 10220 }, { "epoch": 2.21, "learning_rate": 7.971024990945308e-05, "loss": 0.6466, "step": 10240 }, { "epoch": 2.21, "learning_rate": 7.927562477363273e-05, "loss": 0.644, "step": 10260 }, { "epoch": 2.22, "learning_rate": 7.884099963781238e-05, "loss": 0.6467, "step": 10280 }, { "epoch": 2.22, "learning_rate": 7.840637450199202e-05, "loss": 0.6436, "step": 10300 }, { "epoch": 2.23, "learning_rate": 7.797174936617167e-05, "loss": 0.6422, "step": 10320 }, { "epoch": 2.23, "learning_rate": 7.753712423035132e-05, "loss": 0.645, "step": 10340 }, { "epoch": 2.24, "learning_rate": 7.710249909453096e-05, "loss": 0.6423, "step": 10360 }, { "epoch": 2.24, "learning_rate": 7.666787395871061e-05, "loss": 0.6557, "step": 10380 }, { "epoch": 2.24, "learning_rate": 7.623324882289026e-05, "loss": 0.646, "step": 10400 }, { "epoch": 2.24, "eval_loss": 0.6725419759750366, "eval_runtime": 50.1975, "eval_samples_per_second": 39.843, "eval_steps_per_second": 0.637, "step": 10400 }, { "epoch": 2.25, "learning_rate": 7.57986236870699e-05, "loss": 0.6503, "step": 10420 }, { "epoch": 2.25, "learning_rate": 7.536399855124954e-05, "loss": 0.6428, "step": 10440 }, { "epoch": 2.26, "learning_rate": 7.49293734154292e-05, "loss": 0.6438, "step": 10460 }, { "epoch": 2.26, "learning_rate": 7.449474827960883e-05, "loss": 0.6427, "step": 10480 }, { "epoch": 2.27, "learning_rate": 7.406012314378847e-05, "loss": 0.6458, "step": 10500 }, { "epoch": 2.27, "learning_rate": 7.362549800796812e-05, "loss": 0.6423, "step": 10520 }, { "epoch": 2.27, "learning_rate": 7.319087287214777e-05, "loss": 0.6466, "step": 10540 }, { "epoch": 2.28, "learning_rate": 7.27562477363274e-05, "loss": 0.6394, "step": 10560 }, { "epoch": 2.28, "learning_rate": 7.232162260050705e-05, "loss": 0.6362, "step": 10580 }, { "epoch": 2.29, "learning_rate": 7.18869974646867e-05, "loss": 0.6399, "step": 10600 }, { "epoch": 2.29, "eval_loss": 0.6719211935997009, "eval_runtime": 50.1808, "eval_samples_per_second": 39.856, "eval_steps_per_second": 0.638, "step": 10600 }, { "epoch": 2.29, "learning_rate": 7.145237232886634e-05, "loss": 0.6378, "step": 10620 }, { "epoch": 2.3, "learning_rate": 7.101774719304599e-05, "loss": 0.634, "step": 10640 }, { "epoch": 2.3, "learning_rate": 7.058312205722564e-05, "loss": 0.6374, "step": 10660 }, { "epoch": 2.3, "learning_rate": 7.014849692140528e-05, "loss": 0.6464, "step": 10680 }, { "epoch": 2.31, "learning_rate": 6.971387178558493e-05, "loss": 0.643, "step": 10700 }, { "epoch": 2.31, "learning_rate": 6.927924664976458e-05, "loss": 0.6384, "step": 10720 }, { "epoch": 2.32, "learning_rate": 6.884462151394421e-05, "loss": 0.6451, "step": 10740 }, { "epoch": 2.32, "learning_rate": 6.840999637812386e-05, "loss": 0.6465, "step": 10760 }, { "epoch": 2.33, "learning_rate": 6.799710249909452e-05, "loss": 0.646, "step": 10780 }, { "epoch": 2.33, "learning_rate": 6.756247736327417e-05, "loss": 0.6525, "step": 10800 }, { "epoch": 2.33, "eval_loss": 0.6714358925819397, "eval_runtime": 50.1294, "eval_samples_per_second": 39.897, "eval_steps_per_second": 0.638, "step": 10800 }, { "epoch": 2.33, "learning_rate": 6.712785222745382e-05, "loss": 0.6423, "step": 10820 }, { "epoch": 2.34, "learning_rate": 6.669322709163345e-05, "loss": 0.6449, "step": 10840 }, { "epoch": 2.34, "learning_rate": 6.62586019558131e-05, "loss": 0.6325, "step": 10860 }, { "epoch": 2.35, "learning_rate": 6.582397681999275e-05, "loss": 0.6558, "step": 10880 }, { "epoch": 2.35, "learning_rate": 6.538935168417239e-05, "loss": 0.6419, "step": 10900 }, { "epoch": 2.36, "learning_rate": 6.495472654835204e-05, "loss": 0.6466, "step": 10920 }, { "epoch": 2.36, "learning_rate": 6.452010141253169e-05, "loss": 0.6357, "step": 10940 }, { "epoch": 2.36, "learning_rate": 6.408547627671133e-05, "loss": 0.6366, "step": 10960 }, { "epoch": 2.37, "learning_rate": 6.365085114089098e-05, "loss": 0.6466, "step": 10980 }, { "epoch": 2.37, "learning_rate": 6.321622600507063e-05, "loss": 0.6542, "step": 11000 }, { "epoch": 2.37, "eval_loss": 0.6710445880889893, "eval_runtime": 50.2479, "eval_samples_per_second": 39.803, "eval_steps_per_second": 0.637, "step": 11000 }, { "epoch": 2.38, "learning_rate": 6.278160086925026e-05, "loss": 0.6481, "step": 11020 }, { "epoch": 2.38, "learning_rate": 6.23469757334299e-05, "loss": 0.6425, "step": 11040 }, { "epoch": 2.39, "learning_rate": 6.191235059760955e-05, "loss": 0.6439, "step": 11060 }, { "epoch": 2.39, "learning_rate": 6.14777254617892e-05, "loss": 0.6424, "step": 11080 }, { "epoch": 2.39, "learning_rate": 6.104310032596884e-05, "loss": 0.6404, "step": 11100 }, { "epoch": 2.4, "learning_rate": 6.060847519014849e-05, "loss": 0.6387, "step": 11120 }, { "epoch": 2.4, "learning_rate": 6.017385005432814e-05, "loss": 0.6462, "step": 11140 }, { "epoch": 2.41, "learning_rate": 5.973922491850778e-05, "loss": 0.6431, "step": 11160 }, { "epoch": 2.41, "learning_rate": 5.9304599782687424e-05, "loss": 0.638, "step": 11180 }, { "epoch": 2.42, "learning_rate": 5.8869974646867074e-05, "loss": 0.6344, "step": 11200 }, { "epoch": 2.42, "eval_loss": 0.6704220771789551, "eval_runtime": 50.1558, "eval_samples_per_second": 39.876, "eval_steps_per_second": 0.638, "step": 11200 }, { "epoch": 2.42, "learning_rate": 5.843534951104672e-05, "loss": 0.6448, "step": 11220 }, { "epoch": 2.43, "learning_rate": 5.800072437522636e-05, "loss": 0.6449, "step": 11240 }, { "epoch": 2.43, "learning_rate": 5.756609923940601e-05, "loss": 0.6399, "step": 11260 }, { "epoch": 2.43, "learning_rate": 5.7131474103585654e-05, "loss": 0.638, "step": 11280 }, { "epoch": 2.44, "learning_rate": 5.66968489677653e-05, "loss": 0.6418, "step": 11300 }, { "epoch": 2.44, "learning_rate": 5.626222383194495e-05, "loss": 0.6482, "step": 11320 }, { "epoch": 2.45, "learning_rate": 5.582759869612459e-05, "loss": 0.6392, "step": 11340 }, { "epoch": 2.45, "learning_rate": 5.5392973560304233e-05, "loss": 0.6363, "step": 11360 }, { "epoch": 2.46, "learning_rate": 5.4958348424483883e-05, "loss": 0.6503, "step": 11380 }, { "epoch": 2.46, "learning_rate": 5.452372328866353e-05, "loss": 0.6453, "step": 11400 }, { "epoch": 2.46, "eval_loss": 0.670009195804596, "eval_runtime": 50.155, "eval_samples_per_second": 39.876, "eval_steps_per_second": 0.638, "step": 11400 }, { "epoch": 2.46, "learning_rate": 5.408909815284317e-05, "loss": 0.6384, "step": 11420 }, { "epoch": 2.47, "learning_rate": 5.365447301702282e-05, "loss": 0.6449, "step": 11440 }, { "epoch": 2.47, "learning_rate": 5.3219847881202456e-05, "loss": 0.6406, "step": 11460 }, { "epoch": 2.48, "learning_rate": 5.27852227453821e-05, "loss": 0.6363, "step": 11480 }, { "epoch": 2.48, "learning_rate": 5.235059760956174e-05, "loss": 0.6482, "step": 11500 }, { "epoch": 2.49, "learning_rate": 5.191597247374139e-05, "loss": 0.6503, "step": 11520 }, { "epoch": 2.49, "learning_rate": 5.1481347337921036e-05, "loss": 0.6479, "step": 11540 }, { "epoch": 2.49, "learning_rate": 5.10684534588917e-05, "loss": 0.6437, "step": 11560 }, { "epoch": 2.5, "learning_rate": 5.063382832307134e-05, "loss": 0.6398, "step": 11580 }, { "epoch": 2.5, "learning_rate": 5.0199203187250985e-05, "loss": 0.6456, "step": 11600 }, { "epoch": 2.5, "eval_loss": 0.6702134013175964, "eval_runtime": 50.1834, "eval_samples_per_second": 39.854, "eval_steps_per_second": 0.638, "step": 11600 }, { "epoch": 2.51, "learning_rate": 4.9764578051430635e-05, "loss": 0.646, "step": 11620 }, { "epoch": 2.51, "learning_rate": 4.932995291561028e-05, "loss": 0.6375, "step": 11640 }, { "epoch": 2.52, "learning_rate": 4.889532777978992e-05, "loss": 0.6393, "step": 11660 }, { "epoch": 2.52, "learning_rate": 4.846070264396957e-05, "loss": 0.638, "step": 11680 }, { "epoch": 2.52, "learning_rate": 4.8026077508149215e-05, "loss": 0.6411, "step": 11700 }, { "epoch": 2.53, "learning_rate": 4.759145237232886e-05, "loss": 0.6467, "step": 11720 }, { "epoch": 2.53, "learning_rate": 4.715682723650851e-05, "loss": 0.6369, "step": 11740 }, { "epoch": 2.54, "learning_rate": 4.672220210068815e-05, "loss": 0.637, "step": 11760 }, { "epoch": 2.54, "learning_rate": 4.6287576964867795e-05, "loss": 0.6486, "step": 11780 }, { "epoch": 2.55, "learning_rate": 4.5852951829047445e-05, "loss": 0.637, "step": 11800 }, { "epoch": 2.55, "eval_loss": 0.6698750257492065, "eval_runtime": 50.1539, "eval_samples_per_second": 39.877, "eval_steps_per_second": 0.638, "step": 11800 }, { "epoch": 2.55, "learning_rate": 4.541832669322709e-05, "loss": 0.639, "step": 11820 }, { "epoch": 2.55, "learning_rate": 4.498370155740673e-05, "loss": 0.6366, "step": 11840 }, { "epoch": 2.56, "learning_rate": 4.454907642158638e-05, "loss": 0.6409, "step": 11860 }, { "epoch": 2.56, "learning_rate": 4.4114451285766025e-05, "loss": 0.6394, "step": 11880 }, { "epoch": 2.57, "learning_rate": 4.367982614994567e-05, "loss": 0.6351, "step": 11900 }, { "epoch": 2.57, "learning_rate": 4.324520101412532e-05, "loss": 0.6391, "step": 11920 }, { "epoch": 2.58, "learning_rate": 4.281057587830496e-05, "loss": 0.6267, "step": 11940 }, { "epoch": 2.58, "learning_rate": 4.2375950742484604e-05, "loss": 0.6461, "step": 11960 }, { "epoch": 2.58, "learning_rate": 4.194132560666425e-05, "loss": 0.6483, "step": 11980 }, { "epoch": 2.59, "learning_rate": 4.150670047084389e-05, "loss": 0.6461, "step": 12000 }, { "epoch": 2.59, "eval_loss": 0.6692882180213928, "eval_runtime": 50.1673, "eval_samples_per_second": 39.867, "eval_steps_per_second": 0.638, "step": 12000 }, { "epoch": 2.59, "learning_rate": 4.1072075335023534e-05, "loss": 0.6429, "step": 12020 }, { "epoch": 2.6, "learning_rate": 4.0637450199203184e-05, "loss": 0.6416, "step": 12040 }, { "epoch": 2.6, "learning_rate": 4.020282506338283e-05, "loss": 0.6356, "step": 12060 }, { "epoch": 2.61, "learning_rate": 3.976819992756247e-05, "loss": 0.6402, "step": 12080 }, { "epoch": 2.61, "learning_rate": 3.933357479174212e-05, "loss": 0.6395, "step": 12100 }, { "epoch": 2.61, "learning_rate": 3.8898949655921764e-05, "loss": 0.6432, "step": 12120 }, { "epoch": 2.62, "learning_rate": 3.846432452010141e-05, "loss": 0.6386, "step": 12140 }, { "epoch": 2.62, "learning_rate": 3.802969938428106e-05, "loss": 0.6396, "step": 12160 }, { "epoch": 2.63, "learning_rate": 3.75950742484607e-05, "loss": 0.6423, "step": 12180 }, { "epoch": 2.63, "learning_rate": 3.7160449112640344e-05, "loss": 0.649, "step": 12200 }, { "epoch": 2.63, "eval_loss": 0.6691960096359253, "eval_runtime": 50.1649, "eval_samples_per_second": 39.869, "eval_steps_per_second": 0.638, "step": 12200 }, { "epoch": 2.64, "learning_rate": 3.672582397681999e-05, "loss": 0.6547, "step": 12220 }, { "epoch": 2.64, "learning_rate": 3.629119884099964e-05, "loss": 0.642, "step": 12240 }, { "epoch": 2.65, "learning_rate": 3.585657370517928e-05, "loss": 0.634, "step": 12260 }, { "epoch": 2.65, "learning_rate": 3.542194856935892e-05, "loss": 0.6447, "step": 12280 }, { "epoch": 2.65, "learning_rate": 3.498732343353857e-05, "loss": 0.6285, "step": 12300 }, { "epoch": 2.66, "learning_rate": 3.455269829771822e-05, "loss": 0.6436, "step": 12320 }, { "epoch": 2.66, "learning_rate": 3.411807316189786e-05, "loss": 0.6349, "step": 12340 }, { "epoch": 2.67, "learning_rate": 3.36834480260775e-05, "loss": 0.6425, "step": 12360 }, { "epoch": 2.67, "learning_rate": 3.324882289025715e-05, "loss": 0.6393, "step": 12380 }, { "epoch": 2.68, "learning_rate": 3.2814197754436796e-05, "loss": 0.6367, "step": 12400 }, { "epoch": 2.68, "eval_loss": 0.6687243580818176, "eval_runtime": 50.3508, "eval_samples_per_second": 39.721, "eval_steps_per_second": 0.636, "step": 12400 }, { "epoch": 2.68, "learning_rate": 3.237957261861644e-05, "loss": 0.6386, "step": 12420 }, { "epoch": 2.68, "learning_rate": 3.194494748279609e-05, "loss": 0.6526, "step": 12440 }, { "epoch": 2.69, "learning_rate": 3.151032234697573e-05, "loss": 0.6357, "step": 12460 }, { "epoch": 2.69, "learning_rate": 3.1075697211155376e-05, "loss": 0.6353, "step": 12480 }, { "epoch": 2.7, "learning_rate": 3.0641072075335026e-05, "loss": 0.6449, "step": 12500 }, { "epoch": 2.7, "learning_rate": 3.0206446939514663e-05, "loss": 0.6425, "step": 12520 }, { "epoch": 2.71, "learning_rate": 2.977182180369431e-05, "loss": 0.6374, "step": 12540 }, { "epoch": 2.71, "learning_rate": 2.9337196667873956e-05, "loss": 0.6324, "step": 12560 }, { "epoch": 2.71, "learning_rate": 2.89025715320536e-05, "loss": 0.6502, "step": 12580 }, { "epoch": 2.72, "learning_rate": 2.8467946396233246e-05, "loss": 0.637, "step": 12600 }, { "epoch": 2.72, "eval_loss": 0.6683821082115173, "eval_runtime": 50.2054, "eval_samples_per_second": 39.836, "eval_steps_per_second": 0.637, "step": 12600 }, { "epoch": 2.72, "learning_rate": 2.8033321260412892e-05, "loss": 0.647, "step": 12620 }, { "epoch": 2.73, "learning_rate": 2.7598696124592536e-05, "loss": 0.632, "step": 12640 }, { "epoch": 2.73, "learning_rate": 2.7164070988772182e-05, "loss": 0.6411, "step": 12660 }, { "epoch": 2.74, "learning_rate": 2.672944585295183e-05, "loss": 0.632, "step": 12680 }, { "epoch": 2.74, "learning_rate": 2.6294820717131472e-05, "loss": 0.6389, "step": 12700 }, { "epoch": 2.74, "learning_rate": 2.586019558131112e-05, "loss": 0.6337, "step": 12720 }, { "epoch": 2.75, "learning_rate": 2.542557044549076e-05, "loss": 0.6439, "step": 12740 }, { "epoch": 2.75, "learning_rate": 2.4990945309670405e-05, "loss": 0.6364, "step": 12760 }, { "epoch": 2.76, "learning_rate": 2.4556320173850052e-05, "loss": 0.6402, "step": 12780 }, { "epoch": 2.76, "learning_rate": 2.4121695038029695e-05, "loss": 0.6376, "step": 12800 }, { "epoch": 2.76, "eval_loss": 0.6680713295936584, "eval_runtime": 50.1757, "eval_samples_per_second": 39.86, "eval_steps_per_second": 0.638, "step": 12800 }, { "epoch": 2.77, "learning_rate": 2.3687069902209342e-05, "loss": 0.6316, "step": 12820 }, { "epoch": 2.77, "learning_rate": 2.325244476638899e-05, "loss": 0.6393, "step": 12840 }, { "epoch": 2.77, "learning_rate": 2.281781963056863e-05, "loss": 0.6372, "step": 12860 }, { "epoch": 2.78, "learning_rate": 2.2383194494748278e-05, "loss": 0.6466, "step": 12880 }, { "epoch": 2.78, "learning_rate": 2.1948569358927925e-05, "loss": 0.6392, "step": 12900 }, { "epoch": 2.79, "learning_rate": 2.1513944223107568e-05, "loss": 0.6389, "step": 12920 }, { "epoch": 2.79, "learning_rate": 2.107931908728721e-05, "loss": 0.64, "step": 12940 }, { "epoch": 2.8, "learning_rate": 2.0644693951466858e-05, "loss": 0.6362, "step": 12960 }, { "epoch": 2.8, "learning_rate": 2.02100688156465e-05, "loss": 0.6364, "step": 12980 }, { "epoch": 2.8, "learning_rate": 1.9775443679826148e-05, "loss": 0.6372, "step": 13000 }, { "epoch": 2.8, "eval_loss": 0.6680414080619812, "eval_runtime": 50.2211, "eval_samples_per_second": 39.824, "eval_steps_per_second": 0.637, "step": 13000 }, { "epoch": 2.81, "learning_rate": 1.9340818544005794e-05, "loss": 0.6336, "step": 13020 }, { "epoch": 2.81, "learning_rate": 1.8906193408185438e-05, "loss": 0.6348, "step": 13040 }, { "epoch": 2.82, "learning_rate": 1.8471568272365084e-05, "loss": 0.6338, "step": 13060 }, { "epoch": 2.82, "learning_rate": 1.8036943136544728e-05, "loss": 0.6396, "step": 13080 }, { "epoch": 2.83, "learning_rate": 1.7602318000724374e-05, "loss": 0.641, "step": 13100 }, { "epoch": 2.83, "learning_rate": 1.7167692864904017e-05, "loss": 0.6369, "step": 13120 }, { "epoch": 2.83, "learning_rate": 1.6733067729083664e-05, "loss": 0.6345, "step": 13140 }, { "epoch": 2.84, "learning_rate": 1.629844259326331e-05, "loss": 0.649, "step": 13160 }, { "epoch": 2.84, "learning_rate": 1.5863817457442954e-05, "loss": 0.6409, "step": 13180 }, { "epoch": 2.85, "learning_rate": 1.54291923216226e-05, "loss": 0.63, "step": 13200 }, { "epoch": 2.85, "eval_loss": 0.6678950190544128, "eval_runtime": 50.1908, "eval_samples_per_second": 39.848, "eval_steps_per_second": 0.638, "step": 13200 }, { "epoch": 2.85, "learning_rate": 1.4994567185802244e-05, "loss": 0.6428, "step": 13220 }, { "epoch": 2.86, "learning_rate": 1.4559942049981889e-05, "loss": 0.645, "step": 13240 }, { "epoch": 2.86, "learning_rate": 1.4125316914161534e-05, "loss": 0.6434, "step": 13260 }, { "epoch": 2.87, "learning_rate": 1.369069177834118e-05, "loss": 0.6462, "step": 13280 }, { "epoch": 2.87, "learning_rate": 1.3256066642520825e-05, "loss": 0.6387, "step": 13300 }, { "epoch": 2.87, "learning_rate": 1.2821441506700468e-05, "loss": 0.6311, "step": 13320 }, { "epoch": 2.88, "learning_rate": 1.2386816370880113e-05, "loss": 0.6446, "step": 13340 }, { "epoch": 2.88, "learning_rate": 1.195219123505976e-05, "loss": 0.6426, "step": 13360 }, { "epoch": 2.89, "learning_rate": 1.1517566099239405e-05, "loss": 0.6369, "step": 13380 }, { "epoch": 2.89, "learning_rate": 1.108294096341905e-05, "loss": 0.6467, "step": 13400 }, { "epoch": 2.89, "eval_loss": 0.6676326990127563, "eval_runtime": 50.1589, "eval_samples_per_second": 39.873, "eval_steps_per_second": 0.638, "step": 13400 }, { "epoch": 2.9, "learning_rate": 1.0648315827598697e-05, "loss": 0.6347, "step": 13420 }, { "epoch": 2.9, "learning_rate": 1.021369069177834e-05, "loss": 0.6364, "step": 13440 }, { "epoch": 2.9, "learning_rate": 9.779065555957985e-06, "loss": 0.6309, "step": 13460 }, { "epoch": 2.91, "learning_rate": 9.34444042013763e-06, "loss": 0.6407, "step": 13480 }, { "epoch": 2.91, "learning_rate": 8.909815284317276e-06, "loss": 0.6389, "step": 13500 }, { "epoch": 2.92, "learning_rate": 8.475190148496921e-06, "loss": 0.6378, "step": 13520 }, { "epoch": 2.92, "learning_rate": 8.040565012676566e-06, "loss": 0.6359, "step": 13540 }, { "epoch": 2.93, "learning_rate": 7.60593987685621e-06, "loss": 0.6282, "step": 13560 }, { "epoch": 2.93, "learning_rate": 7.171314741035856e-06, "loss": 0.6409, "step": 13580 }, { "epoch": 2.93, "learning_rate": 6.736689605215501e-06, "loss": 0.6339, "step": 13600 }, { "epoch": 2.93, "eval_loss": 0.6675477027893066, "eval_runtime": 50.3638, "eval_samples_per_second": 39.711, "eval_steps_per_second": 0.635, "step": 13600 }, { "epoch": 2.94, "learning_rate": 6.302064469395146e-06, "loss": 0.6306, "step": 13620 }, { "epoch": 2.94, "learning_rate": 5.867439333574791e-06, "loss": 0.6438, "step": 13640 }, { "epoch": 2.95, "learning_rate": 5.432814197754437e-06, "loss": 0.6372, "step": 13660 }, { "epoch": 2.95, "learning_rate": 4.9981890619340815e-06, "loss": 0.6373, "step": 13680 }, { "epoch": 2.96, "learning_rate": 4.5635639261137265e-06, "loss": 0.6441, "step": 13700 }, { "epoch": 2.96, "learning_rate": 4.128938790293371e-06, "loss": 0.6486, "step": 13720 }, { "epoch": 2.96, "learning_rate": 3.6943136544730164e-06, "loss": 0.6359, "step": 13740 }, { "epoch": 2.97, "learning_rate": 3.259688518652662e-06, "loss": 0.6401, "step": 13760 }, { "epoch": 2.97, "learning_rate": 2.825063382832307e-06, "loss": 0.6493, "step": 13780 }, { "epoch": 2.98, "learning_rate": 2.390438247011952e-06, "loss": 0.6368, "step": 13800 }, { "epoch": 2.98, "eval_loss": 0.6671983599662781, "eval_runtime": 50.179, "eval_samples_per_second": 39.857, "eval_steps_per_second": 0.638, "step": 13800 } ], "max_steps": 13905, "num_train_epochs": 3, "total_flos": 1.7542324274428117e+20, "trial_name": null, "trial_params": null }