{ "best_metric": 0.712183952331543, "best_model_checkpoint": "/mnt/bn/qingyi-bn-lq/llama/saved-alpaca-belle13b/checkpoint-13600", "epoch": 2.9341963322545848, "global_step": 13600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.9999999999999995e-05, "loss": 1.6589, "step": 20 }, { "epoch": 0.01, "learning_rate": 0.00011999999999999999, "loss": 1.4071, "step": 40 }, { "epoch": 0.01, "learning_rate": 0.00017999999999999998, "loss": 1.044, "step": 60 }, { "epoch": 0.02, "learning_rate": 0.00023999999999999998, "loss": 0.9883, "step": 80 }, { "epoch": 0.02, "learning_rate": 0.0003, "loss": 0.9659, "step": 100 }, { "epoch": 0.03, "learning_rate": 0.00029956537486417964, "loss": 0.9505, "step": 120 }, { "epoch": 0.03, "learning_rate": 0.00029913074972835925, "loss": 0.9205, "step": 140 }, { "epoch": 0.03, "learning_rate": 0.0002986961245925389, "loss": 0.9168, "step": 160 }, { "epoch": 0.04, "learning_rate": 0.0002982614994567186, "loss": 0.9117, "step": 180 }, { "epoch": 0.04, "learning_rate": 0.0002978268743208982, "loss": 0.9064, "step": 200 }, { "epoch": 0.04, "eval_loss": 0.9033477306365967, "eval_runtime": 25.3136, "eval_samples_per_second": 79.009, "eval_steps_per_second": 1.264, "step": 200 }, { "epoch": 0.05, "learning_rate": 0.00029739224918507785, "loss": 0.8981, "step": 220 }, { "epoch": 0.05, "learning_rate": 0.0002969576240492575, "loss": 0.8912, "step": 240 }, { "epoch": 0.06, "learning_rate": 0.0002965229989134371, "loss": 0.8875, "step": 260 }, { "epoch": 0.06, "learning_rate": 0.0002960883737776168, "loss": 0.8907, "step": 280 }, { "epoch": 0.06, "learning_rate": 0.00029565374864179645, "loss": 0.8753, "step": 300 }, { "epoch": 0.07, "learning_rate": 0.00029521912350597606, "loss": 0.8782, "step": 320 }, { "epoch": 0.07, "learning_rate": 0.0002947844983701557, "loss": 0.8697, "step": 340 }, { "epoch": 0.08, "learning_rate": 0.0002943498732343354, "loss": 0.8745, "step": 360 }, { "epoch": 0.08, "learning_rate": 0.000293915248098515, "loss": 0.8725, "step": 380 }, { "epoch": 0.09, "learning_rate": 0.00029348062296269466, "loss": 0.8658, "step": 400 }, { "epoch": 0.09, "eval_loss": 0.8655584454536438, "eval_runtime": 25.3343, "eval_samples_per_second": 78.944, "eval_steps_per_second": 1.263, "step": 400 }, { "epoch": 0.09, "learning_rate": 0.0002930459978268743, "loss": 0.8641, "step": 420 }, { "epoch": 0.09, "learning_rate": 0.00029261137269105393, "loss": 0.8509, "step": 440 }, { "epoch": 0.1, "learning_rate": 0.0002921767475552336, "loss": 0.8541, "step": 460 }, { "epoch": 0.1, "learning_rate": 0.00029174212241941326, "loss": 0.8575, "step": 480 }, { "epoch": 0.11, "learning_rate": 0.00029130749728359287, "loss": 0.8482, "step": 500 }, { "epoch": 0.11, "learning_rate": 0.00029087287214777253, "loss": 0.8572, "step": 520 }, { "epoch": 0.12, "learning_rate": 0.0002904382470119522, "loss": 0.8489, "step": 540 }, { "epoch": 0.12, "learning_rate": 0.0002900036218761318, "loss": 0.8585, "step": 560 }, { "epoch": 0.13, "learning_rate": 0.00028956899674031147, "loss": 0.8387, "step": 580 }, { "epoch": 0.13, "learning_rate": 0.00028913437160449113, "loss": 0.8306, "step": 600 }, { "epoch": 0.13, "eval_loss": 0.8434031009674072, "eval_runtime": 25.3211, "eval_samples_per_second": 78.986, "eval_steps_per_second": 1.264, "step": 600 }, { "epoch": 0.13, "learning_rate": 0.00028869974646867074, "loss": 0.8331, "step": 620 }, { "epoch": 0.14, "learning_rate": 0.0002882651213328504, "loss": 0.8447, "step": 640 }, { "epoch": 0.14, "learning_rate": 0.00028783049619703007, "loss": 0.836, "step": 660 }, { "epoch": 0.15, "learning_rate": 0.0002873958710612097, "loss": 0.8436, "step": 680 }, { "epoch": 0.15, "learning_rate": 0.00028696124592538934, "loss": 0.8281, "step": 700 }, { "epoch": 0.16, "learning_rate": 0.000286526620789569, "loss": 0.8378, "step": 720 }, { "epoch": 0.16, "learning_rate": 0.0002860919956537486, "loss": 0.8338, "step": 740 }, { "epoch": 0.16, "learning_rate": 0.0002856573705179283, "loss": 0.8323, "step": 760 }, { "epoch": 0.17, "learning_rate": 0.00028522274538210794, "loss": 0.8153, "step": 780 }, { "epoch": 0.17, "learning_rate": 0.00028478812024628755, "loss": 0.8349, "step": 800 }, { "epoch": 0.17, "eval_loss": 0.8282934427261353, "eval_runtime": 25.4025, "eval_samples_per_second": 78.733, "eval_steps_per_second": 1.26, "step": 800 }, { "epoch": 0.18, "learning_rate": 0.0002843534951104672, "loss": 0.8198, "step": 820 }, { "epoch": 0.18, "learning_rate": 0.0002839188699746469, "loss": 0.8254, "step": 840 }, { "epoch": 0.19, "learning_rate": 0.0002834842448388265, "loss": 0.8165, "step": 860 }, { "epoch": 0.19, "learning_rate": 0.00028304961970300615, "loss": 0.8241, "step": 880 }, { "epoch": 0.19, "learning_rate": 0.0002826149945671858, "loss": 0.814, "step": 900 }, { "epoch": 0.2, "learning_rate": 0.0002821803694313654, "loss": 0.8222, "step": 920 }, { "epoch": 0.2, "learning_rate": 0.0002817457442955451, "loss": 0.825, "step": 940 }, { "epoch": 0.21, "learning_rate": 0.00028131111915972475, "loss": 0.8153, "step": 960 }, { "epoch": 0.21, "learning_rate": 0.00028087649402390436, "loss": 0.8229, "step": 980 }, { "epoch": 0.22, "learning_rate": 0.00028044186888808397, "loss": 0.8129, "step": 1000 }, { "epoch": 0.22, "eval_loss": 0.816320538520813, "eval_runtime": 25.4153, "eval_samples_per_second": 78.693, "eval_steps_per_second": 1.259, "step": 1000 }, { "epoch": 0.22, "learning_rate": 0.00028000724375226363, "loss": 0.8121, "step": 1020 }, { "epoch": 0.22, "learning_rate": 0.0002795726186164433, "loss": 0.8063, "step": 1040 }, { "epoch": 0.23, "learning_rate": 0.0002791379934806229, "loss": 0.8097, "step": 1060 }, { "epoch": 0.23, "learning_rate": 0.00027870336834480257, "loss": 0.8142, "step": 1080 }, { "epoch": 0.24, "learning_rate": 0.00027826874320898223, "loss": 0.8021, "step": 1100 }, { "epoch": 0.24, "learning_rate": 0.00027783411807316184, "loss": 0.8014, "step": 1120 }, { "epoch": 0.25, "learning_rate": 0.0002773994929373415, "loss": 0.8031, "step": 1140 }, { "epoch": 0.25, "learning_rate": 0.00027696486780152117, "loss": 0.8011, "step": 1160 }, { "epoch": 0.25, "learning_rate": 0.0002765302426657008, "loss": 0.7944, "step": 1180 }, { "epoch": 0.26, "learning_rate": 0.00027609561752988044, "loss": 0.8071, "step": 1200 }, { "epoch": 0.26, "eval_loss": 0.8064733147621155, "eval_runtime": 25.3901, "eval_samples_per_second": 78.771, "eval_steps_per_second": 1.26, "step": 1200 }, { "epoch": 0.26, "learning_rate": 0.0002756609923940601, "loss": 0.8025, "step": 1220 }, { "epoch": 0.27, "learning_rate": 0.0002752263672582397, "loss": 0.7954, "step": 1240 }, { "epoch": 0.27, "learning_rate": 0.0002747917421224194, "loss": 0.8013, "step": 1260 }, { "epoch": 0.28, "learning_rate": 0.00027435711698659904, "loss": 0.7967, "step": 1280 }, { "epoch": 0.28, "learning_rate": 0.00027392249185077865, "loss": 0.8132, "step": 1300 }, { "epoch": 0.28, "learning_rate": 0.0002734878667149583, "loss": 0.8017, "step": 1320 }, { "epoch": 0.29, "learning_rate": 0.000273053241579138, "loss": 0.7964, "step": 1340 }, { "epoch": 0.29, "learning_rate": 0.0002726186164433176, "loss": 0.8012, "step": 1360 }, { "epoch": 0.3, "learning_rate": 0.00027218399130749725, "loss": 0.7982, "step": 1380 }, { "epoch": 0.3, "learning_rate": 0.0002717493661716769, "loss": 0.8031, "step": 1400 }, { "epoch": 0.3, "eval_loss": 0.798474133014679, "eval_runtime": 25.432, "eval_samples_per_second": 78.641, "eval_steps_per_second": 1.258, "step": 1400 }, { "epoch": 0.31, "learning_rate": 0.0002713147410358565, "loss": 0.7925, "step": 1420 }, { "epoch": 0.31, "learning_rate": 0.0002708801159000362, "loss": 0.794, "step": 1440 }, { "epoch": 0.31, "learning_rate": 0.00027044549076421585, "loss": 0.804, "step": 1460 }, { "epoch": 0.32, "learning_rate": 0.00027001086562839546, "loss": 0.7942, "step": 1480 }, { "epoch": 0.32, "learning_rate": 0.0002695762404925751, "loss": 0.7872, "step": 1500 }, { "epoch": 0.33, "learning_rate": 0.0002691416153567548, "loss": 0.7962, "step": 1520 }, { "epoch": 0.33, "learning_rate": 0.0002687069902209344, "loss": 0.7898, "step": 1540 }, { "epoch": 0.34, "learning_rate": 0.00026827236508511406, "loss": 0.7886, "step": 1560 }, { "epoch": 0.34, "learning_rate": 0.0002678377399492937, "loss": 0.7904, "step": 1580 }, { "epoch": 0.35, "learning_rate": 0.00026740311481347333, "loss": 0.7892, "step": 1600 }, { "epoch": 0.35, "eval_loss": 0.7912269234657288, "eval_runtime": 25.444, "eval_samples_per_second": 78.604, "eval_steps_per_second": 1.258, "step": 1600 }, { "epoch": 0.35, "learning_rate": 0.000266968489677653, "loss": 0.7897, "step": 1620 }, { "epoch": 0.35, "learning_rate": 0.00026653386454183266, "loss": 0.7927, "step": 1640 }, { "epoch": 0.36, "learning_rate": 0.00026609923940601227, "loss": 0.7829, "step": 1660 }, { "epoch": 0.36, "learning_rate": 0.00026566461427019193, "loss": 0.7788, "step": 1680 }, { "epoch": 0.37, "learning_rate": 0.0002652299891343716, "loss": 0.786, "step": 1700 }, { "epoch": 0.37, "learning_rate": 0.0002647953639985512, "loss": 0.7828, "step": 1720 }, { "epoch": 0.38, "learning_rate": 0.00026436073886273087, "loss": 0.7788, "step": 1740 }, { "epoch": 0.38, "learning_rate": 0.00026392611372691053, "loss": 0.7851, "step": 1760 }, { "epoch": 0.38, "learning_rate": 0.00026349148859109014, "loss": 0.7936, "step": 1780 }, { "epoch": 0.39, "learning_rate": 0.0002630568634552698, "loss": 0.7758, "step": 1800 }, { "epoch": 0.39, "eval_loss": 0.7854430675506592, "eval_runtime": 25.4734, "eval_samples_per_second": 78.513, "eval_steps_per_second": 1.256, "step": 1800 }, { "epoch": 0.39, "learning_rate": 0.00026262223831944947, "loss": 0.787, "step": 1820 }, { "epoch": 0.4, "learning_rate": 0.0002621876131836291, "loss": 0.7779, "step": 1840 }, { "epoch": 0.4, "learning_rate": 0.00026175298804780874, "loss": 0.7792, "step": 1860 }, { "epoch": 0.41, "learning_rate": 0.0002613183629119884, "loss": 0.7728, "step": 1880 }, { "epoch": 0.41, "learning_rate": 0.000260883737776168, "loss": 0.7844, "step": 1900 }, { "epoch": 0.41, "learning_rate": 0.0002604491126403477, "loss": 0.7726, "step": 1920 }, { "epoch": 0.42, "learning_rate": 0.00026001448750452734, "loss": 0.7706, "step": 1940 }, { "epoch": 0.42, "learning_rate": 0.00025957986236870695, "loss": 0.7659, "step": 1960 }, { "epoch": 0.43, "learning_rate": 0.0002591452372328866, "loss": 0.7808, "step": 1980 }, { "epoch": 0.43, "learning_rate": 0.0002587106120970663, "loss": 0.7692, "step": 2000 }, { "epoch": 0.43, "eval_loss": 0.7800412774085999, "eval_runtime": 25.5146, "eval_samples_per_second": 78.387, "eval_steps_per_second": 1.254, "step": 2000 }, { "epoch": 0.44, "learning_rate": 0.0002582759869612459, "loss": 0.7665, "step": 2020 }, { "epoch": 0.44, "learning_rate": 0.00025784136182542555, "loss": 0.7795, "step": 2040 }, { "epoch": 0.44, "learning_rate": 0.0002574067366896052, "loss": 0.7846, "step": 2060 }, { "epoch": 0.45, "learning_rate": 0.0002569721115537848, "loss": 0.7639, "step": 2080 }, { "epoch": 0.45, "learning_rate": 0.0002565374864179645, "loss": 0.7827, "step": 2100 }, { "epoch": 0.46, "learning_rate": 0.00025610286128214415, "loss": 0.7751, "step": 2120 }, { "epoch": 0.46, "learning_rate": 0.00025566823614632376, "loss": 0.776, "step": 2140 }, { "epoch": 0.47, "learning_rate": 0.0002552336110105034, "loss": 0.7773, "step": 2160 }, { "epoch": 0.47, "learning_rate": 0.0002547989858746831, "loss": 0.7757, "step": 2180 }, { "epoch": 0.47, "learning_rate": 0.0002543643607388627, "loss": 0.7769, "step": 2200 }, { "epoch": 0.47, "eval_loss": 0.7759379744529724, "eval_runtime": 25.4789, "eval_samples_per_second": 78.496, "eval_steps_per_second": 1.256, "step": 2200 }, { "epoch": 0.48, "learning_rate": 0.00025392973560304236, "loss": 0.7657, "step": 2220 }, { "epoch": 0.48, "learning_rate": 0.000253495110467222, "loss": 0.7664, "step": 2240 }, { "epoch": 0.49, "learning_rate": 0.00025306048533140163, "loss": 0.7774, "step": 2260 }, { "epoch": 0.49, "learning_rate": 0.0002526258601955813, "loss": 0.7591, "step": 2280 }, { "epoch": 0.5, "learning_rate": 0.00025219123505976096, "loss": 0.7605, "step": 2300 }, { "epoch": 0.5, "learning_rate": 0.00025175660992394057, "loss": 0.7693, "step": 2320 }, { "epoch": 0.5, "learning_rate": 0.00025132198478812023, "loss": 0.7702, "step": 2340 }, { "epoch": 0.51, "learning_rate": 0.0002508873596522999, "loss": 0.7706, "step": 2360 }, { "epoch": 0.51, "learning_rate": 0.0002504527345164795, "loss": 0.7664, "step": 2380 }, { "epoch": 0.52, "learning_rate": 0.00025001810938065917, "loss": 0.76, "step": 2400 }, { "epoch": 0.52, "eval_loss": 0.7723669409751892, "eval_runtime": 25.4827, "eval_samples_per_second": 78.485, "eval_steps_per_second": 1.256, "step": 2400 }, { "epoch": 0.52, "learning_rate": 0.00024958348424483883, "loss": 0.7702, "step": 2420 }, { "epoch": 0.53, "learning_rate": 0.00024914885910901844, "loss": 0.7686, "step": 2440 }, { "epoch": 0.53, "learning_rate": 0.0002487142339731981, "loss": 0.762, "step": 2460 }, { "epoch": 0.54, "learning_rate": 0.00024827960883737777, "loss": 0.7719, "step": 2480 }, { "epoch": 0.54, "learning_rate": 0.0002478449837015574, "loss": 0.7612, "step": 2500 }, { "epoch": 0.54, "learning_rate": 0.00024741035856573704, "loss": 0.7565, "step": 2520 }, { "epoch": 0.55, "learning_rate": 0.0002469757334299167, "loss": 0.7719, "step": 2540 }, { "epoch": 0.55, "learning_rate": 0.0002465411082940963, "loss": 0.7619, "step": 2560 }, { "epoch": 0.56, "learning_rate": 0.000246106483158276, "loss": 0.7607, "step": 2580 }, { "epoch": 0.56, "learning_rate": 0.00024567185802245564, "loss": 0.7564, "step": 2600 }, { "epoch": 0.56, "eval_loss": 0.7678729295730591, "eval_runtime": 25.4455, "eval_samples_per_second": 78.599, "eval_steps_per_second": 1.258, "step": 2600 }, { "epoch": 0.57, "learning_rate": 0.00024523723288663525, "loss": 0.7613, "step": 2620 }, { "epoch": 0.57, "learning_rate": 0.0002448026077508149, "loss": 0.7525, "step": 2640 }, { "epoch": 0.57, "learning_rate": 0.0002443679826149946, "loss": 0.7563, "step": 2660 }, { "epoch": 0.58, "learning_rate": 0.00024393335747917422, "loss": 0.7601, "step": 2680 }, { "epoch": 0.58, "learning_rate": 0.00024349873234335383, "loss": 0.7633, "step": 2700 }, { "epoch": 0.59, "learning_rate": 0.00024306410720753346, "loss": 0.75, "step": 2720 }, { "epoch": 0.59, "learning_rate": 0.0002426294820717131, "loss": 0.7602, "step": 2740 }, { "epoch": 0.6, "learning_rate": 0.00024219485693589276, "loss": 0.7546, "step": 2760 }, { "epoch": 0.6, "learning_rate": 0.0002417602318000724, "loss": 0.7532, "step": 2780 }, { "epoch": 0.6, "learning_rate": 0.00024132560666425203, "loss": 0.7661, "step": 2800 }, { "epoch": 0.6, "eval_loss": 0.7649803757667542, "eval_runtime": 25.4783, "eval_samples_per_second": 78.498, "eval_steps_per_second": 1.256, "step": 2800 }, { "epoch": 0.61, "learning_rate": 0.0002408909815284317, "loss": 0.7587, "step": 2820 }, { "epoch": 0.61, "learning_rate": 0.00024045635639261133, "loss": 0.7543, "step": 2840 }, { "epoch": 0.62, "learning_rate": 0.00024002173125679097, "loss": 0.7672, "step": 2860 }, { "epoch": 0.62, "learning_rate": 0.00023958710612097063, "loss": 0.7623, "step": 2880 }, { "epoch": 0.63, "learning_rate": 0.00023915248098515027, "loss": 0.7487, "step": 2900 }, { "epoch": 0.63, "learning_rate": 0.0002387178558493299, "loss": 0.75, "step": 2920 }, { "epoch": 0.63, "learning_rate": 0.00023828323071350957, "loss": 0.7567, "step": 2940 }, { "epoch": 0.64, "learning_rate": 0.0002378486055776892, "loss": 0.7592, "step": 2960 }, { "epoch": 0.64, "learning_rate": 0.00023741398044186884, "loss": 0.7569, "step": 2980 }, { "epoch": 0.65, "learning_rate": 0.0002369793553060485, "loss": 0.7524, "step": 3000 }, { "epoch": 0.65, "eval_loss": 0.7613279819488525, "eval_runtime": 25.4837, "eval_samples_per_second": 78.482, "eval_steps_per_second": 1.256, "step": 3000 }, { "epoch": 0.65, "learning_rate": 0.00023654473017022814, "loss": 0.7593, "step": 3020 }, { "epoch": 0.66, "learning_rate": 0.00023611010503440778, "loss": 0.7516, "step": 3040 }, { "epoch": 0.66, "learning_rate": 0.00023567547989858744, "loss": 0.7525, "step": 3060 }, { "epoch": 0.66, "learning_rate": 0.00023524085476276708, "loss": 0.7583, "step": 3080 }, { "epoch": 0.67, "learning_rate": 0.00023480622962694672, "loss": 0.7535, "step": 3100 }, { "epoch": 0.67, "learning_rate": 0.00023437160449112638, "loss": 0.7528, "step": 3120 }, { "epoch": 0.68, "learning_rate": 0.00023393697935530602, "loss": 0.7418, "step": 3140 }, { "epoch": 0.68, "learning_rate": 0.00023350235421948565, "loss": 0.7496, "step": 3160 }, { "epoch": 0.69, "learning_rate": 0.00023306772908366532, "loss": 0.7537, "step": 3180 }, { "epoch": 0.69, "learning_rate": 0.00023263310394784495, "loss": 0.7569, "step": 3200 }, { "epoch": 0.69, "eval_loss": 0.7581906914710999, "eval_runtime": 25.4588, "eval_samples_per_second": 78.558, "eval_steps_per_second": 1.257, "step": 3200 }, { "epoch": 0.69, "learning_rate": 0.0002321984788120246, "loss": 0.7465, "step": 3220 }, { "epoch": 0.7, "learning_rate": 0.00023176385367620425, "loss": 0.7367, "step": 3240 }, { "epoch": 0.7, "learning_rate": 0.0002313292285403839, "loss": 0.7425, "step": 3260 }, { "epoch": 0.71, "learning_rate": 0.00023089460340456353, "loss": 0.7637, "step": 3280 }, { "epoch": 0.71, "learning_rate": 0.0002304599782687432, "loss": 0.7574, "step": 3300 }, { "epoch": 0.72, "learning_rate": 0.00023002535313292283, "loss": 0.7448, "step": 3320 }, { "epoch": 0.72, "learning_rate": 0.00022959072799710246, "loss": 0.7595, "step": 3340 }, { "epoch": 0.72, "learning_rate": 0.00022915610286128213, "loss": 0.7465, "step": 3360 }, { "epoch": 0.73, "learning_rate": 0.00022872147772546176, "loss": 0.7532, "step": 3380 }, { "epoch": 0.73, "learning_rate": 0.0002282868525896414, "loss": 0.7466, "step": 3400 }, { "epoch": 0.73, "eval_loss": 0.7559078931808472, "eval_runtime": 25.464, "eval_samples_per_second": 78.542, "eval_steps_per_second": 1.257, "step": 3400 }, { "epoch": 0.74, "learning_rate": 0.00022785222745382106, "loss": 0.753, "step": 3420 }, { "epoch": 0.74, "learning_rate": 0.0002274176023180007, "loss": 0.7459, "step": 3440 }, { "epoch": 0.75, "learning_rate": 0.00022698297718218034, "loss": 0.7519, "step": 3460 }, { "epoch": 0.75, "learning_rate": 0.00022654835204636, "loss": 0.7451, "step": 3480 }, { "epoch": 0.76, "learning_rate": 0.00022611372691053964, "loss": 0.7468, "step": 3500 }, { "epoch": 0.76, "learning_rate": 0.00022567910177471927, "loss": 0.7491, "step": 3520 }, { "epoch": 0.76, "learning_rate": 0.00022524447663889894, "loss": 0.7524, "step": 3540 }, { "epoch": 0.77, "learning_rate": 0.00022480985150307857, "loss": 0.7484, "step": 3560 }, { "epoch": 0.77, "learning_rate": 0.0002243752263672582, "loss": 0.7484, "step": 3580 }, { "epoch": 0.78, "learning_rate": 0.00022394060123143787, "loss": 0.7529, "step": 3600 }, { "epoch": 0.78, "eval_loss": 0.7531791925430298, "eval_runtime": 25.4572, "eval_samples_per_second": 78.563, "eval_steps_per_second": 1.257, "step": 3600 }, { "epoch": 0.78, "learning_rate": 0.0002235059760956175, "loss": 0.7475, "step": 3620 }, { "epoch": 0.79, "learning_rate": 0.00022307135095979715, "loss": 0.7518, "step": 3640 }, { "epoch": 0.79, "learning_rate": 0.0002226367258239768, "loss": 0.751, "step": 3660 }, { "epoch": 0.79, "learning_rate": 0.00022220210068815645, "loss": 0.7402, "step": 3680 }, { "epoch": 0.8, "learning_rate": 0.00022176747555233608, "loss": 0.755, "step": 3700 }, { "epoch": 0.8, "learning_rate": 0.00022133285041651575, "loss": 0.7441, "step": 3720 }, { "epoch": 0.81, "learning_rate": 0.00022089822528069538, "loss": 0.746, "step": 3740 }, { "epoch": 0.81, "learning_rate": 0.00022046360014487502, "loss": 0.7441, "step": 3760 }, { "epoch": 0.82, "learning_rate": 0.00022002897500905468, "loss": 0.7475, "step": 3780 }, { "epoch": 0.82, "learning_rate": 0.00021959434987323432, "loss": 0.7458, "step": 3800 }, { "epoch": 0.82, "eval_loss": 0.7513870596885681, "eval_runtime": 25.4906, "eval_samples_per_second": 78.46, "eval_steps_per_second": 1.255, "step": 3800 }, { "epoch": 0.82, "learning_rate": 0.00021915972473741396, "loss": 0.7436, "step": 3820 }, { "epoch": 0.83, "learning_rate": 0.00021872509960159362, "loss": 0.7451, "step": 3840 }, { "epoch": 0.83, "learning_rate": 0.00021829047446577326, "loss": 0.7475, "step": 3860 }, { "epoch": 0.84, "learning_rate": 0.0002178558493299529, "loss": 0.7424, "step": 3880 }, { "epoch": 0.84, "learning_rate": 0.00021742122419413256, "loss": 0.7503, "step": 3900 }, { "epoch": 0.85, "learning_rate": 0.0002169865990583122, "loss": 0.7334, "step": 3920 }, { "epoch": 0.85, "learning_rate": 0.00021655197392249183, "loss": 0.7436, "step": 3940 }, { "epoch": 0.85, "learning_rate": 0.0002161173487866715, "loss": 0.7453, "step": 3960 }, { "epoch": 0.86, "learning_rate": 0.00021568272365085113, "loss": 0.7424, "step": 3980 }, { "epoch": 0.86, "learning_rate": 0.00021524809851503076, "loss": 0.7509, "step": 4000 }, { "epoch": 0.86, "eval_loss": 0.7488968968391418, "eval_runtime": 25.492, "eval_samples_per_second": 78.456, "eval_steps_per_second": 1.255, "step": 4000 }, { "epoch": 0.87, "learning_rate": 0.00021481347337921043, "loss": 0.7445, "step": 4020 }, { "epoch": 0.87, "learning_rate": 0.00021437884824339006, "loss": 0.74, "step": 4040 }, { "epoch": 0.88, "learning_rate": 0.0002139442231075697, "loss": 0.7362, "step": 4060 }, { "epoch": 0.88, "learning_rate": 0.00021350959797174936, "loss": 0.7409, "step": 4080 }, { "epoch": 0.88, "learning_rate": 0.000213074972835929, "loss": 0.7315, "step": 4100 }, { "epoch": 0.89, "learning_rate": 0.00021264034770010864, "loss": 0.7488, "step": 4120 }, { "epoch": 0.89, "learning_rate": 0.0002122057225642883, "loss": 0.7375, "step": 4140 }, { "epoch": 0.9, "learning_rate": 0.00021177109742846794, "loss": 0.7481, "step": 4160 }, { "epoch": 0.9, "learning_rate": 0.00021133647229264757, "loss": 0.7524, "step": 4180 }, { "epoch": 0.91, "learning_rate": 0.00021092357841361823, "loss": 0.7403, "step": 4200 }, { "epoch": 0.91, "eval_loss": 0.7469983100891113, "eval_runtime": 25.4847, "eval_samples_per_second": 78.479, "eval_steps_per_second": 1.256, "step": 4200 }, { "epoch": 0.91, "learning_rate": 0.00021048895327779787, "loss": 0.7394, "step": 4220 }, { "epoch": 0.91, "learning_rate": 0.0002100543281419775, "loss": 0.7405, "step": 4240 }, { "epoch": 0.92, "learning_rate": 0.00020961970300615717, "loss": 0.7534, "step": 4260 }, { "epoch": 0.92, "learning_rate": 0.0002091850778703368, "loss": 0.7412, "step": 4280 }, { "epoch": 0.93, "learning_rate": 0.00020875045273451644, "loss": 0.7393, "step": 4300 }, { "epoch": 0.93, "learning_rate": 0.0002083158275986961, "loss": 0.7289, "step": 4320 }, { "epoch": 0.94, "learning_rate": 0.00020788120246287574, "loss": 0.7342, "step": 4340 }, { "epoch": 0.94, "learning_rate": 0.00020744657732705538, "loss": 0.7427, "step": 4360 }, { "epoch": 0.94, "learning_rate": 0.00020701195219123504, "loss": 0.7386, "step": 4380 }, { "epoch": 0.95, "learning_rate": 0.00020657732705541468, "loss": 0.7374, "step": 4400 }, { "epoch": 0.95, "eval_loss": 0.7451291680335999, "eval_runtime": 25.461, "eval_samples_per_second": 78.552, "eval_steps_per_second": 1.257, "step": 4400 }, { "epoch": 0.95, "learning_rate": 0.0002061427019195943, "loss": 0.7364, "step": 4420 }, { "epoch": 0.96, "learning_rate": 0.00020570807678377398, "loss": 0.7377, "step": 4440 }, { "epoch": 0.96, "learning_rate": 0.0002052734516479536, "loss": 0.7391, "step": 4460 }, { "epoch": 0.97, "learning_rate": 0.00020483882651213325, "loss": 0.731, "step": 4480 }, { "epoch": 0.97, "learning_rate": 0.0002044042013763129, "loss": 0.735, "step": 4500 }, { "epoch": 0.98, "learning_rate": 0.00020396957624049255, "loss": 0.7344, "step": 4520 }, { "epoch": 0.98, "learning_rate": 0.00020353495110467219, "loss": 0.7355, "step": 4540 }, { "epoch": 0.98, "learning_rate": 0.00020310032596885185, "loss": 0.7357, "step": 4560 }, { "epoch": 0.99, "learning_rate": 0.00020266570083303149, "loss": 0.7377, "step": 4580 }, { "epoch": 0.99, "learning_rate": 0.00020223107569721112, "loss": 0.7438, "step": 4600 }, { "epoch": 0.99, "eval_loss": 0.7437875270843506, "eval_runtime": 25.5255, "eval_samples_per_second": 78.353, "eval_steps_per_second": 1.254, "step": 4600 }, { "epoch": 1.0, "learning_rate": 0.00020179645056139079, "loss": 0.7343, "step": 4620 }, { "epoch": 1.0, "learning_rate": 0.00020136182542557042, "loss": 0.7473, "step": 4640 }, { "epoch": 1.01, "learning_rate": 0.00020092720028975006, "loss": 0.7305, "step": 4660 }, { "epoch": 1.01, "learning_rate": 0.00020049257515392972, "loss": 0.7284, "step": 4680 }, { "epoch": 1.01, "learning_rate": 0.00020005795001810936, "loss": 0.7335, "step": 4700 }, { "epoch": 1.02, "learning_rate": 0.000199623324882289, "loss": 0.7282, "step": 4720 }, { "epoch": 1.02, "learning_rate": 0.00019918869974646866, "loss": 0.7337, "step": 4740 }, { "epoch": 1.03, "learning_rate": 0.0001987540746106483, "loss": 0.7195, "step": 4760 }, { "epoch": 1.03, "learning_rate": 0.00019831944947482793, "loss": 0.7327, "step": 4780 }, { "epoch": 1.04, "learning_rate": 0.0001978848243390076, "loss": 0.7259, "step": 4800 }, { "epoch": 1.04, "eval_loss": 0.7413464188575745, "eval_runtime": 25.4959, "eval_samples_per_second": 78.444, "eval_steps_per_second": 1.255, "step": 4800 }, { "epoch": 1.04, "learning_rate": 0.00019745019920318723, "loss": 0.7263, "step": 4820 }, { "epoch": 1.04, "learning_rate": 0.00019701557406736687, "loss": 0.7341, "step": 4840 }, { "epoch": 1.05, "learning_rate": 0.00019658094893154653, "loss": 0.7406, "step": 4860 }, { "epoch": 1.05, "learning_rate": 0.00019614632379572617, "loss": 0.7309, "step": 4880 }, { "epoch": 1.06, "learning_rate": 0.0001957116986599058, "loss": 0.7274, "step": 4900 }, { "epoch": 1.06, "learning_rate": 0.00019527707352408547, "loss": 0.7241, "step": 4920 }, { "epoch": 1.07, "learning_rate": 0.0001948424483882651, "loss": 0.7368, "step": 4940 }, { "epoch": 1.07, "learning_rate": 0.00019440782325244474, "loss": 0.7445, "step": 4960 }, { "epoch": 1.07, "learning_rate": 0.0001939731981166244, "loss": 0.7347, "step": 4980 }, { "epoch": 1.08, "learning_rate": 0.00019353857298080404, "loss": 0.7436, "step": 5000 }, { "epoch": 1.08, "eval_loss": 0.7399871945381165, "eval_runtime": 25.5032, "eval_samples_per_second": 78.422, "eval_steps_per_second": 1.255, "step": 5000 }, { "epoch": 1.08, "learning_rate": 0.00019310394784498368, "loss": 0.7248, "step": 5020 }, { "epoch": 1.09, "learning_rate": 0.00019266932270916334, "loss": 0.7374, "step": 5040 }, { "epoch": 1.09, "learning_rate": 0.00019223469757334298, "loss": 0.7187, "step": 5060 }, { "epoch": 1.1, "learning_rate": 0.00019180007243752261, "loss": 0.7381, "step": 5080 }, { "epoch": 1.1, "learning_rate": 0.00019136544730170228, "loss": 0.7389, "step": 5100 }, { "epoch": 1.1, "learning_rate": 0.00019093082216588191, "loss": 0.7343, "step": 5120 }, { "epoch": 1.11, "learning_rate": 0.00019049619703006155, "loss": 0.7323, "step": 5140 }, { "epoch": 1.11, "learning_rate": 0.00019006157189424121, "loss": 0.723, "step": 5160 }, { "epoch": 1.12, "learning_rate": 0.00018962694675842085, "loss": 0.7236, "step": 5180 }, { "epoch": 1.12, "learning_rate": 0.0001891923216226005, "loss": 0.7399, "step": 5200 }, { "epoch": 1.12, "eval_loss": 0.7393975257873535, "eval_runtime": 25.6137, "eval_samples_per_second": 78.083, "eval_steps_per_second": 1.249, "step": 5200 }, { "epoch": 1.13, "learning_rate": 0.00018875769648678015, "loss": 0.7373, "step": 5220 }, { "epoch": 1.13, "learning_rate": 0.0001883230713509598, "loss": 0.7257, "step": 5240 }, { "epoch": 1.13, "learning_rate": 0.00018788844621513942, "loss": 0.7261, "step": 5260 }, { "epoch": 1.14, "learning_rate": 0.0001874538210793191, "loss": 0.7302, "step": 5280 }, { "epoch": 1.14, "learning_rate": 0.00018701919594349872, "loss": 0.7337, "step": 5300 }, { "epoch": 1.15, "learning_rate": 0.00018658457080767836, "loss": 0.7237, "step": 5320 }, { "epoch": 1.15, "learning_rate": 0.00018614994567185802, "loss": 0.7238, "step": 5340 }, { "epoch": 1.16, "learning_rate": 0.00018571532053603766, "loss": 0.7287, "step": 5360 }, { "epoch": 1.16, "learning_rate": 0.0001852806954002173, "loss": 0.7237, "step": 5380 }, { "epoch": 1.17, "learning_rate": 0.00018484607026439696, "loss": 0.7256, "step": 5400 }, { "epoch": 1.17, "eval_loss": 0.7377527952194214, "eval_runtime": 25.4964, "eval_samples_per_second": 78.442, "eval_steps_per_second": 1.255, "step": 5400 }, { "epoch": 1.17, "learning_rate": 0.0001844114451285766, "loss": 0.7279, "step": 5420 }, { "epoch": 1.17, "learning_rate": 0.00018397681999275623, "loss": 0.7226, "step": 5440 }, { "epoch": 1.18, "learning_rate": 0.0001835421948569359, "loss": 0.7167, "step": 5460 }, { "epoch": 1.18, "learning_rate": 0.00018310756972111553, "loss": 0.7268, "step": 5480 }, { "epoch": 1.19, "learning_rate": 0.00018267294458529517, "loss": 0.7398, "step": 5500 }, { "epoch": 1.19, "learning_rate": 0.00018223831944947483, "loss": 0.7331, "step": 5520 }, { "epoch": 1.2, "learning_rate": 0.00018180369431365447, "loss": 0.7372, "step": 5540 }, { "epoch": 1.2, "learning_rate": 0.0001813690691778341, "loss": 0.7321, "step": 5560 }, { "epoch": 1.2, "learning_rate": 0.00018093444404201377, "loss": 0.7346, "step": 5580 }, { "epoch": 1.21, "learning_rate": 0.0001804998189061934, "loss": 0.722, "step": 5600 }, { "epoch": 1.21, "eval_loss": 0.7368175983428955, "eval_runtime": 25.5045, "eval_samples_per_second": 78.417, "eval_steps_per_second": 1.255, "step": 5600 }, { "epoch": 1.21, "learning_rate": 0.00018006519377037304, "loss": 0.7279, "step": 5620 }, { "epoch": 1.22, "learning_rate": 0.0001796305686345527, "loss": 0.72, "step": 5640 }, { "epoch": 1.22, "learning_rate": 0.00017919594349873234, "loss": 0.7295, "step": 5660 }, { "epoch": 1.23, "learning_rate": 0.00017876131836291198, "loss": 0.7245, "step": 5680 }, { "epoch": 1.23, "learning_rate": 0.00017832669322709164, "loss": 0.7418, "step": 5700 }, { "epoch": 1.23, "learning_rate": 0.00017789206809127128, "loss": 0.7317, "step": 5720 }, { "epoch": 1.24, "learning_rate": 0.00017745744295545092, "loss": 0.7303, "step": 5740 }, { "epoch": 1.24, "learning_rate": 0.00017702281781963058, "loss": 0.7332, "step": 5760 }, { "epoch": 1.25, "learning_rate": 0.00017658819268381022, "loss": 0.7202, "step": 5780 }, { "epoch": 1.25, "learning_rate": 0.00017615356754798983, "loss": 0.7238, "step": 5800 }, { "epoch": 1.25, "eval_loss": 0.7348505854606628, "eval_runtime": 25.509, "eval_samples_per_second": 78.404, "eval_steps_per_second": 1.254, "step": 5800 }, { "epoch": 1.26, "learning_rate": 0.00017571894241216946, "loss": 0.724, "step": 5820 }, { "epoch": 1.26, "learning_rate": 0.00017528431727634913, "loss": 0.7258, "step": 5840 }, { "epoch": 1.26, "learning_rate": 0.00017484969214052876, "loss": 0.7217, "step": 5860 }, { "epoch": 1.27, "learning_rate": 0.0001744150670047084, "loss": 0.7209, "step": 5880 }, { "epoch": 1.27, "learning_rate": 0.00017398044186888806, "loss": 0.7276, "step": 5900 }, { "epoch": 1.28, "learning_rate": 0.0001735458167330677, "loss": 0.7287, "step": 5920 }, { "epoch": 1.28, "learning_rate": 0.00017311119159724733, "loss": 0.7244, "step": 5940 }, { "epoch": 1.29, "learning_rate": 0.000172676566461427, "loss": 0.7247, "step": 5960 }, { "epoch": 1.29, "learning_rate": 0.00017224194132560663, "loss": 0.7191, "step": 5980 }, { "epoch": 1.29, "learning_rate": 0.00017180731618978627, "loss": 0.7208, "step": 6000 }, { "epoch": 1.29, "eval_loss": 0.7340711951255798, "eval_runtime": 25.4669, "eval_samples_per_second": 78.533, "eval_steps_per_second": 1.257, "step": 6000 }, { "epoch": 1.3, "learning_rate": 0.00017137269105396593, "loss": 0.7285, "step": 6020 }, { "epoch": 1.3, "learning_rate": 0.00017093806591814557, "loss": 0.7294, "step": 6040 }, { "epoch": 1.31, "learning_rate": 0.0001705034407823252, "loss": 0.7365, "step": 6060 }, { "epoch": 1.31, "learning_rate": 0.00017006881564650487, "loss": 0.7149, "step": 6080 }, { "epoch": 1.32, "learning_rate": 0.0001696341905106845, "loss": 0.7229, "step": 6100 }, { "epoch": 1.32, "learning_rate": 0.00016919956537486414, "loss": 0.7253, "step": 6120 }, { "epoch": 1.32, "learning_rate": 0.0001687649402390438, "loss": 0.7188, "step": 6140 }, { "epoch": 1.33, "learning_rate": 0.00016833031510322344, "loss": 0.7308, "step": 6160 }, { "epoch": 1.33, "learning_rate": 0.00016789568996740308, "loss": 0.7186, "step": 6180 }, { "epoch": 1.34, "learning_rate": 0.00016746106483158274, "loss": 0.7121, "step": 6200 }, { "epoch": 1.34, "eval_loss": 0.7324739694595337, "eval_runtime": 25.5, "eval_samples_per_second": 78.431, "eval_steps_per_second": 1.255, "step": 6200 }, { "epoch": 1.34, "learning_rate": 0.00016702643969576238, "loss": 0.7286, "step": 6220 }, { "epoch": 1.35, "learning_rate": 0.00016659181455994202, "loss": 0.7246, "step": 6240 }, { "epoch": 1.35, "learning_rate": 0.00016615718942412168, "loss": 0.7234, "step": 6260 }, { "epoch": 1.35, "learning_rate": 0.00016572256428830132, "loss": 0.7245, "step": 6280 }, { "epoch": 1.36, "learning_rate": 0.00016528793915248095, "loss": 0.7252, "step": 6300 }, { "epoch": 1.36, "learning_rate": 0.00016485331401666062, "loss": 0.7259, "step": 6320 }, { "epoch": 1.37, "learning_rate": 0.00016441868888084025, "loss": 0.7173, "step": 6340 }, { "epoch": 1.37, "learning_rate": 0.0001639840637450199, "loss": 0.7222, "step": 6360 }, { "epoch": 1.38, "learning_rate": 0.00016354943860919955, "loss": 0.7113, "step": 6380 }, { "epoch": 1.38, "learning_rate": 0.0001631148134733792, "loss": 0.72, "step": 6400 }, { "epoch": 1.38, "eval_loss": 0.7319995164871216, "eval_runtime": 25.5112, "eval_samples_per_second": 78.397, "eval_steps_per_second": 1.254, "step": 6400 }, { "epoch": 1.39, "learning_rate": 0.00016268018833755883, "loss": 0.7333, "step": 6420 }, { "epoch": 1.39, "learning_rate": 0.0001622455632017385, "loss": 0.7208, "step": 6440 }, { "epoch": 1.39, "learning_rate": 0.00016181093806591813, "loss": 0.7161, "step": 6460 }, { "epoch": 1.4, "learning_rate": 0.00016137631293009776, "loss": 0.7171, "step": 6480 }, { "epoch": 1.4, "learning_rate": 0.00016094168779427743, "loss": 0.7297, "step": 6500 }, { "epoch": 1.41, "learning_rate": 0.00016050706265845706, "loss": 0.7156, "step": 6520 }, { "epoch": 1.41, "learning_rate": 0.0001600724375226367, "loss": 0.7175, "step": 6540 }, { "epoch": 1.42, "learning_rate": 0.00015963781238681636, "loss": 0.7152, "step": 6560 }, { "epoch": 1.42, "learning_rate": 0.000159203187250996, "loss": 0.7282, "step": 6580 }, { "epoch": 1.42, "learning_rate": 0.00015876856211517564, "loss": 0.722, "step": 6600 }, { "epoch": 1.42, "eval_loss": 0.7307416796684265, "eval_runtime": 25.4967, "eval_samples_per_second": 78.442, "eval_steps_per_second": 1.255, "step": 6600 }, { "epoch": 1.43, "learning_rate": 0.0001583339369793553, "loss": 0.7274, "step": 6620 }, { "epoch": 1.43, "learning_rate": 0.00015789931184353494, "loss": 0.7313, "step": 6640 }, { "epoch": 1.44, "learning_rate": 0.00015746468670771457, "loss": 0.7209, "step": 6660 }, { "epoch": 1.44, "learning_rate": 0.00015703006157189424, "loss": 0.7202, "step": 6680 }, { "epoch": 1.45, "learning_rate": 0.00015659543643607387, "loss": 0.7264, "step": 6700 }, { "epoch": 1.45, "learning_rate": 0.0001561608113002535, "loss": 0.7226, "step": 6720 }, { "epoch": 1.45, "learning_rate": 0.00015572618616443317, "loss": 0.711, "step": 6740 }, { "epoch": 1.46, "learning_rate": 0.0001552915610286128, "loss": 0.7216, "step": 6760 }, { "epoch": 1.46, "learning_rate": 0.00015485693589279245, "loss": 0.7184, "step": 6780 }, { "epoch": 1.47, "learning_rate": 0.0001544223107569721, "loss": 0.7216, "step": 6800 }, { "epoch": 1.47, "eval_loss": 0.7297094464302063, "eval_runtime": 25.4826, "eval_samples_per_second": 78.485, "eval_steps_per_second": 1.256, "step": 6800 }, { "epoch": 1.47, "learning_rate": 0.00015398768562115175, "loss": 0.7203, "step": 6820 }, { "epoch": 1.48, "learning_rate": 0.00015355306048533138, "loss": 0.7184, "step": 6840 }, { "epoch": 1.48, "learning_rate": 0.00015311843534951105, "loss": 0.7183, "step": 6860 }, { "epoch": 1.48, "learning_rate": 0.00015268381021369068, "loss": 0.7267, "step": 6880 }, { "epoch": 1.49, "learning_rate": 0.00015224918507787032, "loss": 0.7299, "step": 6900 }, { "epoch": 1.49, "learning_rate": 0.00015181455994204998, "loss": 0.719, "step": 6920 }, { "epoch": 1.5, "learning_rate": 0.00015137993480622962, "loss": 0.7229, "step": 6940 }, { "epoch": 1.5, "learning_rate": 0.00015094530967040926, "loss": 0.7231, "step": 6960 }, { "epoch": 1.51, "learning_rate": 0.00015051068453458892, "loss": 0.7279, "step": 6980 }, { "epoch": 1.51, "learning_rate": 0.00015007605939876856, "loss": 0.7252, "step": 7000 }, { "epoch": 1.51, "eval_loss": 0.7288112640380859, "eval_runtime": 25.4887, "eval_samples_per_second": 78.466, "eval_steps_per_second": 1.255, "step": 7000 }, { "epoch": 1.51, "learning_rate": 0.0001496414342629482, "loss": 0.7148, "step": 7020 }, { "epoch": 1.52, "learning_rate": 0.00014920680912712786, "loss": 0.7147, "step": 7040 }, { "epoch": 1.52, "learning_rate": 0.0001487721839913075, "loss": 0.7209, "step": 7060 }, { "epoch": 1.53, "learning_rate": 0.00014833755885548713, "loss": 0.724, "step": 7080 }, { "epoch": 1.53, "learning_rate": 0.00014790293371966676, "loss": 0.7256, "step": 7100 }, { "epoch": 1.54, "learning_rate": 0.0001474683085838464, "loss": 0.7246, "step": 7120 }, { "epoch": 1.54, "learning_rate": 0.00014703368344802606, "loss": 0.7103, "step": 7140 }, { "epoch": 1.54, "learning_rate": 0.0001465990583122057, "loss": 0.7223, "step": 7160 }, { "epoch": 1.55, "learning_rate": 0.00014616443317638534, "loss": 0.7149, "step": 7180 }, { "epoch": 1.55, "learning_rate": 0.000145729808040565, "loss": 0.7214, "step": 7200 }, { "epoch": 1.55, "eval_loss": 0.7280930876731873, "eval_runtime": 25.4883, "eval_samples_per_second": 78.467, "eval_steps_per_second": 1.255, "step": 7200 }, { "epoch": 1.56, "learning_rate": 0.00014529518290474464, "loss": 0.7118, "step": 7220 }, { "epoch": 1.56, "learning_rate": 0.00014486055776892427, "loss": 0.7171, "step": 7240 }, { "epoch": 1.57, "learning_rate": 0.00014442593263310394, "loss": 0.7191, "step": 7260 }, { "epoch": 1.57, "learning_rate": 0.00014399130749728357, "loss": 0.7155, "step": 7280 }, { "epoch": 1.57, "learning_rate": 0.0001435566823614632, "loss": 0.7198, "step": 7300 }, { "epoch": 1.58, "learning_rate": 0.00014312205722564287, "loss": 0.7188, "step": 7320 }, { "epoch": 1.58, "learning_rate": 0.0001426874320898225, "loss": 0.7236, "step": 7340 }, { "epoch": 1.59, "learning_rate": 0.00014225280695400215, "loss": 0.712, "step": 7360 }, { "epoch": 1.59, "learning_rate": 0.0001418181818181818, "loss": 0.7181, "step": 7380 }, { "epoch": 1.6, "learning_rate": 0.00014138355668236145, "loss": 0.7198, "step": 7400 }, { "epoch": 1.6, "eval_loss": 0.7276077270507812, "eval_runtime": 25.4843, "eval_samples_per_second": 78.48, "eval_steps_per_second": 1.256, "step": 7400 }, { "epoch": 1.6, "learning_rate": 0.00014094893154654108, "loss": 0.7187, "step": 7420 }, { "epoch": 1.61, "learning_rate": 0.00014051430641072075, "loss": 0.7153, "step": 7440 }, { "epoch": 1.61, "learning_rate": 0.00014007968127490038, "loss": 0.7208, "step": 7460 }, { "epoch": 1.61, "learning_rate": 0.00013964505613908002, "loss": 0.7153, "step": 7480 }, { "epoch": 1.62, "learning_rate": 0.00013921043100325968, "loss": 0.7207, "step": 7500 }, { "epoch": 1.62, "learning_rate": 0.00013877580586743932, "loss": 0.7167, "step": 7520 }, { "epoch": 1.63, "learning_rate": 0.00013834118073161896, "loss": 0.7183, "step": 7540 }, { "epoch": 1.63, "learning_rate": 0.00013792828685258964, "loss": 0.7196, "step": 7560 }, { "epoch": 1.64, "learning_rate": 0.00013749366171676928, "loss": 0.7233, "step": 7580 }, { "epoch": 1.64, "learning_rate": 0.00013705903658094894, "loss": 0.7237, "step": 7600 }, { "epoch": 1.64, "eval_loss": 0.7260885238647461, "eval_runtime": 25.503, "eval_samples_per_second": 78.422, "eval_steps_per_second": 1.255, "step": 7600 }, { "epoch": 1.64, "learning_rate": 0.00013662441144512855, "loss": 0.72, "step": 7620 }, { "epoch": 1.65, "learning_rate": 0.0001361897863093082, "loss": 0.7094, "step": 7640 }, { "epoch": 1.65, "learning_rate": 0.00013575516117348785, "loss": 0.7111, "step": 7660 }, { "epoch": 1.66, "learning_rate": 0.00013532053603766749, "loss": 0.7182, "step": 7680 }, { "epoch": 1.66, "learning_rate": 0.00013488591090184715, "loss": 0.7182, "step": 7700 }, { "epoch": 1.67, "learning_rate": 0.00013445128576602679, "loss": 0.7183, "step": 7720 }, { "epoch": 1.67, "learning_rate": 0.00013401666063020642, "loss": 0.7112, "step": 7740 }, { "epoch": 1.67, "learning_rate": 0.00013358203549438609, "loss": 0.7183, "step": 7760 }, { "epoch": 1.68, "learning_rate": 0.00013314741035856572, "loss": 0.7152, "step": 7780 }, { "epoch": 1.68, "learning_rate": 0.00013271278522274536, "loss": 0.7233, "step": 7800 }, { "epoch": 1.68, "eval_loss": 0.7252987027168274, "eval_runtime": 25.5066, "eval_samples_per_second": 78.411, "eval_steps_per_second": 1.255, "step": 7800 }, { "epoch": 1.69, "learning_rate": 0.00013227816008692502, "loss": 0.7124, "step": 7820 }, { "epoch": 1.69, "learning_rate": 0.00013184353495110466, "loss": 0.7109, "step": 7840 }, { "epoch": 1.7, "learning_rate": 0.0001314089098152843, "loss": 0.7132, "step": 7860 }, { "epoch": 1.7, "learning_rate": 0.00013097428467946396, "loss": 0.7157, "step": 7880 }, { "epoch": 1.7, "learning_rate": 0.0001305396595436436, "loss": 0.7237, "step": 7900 }, { "epoch": 1.71, "learning_rate": 0.00013010503440782323, "loss": 0.7176, "step": 7920 }, { "epoch": 1.71, "learning_rate": 0.0001296704092720029, "loss": 0.7199, "step": 7940 }, { "epoch": 1.72, "learning_rate": 0.00012923578413618253, "loss": 0.7119, "step": 7960 }, { "epoch": 1.72, "learning_rate": 0.00012880115900036217, "loss": 0.717, "step": 7980 }, { "epoch": 1.73, "learning_rate": 0.00012836653386454183, "loss": 0.7155, "step": 8000 }, { "epoch": 1.73, "eval_loss": 0.7248360514640808, "eval_runtime": 25.5301, "eval_samples_per_second": 78.339, "eval_steps_per_second": 1.253, "step": 8000 }, { "epoch": 1.73, "learning_rate": 0.00012793190872872147, "loss": 0.7085, "step": 8020 }, { "epoch": 1.73, "learning_rate": 0.0001274972835929011, "loss": 0.7174, "step": 8040 }, { "epoch": 1.74, "learning_rate": 0.00012706265845708077, "loss": 0.7224, "step": 8060 }, { "epoch": 1.74, "learning_rate": 0.0001266280333212604, "loss": 0.7169, "step": 8080 }, { "epoch": 1.75, "learning_rate": 0.00012619340818544004, "loss": 0.7191, "step": 8100 }, { "epoch": 1.75, "learning_rate": 0.0001257587830496197, "loss": 0.7179, "step": 8120 }, { "epoch": 1.76, "learning_rate": 0.00012532415791379934, "loss": 0.7208, "step": 8140 }, { "epoch": 1.76, "learning_rate": 0.00012488953277797898, "loss": 0.7168, "step": 8160 }, { "epoch": 1.76, "learning_rate": 0.00012445490764215864, "loss": 0.7101, "step": 8180 }, { "epoch": 1.77, "learning_rate": 0.00012402028250633828, "loss": 0.7167, "step": 8200 }, { "epoch": 1.77, "eval_loss": 0.7242170572280884, "eval_runtime": 25.4873, "eval_samples_per_second": 78.47, "eval_steps_per_second": 1.256, "step": 8200 }, { "epoch": 1.77, "learning_rate": 0.00012358565737051791, "loss": 0.7062, "step": 8220 }, { "epoch": 1.78, "learning_rate": 0.00012315103223469758, "loss": 0.7177, "step": 8240 }, { "epoch": 1.78, "learning_rate": 0.00012271640709887721, "loss": 0.7035, "step": 8260 }, { "epoch": 1.79, "learning_rate": 0.00012228178196305685, "loss": 0.7157, "step": 8280 }, { "epoch": 1.79, "learning_rate": 0.0001218471568272365, "loss": 0.7196, "step": 8300 }, { "epoch": 1.8, "learning_rate": 0.00012141253169141615, "loss": 0.7105, "step": 8320 }, { "epoch": 1.8, "learning_rate": 0.00012097790655559579, "loss": 0.7105, "step": 8340 }, { "epoch": 1.8, "learning_rate": 0.00012054328141977544, "loss": 0.7139, "step": 8360 }, { "epoch": 1.81, "learning_rate": 0.00012010865628395509, "loss": 0.7215, "step": 8380 }, { "epoch": 1.81, "learning_rate": 0.00011967403114813472, "loss": 0.725, "step": 8400 }, { "epoch": 1.81, "eval_loss": 0.7237139344215393, "eval_runtime": 25.506, "eval_samples_per_second": 78.413, "eval_steps_per_second": 1.255, "step": 8400 }, { "epoch": 1.82, "learning_rate": 0.00011923940601231437, "loss": 0.7107, "step": 8420 }, { "epoch": 1.82, "learning_rate": 0.00011880478087649402, "loss": 0.7095, "step": 8440 }, { "epoch": 1.83, "learning_rate": 0.00011837015574067366, "loss": 0.7061, "step": 8460 }, { "epoch": 1.83, "learning_rate": 0.0001179355306048533, "loss": 0.716, "step": 8480 }, { "epoch": 1.83, "learning_rate": 0.00011750090546903295, "loss": 0.7203, "step": 8500 }, { "epoch": 1.84, "learning_rate": 0.00011706628033321258, "loss": 0.7098, "step": 8520 }, { "epoch": 1.84, "learning_rate": 0.00011663165519739223, "loss": 0.7104, "step": 8540 }, { "epoch": 1.85, "learning_rate": 0.00011619703006157188, "loss": 0.7051, "step": 8560 }, { "epoch": 1.85, "learning_rate": 0.00011576240492575152, "loss": 0.7198, "step": 8580 }, { "epoch": 1.86, "learning_rate": 0.00011532777978993117, "loss": 0.7175, "step": 8600 }, { "epoch": 1.86, "eval_loss": 0.7230754494667053, "eval_runtime": 25.5133, "eval_samples_per_second": 78.39, "eval_steps_per_second": 1.254, "step": 8600 }, { "epoch": 1.86, "learning_rate": 0.00011489315465411082, "loss": 0.7046, "step": 8620 }, { "epoch": 1.86, "learning_rate": 0.00011445852951829046, "loss": 0.7176, "step": 8640 }, { "epoch": 1.87, "learning_rate": 0.0001140239043824701, "loss": 0.7193, "step": 8660 }, { "epoch": 1.87, "learning_rate": 0.00011358927924664976, "loss": 0.7046, "step": 8680 }, { "epoch": 1.88, "learning_rate": 0.00011315465411082939, "loss": 0.7116, "step": 8700 }, { "epoch": 1.88, "learning_rate": 0.00011274176023180006, "loss": 0.7152, "step": 8720 }, { "epoch": 1.89, "learning_rate": 0.00011230713509597971, "loss": 0.7164, "step": 8740 }, { "epoch": 1.89, "learning_rate": 0.00011187250996015936, "loss": 0.7192, "step": 8760 }, { "epoch": 1.89, "learning_rate": 0.000111437884824339, "loss": 0.7124, "step": 8780 }, { "epoch": 1.9, "learning_rate": 0.00011100325968851865, "loss": 0.7032, "step": 8800 }, { "epoch": 1.9, "eval_loss": 0.7217770218849182, "eval_runtime": 25.4723, "eval_samples_per_second": 78.517, "eval_steps_per_second": 1.256, "step": 8800 }, { "epoch": 1.9, "learning_rate": 0.0001105686345526983, "loss": 0.7157, "step": 8820 }, { "epoch": 1.91, "learning_rate": 0.00011013400941687794, "loss": 0.7115, "step": 8840 }, { "epoch": 1.91, "learning_rate": 0.00010969938428105759, "loss": 0.7137, "step": 8860 }, { "epoch": 1.92, "learning_rate": 0.00010926475914523724, "loss": 0.7176, "step": 8880 }, { "epoch": 1.92, "learning_rate": 0.00010883013400941687, "loss": 0.7081, "step": 8900 }, { "epoch": 1.92, "learning_rate": 0.00010839550887359652, "loss": 0.7233, "step": 8920 }, { "epoch": 1.93, "learning_rate": 0.00010796088373777617, "loss": 0.7058, "step": 8940 }, { "epoch": 1.93, "learning_rate": 0.00010752625860195581, "loss": 0.7154, "step": 8960 }, { "epoch": 1.94, "learning_rate": 0.00010709163346613546, "loss": 0.7135, "step": 8980 }, { "epoch": 1.94, "learning_rate": 0.00010665700833031508, "loss": 0.7078, "step": 9000 }, { "epoch": 1.94, "eval_loss": 0.7215875387191772, "eval_runtime": 25.484, "eval_samples_per_second": 78.481, "eval_steps_per_second": 1.256, "step": 9000 }, { "epoch": 1.95, "learning_rate": 0.00010622238319449473, "loss": 0.7061, "step": 9020 }, { "epoch": 1.95, "learning_rate": 0.00010578775805867438, "loss": 0.7174, "step": 9040 }, { "epoch": 1.95, "learning_rate": 0.00010535313292285402, "loss": 0.7132, "step": 9060 }, { "epoch": 1.96, "learning_rate": 0.00010491850778703367, "loss": 0.7247, "step": 9080 }, { "epoch": 1.96, "learning_rate": 0.00010448388265121332, "loss": 0.7064, "step": 9100 }, { "epoch": 1.97, "learning_rate": 0.00010404925751539295, "loss": 0.7098, "step": 9120 }, { "epoch": 1.97, "learning_rate": 0.0001036146323795726, "loss": 0.708, "step": 9140 }, { "epoch": 1.98, "learning_rate": 0.00010318000724375225, "loss": 0.7144, "step": 9160 }, { "epoch": 1.98, "learning_rate": 0.00010274538210793189, "loss": 0.7151, "step": 9180 }, { "epoch": 1.98, "learning_rate": 0.00010231075697211154, "loss": 0.718, "step": 9200 }, { "epoch": 1.98, "eval_loss": 0.7208251357078552, "eval_runtime": 25.5022, "eval_samples_per_second": 78.425, "eval_steps_per_second": 1.255, "step": 9200 }, { "epoch": 1.99, "learning_rate": 0.00010187613183629119, "loss": 0.7108, "step": 9220 }, { "epoch": 1.99, "learning_rate": 0.00010144150670047083, "loss": 0.6952, "step": 9240 }, { "epoch": 2.0, "learning_rate": 0.00010100688156465048, "loss": 0.7013, "step": 9260 }, { "epoch": 2.0, "learning_rate": 0.00010057225642883013, "loss": 0.7013, "step": 9280 }, { "epoch": 2.01, "learning_rate": 0.00010013763129300976, "loss": 0.7049, "step": 9300 }, { "epoch": 2.01, "learning_rate": 9.970300615718941e-05, "loss": 0.7093, "step": 9320 }, { "epoch": 2.02, "learning_rate": 9.926838102136906e-05, "loss": 0.713, "step": 9340 }, { "epoch": 2.02, "learning_rate": 9.88337558855487e-05, "loss": 0.7108, "step": 9360 }, { "epoch": 2.02, "learning_rate": 9.839913074972835e-05, "loss": 0.7115, "step": 9380 }, { "epoch": 2.03, "learning_rate": 9.7964505613908e-05, "loss": 0.7119, "step": 9400 }, { "epoch": 2.03, "eval_loss": 0.7202969789505005, "eval_runtime": 25.504, "eval_samples_per_second": 78.419, "eval_steps_per_second": 1.255, "step": 9400 }, { "epoch": 2.03, "learning_rate": 9.752988047808764e-05, "loss": 0.7107, "step": 9420 }, { "epoch": 2.04, "learning_rate": 9.709525534226729e-05, "loss": 0.7065, "step": 9440 }, { "epoch": 2.04, "learning_rate": 9.666063020644694e-05, "loss": 0.7121, "step": 9460 }, { "epoch": 2.05, "learning_rate": 9.622600507062657e-05, "loss": 0.7163, "step": 9480 }, { "epoch": 2.05, "learning_rate": 9.579137993480622e-05, "loss": 0.7026, "step": 9500 }, { "epoch": 2.05, "learning_rate": 9.535675479898587e-05, "loss": 0.7158, "step": 9520 }, { "epoch": 2.06, "learning_rate": 9.492212966316551e-05, "loss": 0.7016, "step": 9540 }, { "epoch": 2.06, "learning_rate": 9.448750452734516e-05, "loss": 0.7149, "step": 9560 }, { "epoch": 2.07, "learning_rate": 9.405287939152481e-05, "loss": 0.7079, "step": 9580 }, { "epoch": 2.07, "learning_rate": 9.361825425570445e-05, "loss": 0.709, "step": 9600 }, { "epoch": 2.07, "eval_loss": 0.7194134593009949, "eval_runtime": 25.5286, "eval_samples_per_second": 78.343, "eval_steps_per_second": 1.253, "step": 9600 }, { "epoch": 2.08, "learning_rate": 9.31836291198841e-05, "loss": 0.7127, "step": 9620 }, { "epoch": 2.08, "learning_rate": 9.274900398406375e-05, "loss": 0.7037, "step": 9640 }, { "epoch": 2.08, "learning_rate": 9.231437884824338e-05, "loss": 0.7114, "step": 9660 }, { "epoch": 2.09, "learning_rate": 9.187975371242303e-05, "loss": 0.706, "step": 9680 }, { "epoch": 2.09, "learning_rate": 9.144512857660268e-05, "loss": 0.7026, "step": 9700 }, { "epoch": 2.1, "learning_rate": 9.101050344078232e-05, "loss": 0.7079, "step": 9720 }, { "epoch": 2.1, "learning_rate": 9.057587830496197e-05, "loss": 0.7053, "step": 9740 }, { "epoch": 2.11, "learning_rate": 9.014125316914162e-05, "loss": 0.7125, "step": 9760 }, { "epoch": 2.11, "learning_rate": 8.970662803332126e-05, "loss": 0.7045, "step": 9780 }, { "epoch": 2.11, "learning_rate": 8.92720028975009e-05, "loss": 0.7109, "step": 9800 }, { "epoch": 2.11, "eval_loss": 0.7186465859413147, "eval_runtime": 25.5049, "eval_samples_per_second": 78.416, "eval_steps_per_second": 1.255, "step": 9800 }, { "epoch": 2.12, "learning_rate": 8.883737776168056e-05, "loss": 0.7035, "step": 9820 }, { "epoch": 2.12, "learning_rate": 8.840275262586019e-05, "loss": 0.7073, "step": 9840 }, { "epoch": 2.13, "learning_rate": 8.796812749003983e-05, "loss": 0.7114, "step": 9860 }, { "epoch": 2.13, "learning_rate": 8.753350235421946e-05, "loss": 0.7066, "step": 9880 }, { "epoch": 2.14, "learning_rate": 8.709887721839911e-05, "loss": 0.7055, "step": 9900 }, { "epoch": 2.14, "learning_rate": 8.666425208257877e-05, "loss": 0.7064, "step": 9920 }, { "epoch": 2.14, "learning_rate": 8.62296269467584e-05, "loss": 0.7154, "step": 9940 }, { "epoch": 2.15, "learning_rate": 8.579500181093805e-05, "loss": 0.7099, "step": 9960 }, { "epoch": 2.15, "learning_rate": 8.53603766751177e-05, "loss": 0.7112, "step": 9980 }, { "epoch": 2.16, "learning_rate": 8.492575153929734e-05, "loss": 0.7086, "step": 10000 }, { "epoch": 2.16, "eval_loss": 0.7181739211082458, "eval_runtime": 25.5087, "eval_samples_per_second": 78.405, "eval_steps_per_second": 1.254, "step": 10000 }, { "epoch": 2.16, "learning_rate": 8.449112640347699e-05, "loss": 0.7155, "step": 10020 }, { "epoch": 2.17, "learning_rate": 8.405650126765664e-05, "loss": 0.7097, "step": 10040 }, { "epoch": 2.17, "learning_rate": 8.362187613183627e-05, "loss": 0.7025, "step": 10060 }, { "epoch": 2.17, "learning_rate": 8.318725099601592e-05, "loss": 0.7065, "step": 10080 }, { "epoch": 2.18, "learning_rate": 8.275262586019557e-05, "loss": 0.6982, "step": 10100 }, { "epoch": 2.18, "learning_rate": 8.231800072437521e-05, "loss": 0.7039, "step": 10120 }, { "epoch": 2.19, "learning_rate": 8.188337558855486e-05, "loss": 0.7097, "step": 10140 }, { "epoch": 2.19, "learning_rate": 8.144875045273451e-05, "loss": 0.7089, "step": 10160 }, { "epoch": 2.2, "learning_rate": 8.101412531691415e-05, "loss": 0.7018, "step": 10180 }, { "epoch": 2.2, "learning_rate": 8.05795001810938e-05, "loss": 0.7025, "step": 10200 }, { "epoch": 2.2, "eval_loss": 0.7179592251777649, "eval_runtime": 25.4993, "eval_samples_per_second": 78.433, "eval_steps_per_second": 1.255, "step": 10200 }, { "epoch": 2.2, "learning_rate": 8.014487504527345e-05, "loss": 0.7067, "step": 10220 }, { "epoch": 2.21, "learning_rate": 7.971024990945308e-05, "loss": 0.71, "step": 10240 }, { "epoch": 2.21, "learning_rate": 7.927562477363273e-05, "loss": 0.7255, "step": 10260 }, { "epoch": 2.22, "learning_rate": 7.884099963781238e-05, "loss": 0.7065, "step": 10280 }, { "epoch": 2.22, "learning_rate": 7.840637450199202e-05, "loss": 0.712, "step": 10300 }, { "epoch": 2.23, "learning_rate": 7.797174936617167e-05, "loss": 0.7132, "step": 10320 }, { "epoch": 2.23, "learning_rate": 7.753712423035132e-05, "loss": 0.7106, "step": 10340 }, { "epoch": 2.24, "learning_rate": 7.710249909453096e-05, "loss": 0.708, "step": 10360 }, { "epoch": 2.24, "learning_rate": 7.666787395871061e-05, "loss": 0.7054, "step": 10380 }, { "epoch": 2.24, "learning_rate": 7.623324882289026e-05, "loss": 0.7087, "step": 10400 }, { "epoch": 2.24, "eval_loss": 0.717901349067688, "eval_runtime": 25.4862, "eval_samples_per_second": 78.474, "eval_steps_per_second": 1.256, "step": 10400 }, { "epoch": 2.25, "learning_rate": 7.57986236870699e-05, "loss": 0.7014, "step": 10420 }, { "epoch": 2.25, "learning_rate": 7.536399855124954e-05, "loss": 0.7103, "step": 10440 }, { "epoch": 2.26, "learning_rate": 7.49293734154292e-05, "loss": 0.7089, "step": 10460 }, { "epoch": 2.26, "learning_rate": 7.449474827960883e-05, "loss": 0.704, "step": 10480 }, { "epoch": 2.27, "learning_rate": 7.406012314378847e-05, "loss": 0.7074, "step": 10500 }, { "epoch": 2.27, "learning_rate": 7.362549800796812e-05, "loss": 0.7094, "step": 10520 }, { "epoch": 2.27, "learning_rate": 7.319087287214777e-05, "loss": 0.7069, "step": 10540 }, { "epoch": 2.28, "learning_rate": 7.27562477363274e-05, "loss": 0.7081, "step": 10560 }, { "epoch": 2.28, "learning_rate": 7.232162260050705e-05, "loss": 0.7036, "step": 10580 }, { "epoch": 2.29, "learning_rate": 7.18869974646867e-05, "loss": 0.6984, "step": 10600 }, { "epoch": 2.29, "eval_loss": 0.7175166010856628, "eval_runtime": 25.5016, "eval_samples_per_second": 78.426, "eval_steps_per_second": 1.255, "step": 10600 }, { "epoch": 2.29, "learning_rate": 7.145237232886634e-05, "loss": 0.7097, "step": 10620 }, { "epoch": 2.3, "learning_rate": 7.101774719304599e-05, "loss": 0.7143, "step": 10640 }, { "epoch": 2.3, "learning_rate": 7.058312205722564e-05, "loss": 0.7099, "step": 10660 }, { "epoch": 2.3, "learning_rate": 7.014849692140528e-05, "loss": 0.6994, "step": 10680 }, { "epoch": 2.31, "learning_rate": 6.971387178558493e-05, "loss": 0.7129, "step": 10700 }, { "epoch": 2.31, "learning_rate": 6.927924664976458e-05, "loss": 0.7067, "step": 10720 }, { "epoch": 2.32, "learning_rate": 6.884462151394421e-05, "loss": 0.7044, "step": 10740 }, { "epoch": 2.32, "learning_rate": 6.840999637812386e-05, "loss": 0.7092, "step": 10760 }, { "epoch": 2.33, "learning_rate": 6.797537124230351e-05, "loss": 0.7075, "step": 10780 }, { "epoch": 2.33, "learning_rate": 6.754074610648315e-05, "loss": 0.7073, "step": 10800 }, { "epoch": 2.33, "eval_loss": 0.7168901562690735, "eval_runtime": 25.5153, "eval_samples_per_second": 78.384, "eval_steps_per_second": 1.254, "step": 10800 }, { "epoch": 2.33, "learning_rate": 6.71061209706628e-05, "loss": 0.7088, "step": 10820 }, { "epoch": 2.34, "learning_rate": 6.667149583484245e-05, "loss": 0.7046, "step": 10840 }, { "epoch": 2.34, "learning_rate": 6.623687069902209e-05, "loss": 0.7029, "step": 10860 }, { "epoch": 2.35, "learning_rate": 6.580224556320174e-05, "loss": 0.7055, "step": 10880 }, { "epoch": 2.35, "learning_rate": 6.536762042738139e-05, "loss": 0.7095, "step": 10900 }, { "epoch": 2.36, "learning_rate": 6.493299529156102e-05, "loss": 0.7057, "step": 10920 }, { "epoch": 2.36, "learning_rate": 6.449837015574066e-05, "loss": 0.7064, "step": 10940 }, { "epoch": 2.36, "learning_rate": 6.406374501992031e-05, "loss": 0.7039, "step": 10960 }, { "epoch": 2.37, "learning_rate": 6.362911988409996e-05, "loss": 0.7109, "step": 10980 }, { "epoch": 2.37, "learning_rate": 6.31944947482796e-05, "loss": 0.7051, "step": 11000 }, { "epoch": 2.37, "eval_loss": 0.7164381146430969, "eval_runtime": 25.4817, "eval_samples_per_second": 78.488, "eval_steps_per_second": 1.256, "step": 11000 }, { "epoch": 2.38, "learning_rate": 6.275986961245924e-05, "loss": 0.7117, "step": 11020 }, { "epoch": 2.38, "learning_rate": 6.23252444766389e-05, "loss": 0.6972, "step": 11040 }, { "epoch": 2.39, "learning_rate": 6.189061934081853e-05, "loss": 0.7087, "step": 11060 }, { "epoch": 2.39, "learning_rate": 6.145599420499818e-05, "loss": 0.703, "step": 11080 }, { "epoch": 2.39, "learning_rate": 6.1021369069177825e-05, "loss": 0.7062, "step": 11100 }, { "epoch": 2.4, "learning_rate": 6.0586743933357475e-05, "loss": 0.7018, "step": 11120 }, { "epoch": 2.4, "learning_rate": 6.015211879753712e-05, "loss": 0.7003, "step": 11140 }, { "epoch": 2.41, "learning_rate": 5.971749366171676e-05, "loss": 0.7005, "step": 11160 }, { "epoch": 2.41, "learning_rate": 5.928286852589641e-05, "loss": 0.7099, "step": 11180 }, { "epoch": 2.42, "learning_rate": 5.8848243390076054e-05, "loss": 0.7002, "step": 11200 }, { "epoch": 2.42, "eval_loss": 0.7161288857460022, "eval_runtime": 25.5084, "eval_samples_per_second": 78.406, "eval_steps_per_second": 1.254, "step": 11200 }, { "epoch": 2.42, "learning_rate": 5.84136182542557e-05, "loss": 0.7071, "step": 11220 }, { "epoch": 2.43, "learning_rate": 5.797899311843535e-05, "loss": 0.7028, "step": 11240 }, { "epoch": 2.43, "learning_rate": 5.754436798261499e-05, "loss": 0.7199, "step": 11260 }, { "epoch": 2.43, "learning_rate": 5.7109742846794634e-05, "loss": 0.6974, "step": 11280 }, { "epoch": 2.44, "learning_rate": 5.6675117710974284e-05, "loss": 0.7003, "step": 11300 }, { "epoch": 2.44, "learning_rate": 5.624049257515393e-05, "loss": 0.7079, "step": 11320 }, { "epoch": 2.45, "learning_rate": 5.580586743933357e-05, "loss": 0.6988, "step": 11340 }, { "epoch": 2.45, "learning_rate": 5.537124230351322e-05, "loss": 0.7047, "step": 11360 }, { "epoch": 2.46, "learning_rate": 5.493661716769286e-05, "loss": 0.6946, "step": 11380 }, { "epoch": 2.46, "learning_rate": 5.45019920318725e-05, "loss": 0.7096, "step": 11400 }, { "epoch": 2.46, "eval_loss": 0.7155815958976746, "eval_runtime": 25.525, "eval_samples_per_second": 78.355, "eval_steps_per_second": 1.254, "step": 11400 }, { "epoch": 2.46, "learning_rate": 5.406736689605215e-05, "loss": 0.709, "step": 11420 }, { "epoch": 2.47, "learning_rate": 5.3632741760231794e-05, "loss": 0.7112, "step": 11440 }, { "epoch": 2.47, "learning_rate": 5.319811662441144e-05, "loss": 0.6983, "step": 11460 }, { "epoch": 2.48, "learning_rate": 5.276349148859109e-05, "loss": 0.7, "step": 11480 }, { "epoch": 2.48, "learning_rate": 5.232886635277073e-05, "loss": 0.7006, "step": 11500 }, { "epoch": 2.49, "learning_rate": 5.189424121695037e-05, "loss": 0.7068, "step": 11520 }, { "epoch": 2.49, "learning_rate": 5.1459616081130023e-05, "loss": 0.7012, "step": 11540 }, { "epoch": 2.49, "learning_rate": 5.102499094530967e-05, "loss": 0.7079, "step": 11560 }, { "epoch": 2.5, "learning_rate": 5.059036580948931e-05, "loss": 0.7031, "step": 11580 }, { "epoch": 2.5, "learning_rate": 5.015574067366896e-05, "loss": 0.7038, "step": 11600 }, { "epoch": 2.5, "eval_loss": 0.7149330973625183, "eval_runtime": 25.4843, "eval_samples_per_second": 78.48, "eval_steps_per_second": 1.256, "step": 11600 }, { "epoch": 2.51, "learning_rate": 4.97211155378486e-05, "loss": 0.6972, "step": 11620 }, { "epoch": 2.51, "learning_rate": 4.9286490402028246e-05, "loss": 0.7039, "step": 11640 }, { "epoch": 2.52, "learning_rate": 4.885186526620789e-05, "loss": 0.7052, "step": 11660 }, { "epoch": 2.52, "learning_rate": 4.841724013038754e-05, "loss": 0.7045, "step": 11680 }, { "epoch": 2.52, "learning_rate": 4.798261499456718e-05, "loss": 0.701, "step": 11700 }, { "epoch": 2.53, "learning_rate": 4.7547989858746826e-05, "loss": 0.7084, "step": 11720 }, { "epoch": 2.53, "learning_rate": 4.7113364722926476e-05, "loss": 0.6988, "step": 11740 }, { "epoch": 2.54, "learning_rate": 4.667873958710612e-05, "loss": 0.7155, "step": 11760 }, { "epoch": 2.54, "learning_rate": 4.624411445128576e-05, "loss": 0.7044, "step": 11780 }, { "epoch": 2.55, "learning_rate": 4.5809489315465406e-05, "loss": 0.7014, "step": 11800 }, { "epoch": 2.55, "eval_loss": 0.714367151260376, "eval_runtime": 25.4959, "eval_samples_per_second": 78.444, "eval_steps_per_second": 1.255, "step": 11800 }, { "epoch": 2.55, "learning_rate": 4.537486417964505e-05, "loss": 0.708, "step": 11820 }, { "epoch": 2.55, "learning_rate": 4.494023904382469e-05, "loss": 0.6976, "step": 11840 }, { "epoch": 2.56, "learning_rate": 4.450561390800434e-05, "loss": 0.7057, "step": 11860 }, { "epoch": 2.56, "learning_rate": 4.4070988772183986e-05, "loss": 0.7039, "step": 11880 }, { "epoch": 2.57, "learning_rate": 4.363636363636363e-05, "loss": 0.7089, "step": 11900 }, { "epoch": 2.57, "learning_rate": 4.320173850054328e-05, "loss": 0.7026, "step": 11920 }, { "epoch": 2.58, "learning_rate": 4.276711336472292e-05, "loss": 0.7023, "step": 11940 }, { "epoch": 2.58, "learning_rate": 4.2332488228902565e-05, "loss": 0.7006, "step": 11960 }, { "epoch": 2.58, "learning_rate": 4.1897863093082215e-05, "loss": 0.7008, "step": 11980 }, { "epoch": 2.59, "learning_rate": 4.146323795726186e-05, "loss": 0.7057, "step": 12000 }, { "epoch": 2.59, "eval_loss": 0.7141902446746826, "eval_runtime": 25.5019, "eval_samples_per_second": 78.426, "eval_steps_per_second": 1.255, "step": 12000 }, { "epoch": 2.59, "learning_rate": 4.10286128214415e-05, "loss": 0.7083, "step": 12020 }, { "epoch": 2.6, "learning_rate": 4.059398768562115e-05, "loss": 0.6986, "step": 12040 }, { "epoch": 2.6, "learning_rate": 4.0159362549800795e-05, "loss": 0.7076, "step": 12060 }, { "epoch": 2.61, "learning_rate": 3.972473741398044e-05, "loss": 0.7071, "step": 12080 }, { "epoch": 2.61, "learning_rate": 3.929011227816009e-05, "loss": 0.6984, "step": 12100 }, { "epoch": 2.61, "learning_rate": 3.885548714233973e-05, "loss": 0.7096, "step": 12120 }, { "epoch": 2.62, "learning_rate": 3.8420862006519375e-05, "loss": 0.7027, "step": 12140 }, { "epoch": 2.62, "learning_rate": 3.7986236870699025e-05, "loss": 0.7062, "step": 12160 }, { "epoch": 2.63, "learning_rate": 3.755161173487867e-05, "loss": 0.7049, "step": 12180 }, { "epoch": 2.63, "learning_rate": 3.711698659905831e-05, "loss": 0.7052, "step": 12200 }, { "epoch": 2.63, "eval_loss": 0.7140177488327026, "eval_runtime": 25.4673, "eval_samples_per_second": 78.532, "eval_steps_per_second": 1.257, "step": 12200 }, { "epoch": 2.64, "learning_rate": 3.6682361463237955e-05, "loss": 0.7011, "step": 12220 }, { "epoch": 2.64, "learning_rate": 3.62477363274176e-05, "loss": 0.7025, "step": 12240 }, { "epoch": 2.65, "learning_rate": 3.581311119159725e-05, "loss": 0.7006, "step": 12260 }, { "epoch": 2.65, "learning_rate": 3.537848605577689e-05, "loss": 0.7073, "step": 12280 }, { "epoch": 2.65, "learning_rate": 3.4943860919956534e-05, "loss": 0.7033, "step": 12300 }, { "epoch": 2.66, "learning_rate": 3.4509235784136184e-05, "loss": 0.6992, "step": 12320 }, { "epoch": 2.66, "learning_rate": 3.407461064831582e-05, "loss": 0.7043, "step": 12340 }, { "epoch": 2.67, "learning_rate": 3.363998551249547e-05, "loss": 0.7083, "step": 12360 }, { "epoch": 2.67, "learning_rate": 3.3205360376675114e-05, "loss": 0.7086, "step": 12380 }, { "epoch": 2.68, "learning_rate": 3.277073524085476e-05, "loss": 0.7168, "step": 12400 }, { "epoch": 2.68, "eval_loss": 0.7138265371322632, "eval_runtime": 25.5077, "eval_samples_per_second": 78.408, "eval_steps_per_second": 1.255, "step": 12400 }, { "epoch": 2.68, "learning_rate": 3.233611010503441e-05, "loss": 0.7026, "step": 12420 }, { "epoch": 2.68, "learning_rate": 3.190148496921405e-05, "loss": 0.7097, "step": 12440 }, { "epoch": 2.69, "learning_rate": 3.1466859833393694e-05, "loss": 0.7094, "step": 12460 }, { "epoch": 2.69, "learning_rate": 3.1032234697573344e-05, "loss": 0.6971, "step": 12480 }, { "epoch": 2.7, "learning_rate": 3.059760956175299e-05, "loss": 0.6977, "step": 12500 }, { "epoch": 2.7, "learning_rate": 3.016298442593263e-05, "loss": 0.6945, "step": 12520 }, { "epoch": 2.71, "learning_rate": 2.9728359290112277e-05, "loss": 0.6998, "step": 12540 }, { "epoch": 2.71, "learning_rate": 2.929373415429192e-05, "loss": 0.7067, "step": 12560 }, { "epoch": 2.71, "learning_rate": 2.8859109018471563e-05, "loss": 0.6935, "step": 12580 }, { "epoch": 2.72, "learning_rate": 2.842448388265121e-05, "loss": 0.6927, "step": 12600 }, { "epoch": 2.72, "eval_loss": 0.7132371664047241, "eval_runtime": 25.516, "eval_samples_per_second": 78.382, "eval_steps_per_second": 1.254, "step": 12600 }, { "epoch": 2.72, "learning_rate": 2.7989858746830857e-05, "loss": 0.7025, "step": 12620 }, { "epoch": 2.73, "learning_rate": 2.75552336110105e-05, "loss": 0.7098, "step": 12640 }, { "epoch": 2.73, "learning_rate": 2.7120608475190147e-05, "loss": 0.6939, "step": 12660 }, { "epoch": 2.74, "learning_rate": 2.6685983339369793e-05, "loss": 0.7038, "step": 12680 }, { "epoch": 2.74, "learning_rate": 2.6251358203549436e-05, "loss": 0.7039, "step": 12700 }, { "epoch": 2.74, "learning_rate": 2.5816733067729083e-05, "loss": 0.7018, "step": 12720 }, { "epoch": 2.75, "learning_rate": 2.538210793190873e-05, "loss": 0.6943, "step": 12740 }, { "epoch": 2.75, "learning_rate": 2.4947482796088373e-05, "loss": 0.7007, "step": 12760 }, { "epoch": 2.76, "learning_rate": 2.4512857660268016e-05, "loss": 0.7019, "step": 12780 }, { "epoch": 2.76, "learning_rate": 2.407823252444766e-05, "loss": 0.6957, "step": 12800 }, { "epoch": 2.76, "eval_loss": 0.7126932144165039, "eval_runtime": 25.4915, "eval_samples_per_second": 78.458, "eval_steps_per_second": 1.255, "step": 12800 }, { "epoch": 2.77, "learning_rate": 2.3643607388627306e-05, "loss": 0.6993, "step": 12820 }, { "epoch": 2.77, "learning_rate": 2.3208982252806953e-05, "loss": 0.6951, "step": 12840 }, { "epoch": 2.77, "learning_rate": 2.2774357116986596e-05, "loss": 0.7056, "step": 12860 }, { "epoch": 2.78, "learning_rate": 2.2339731981166243e-05, "loss": 0.7153, "step": 12880 }, { "epoch": 2.78, "learning_rate": 2.190510684534589e-05, "loss": 0.7022, "step": 12900 }, { "epoch": 2.79, "learning_rate": 2.1470481709525532e-05, "loss": 0.7078, "step": 12920 }, { "epoch": 2.79, "learning_rate": 2.103585657370518e-05, "loss": 0.6969, "step": 12940 }, { "epoch": 2.8, "learning_rate": 2.0601231437884826e-05, "loss": 0.7056, "step": 12960 }, { "epoch": 2.8, "learning_rate": 2.016660630206447e-05, "loss": 0.6975, "step": 12980 }, { "epoch": 2.8, "learning_rate": 1.9731981166244112e-05, "loss": 0.7065, "step": 13000 }, { "epoch": 2.8, "eval_loss": 0.7130131721496582, "eval_runtime": 25.4905, "eval_samples_per_second": 78.461, "eval_steps_per_second": 1.255, "step": 13000 }, { "epoch": 2.81, "learning_rate": 1.9297356030423755e-05, "loss": 0.7, "step": 13020 }, { "epoch": 2.81, "learning_rate": 1.8862730894603402e-05, "loss": 0.7144, "step": 13040 }, { "epoch": 2.82, "learning_rate": 1.842810575878305e-05, "loss": 0.6964, "step": 13060 }, { "epoch": 2.82, "learning_rate": 1.7993480622962692e-05, "loss": 0.6981, "step": 13080 }, { "epoch": 2.83, "learning_rate": 1.755885548714234e-05, "loss": 0.7102, "step": 13100 }, { "epoch": 2.83, "learning_rate": 1.7124230351321985e-05, "loss": 0.6975, "step": 13120 }, { "epoch": 2.83, "learning_rate": 1.668960521550163e-05, "loss": 0.7062, "step": 13140 }, { "epoch": 2.84, "learning_rate": 1.625498007968127e-05, "loss": 0.6956, "step": 13160 }, { "epoch": 2.84, "learning_rate": 1.5820354943860918e-05, "loss": 0.71, "step": 13180 }, { "epoch": 2.85, "learning_rate": 1.5385729808040565e-05, "loss": 0.7081, "step": 13200 }, { "epoch": 2.85, "eval_loss": 0.7126001119613647, "eval_runtime": 25.5102, "eval_samples_per_second": 78.4, "eval_steps_per_second": 1.254, "step": 13200 }, { "epoch": 2.85, "learning_rate": 1.495110467222021e-05, "loss": 0.6977, "step": 13220 }, { "epoch": 2.86, "learning_rate": 1.4516479536399855e-05, "loss": 0.705, "step": 13240 }, { "epoch": 2.86, "learning_rate": 1.4081854400579498e-05, "loss": 0.7016, "step": 13260 }, { "epoch": 2.87, "learning_rate": 1.3647229264759143e-05, "loss": 0.6922, "step": 13280 }, { "epoch": 2.87, "learning_rate": 1.321260412893879e-05, "loss": 0.6987, "step": 13300 }, { "epoch": 2.87, "learning_rate": 1.2777978993118434e-05, "loss": 0.7041, "step": 13320 }, { "epoch": 2.88, "learning_rate": 1.234335385729808e-05, "loss": 0.7101, "step": 13340 }, { "epoch": 2.88, "learning_rate": 1.1908728721477723e-05, "loss": 0.6976, "step": 13360 }, { "epoch": 2.89, "learning_rate": 1.147410358565737e-05, "loss": 0.7011, "step": 13380 }, { "epoch": 2.89, "learning_rate": 1.1039478449837014e-05, "loss": 0.6973, "step": 13400 }, { "epoch": 2.89, "eval_loss": 0.7123447060585022, "eval_runtime": 25.5029, "eval_samples_per_second": 78.422, "eval_steps_per_second": 1.255, "step": 13400 }, { "epoch": 2.9, "learning_rate": 1.060485331401666e-05, "loss": 0.7084, "step": 13420 }, { "epoch": 2.9, "learning_rate": 1.0170228178196306e-05, "loss": 0.7133, "step": 13440 }, { "epoch": 2.9, "learning_rate": 9.73560304237595e-06, "loss": 0.6988, "step": 13460 }, { "epoch": 2.91, "learning_rate": 9.300977906555596e-06, "loss": 0.7045, "step": 13480 }, { "epoch": 2.91, "learning_rate": 8.866352770735239e-06, "loss": 0.6985, "step": 13500 }, { "epoch": 2.92, "learning_rate": 8.431727634914886e-06, "loss": 0.6967, "step": 13520 }, { "epoch": 2.92, "learning_rate": 7.99710249909453e-06, "loss": 0.7008, "step": 13540 }, { "epoch": 2.93, "learning_rate": 7.562477363274175e-06, "loss": 0.6956, "step": 13560 }, { "epoch": 2.93, "learning_rate": 7.12785222745382e-06, "loss": 0.7065, "step": 13580 }, { "epoch": 2.93, "learning_rate": 6.693227091633466e-06, "loss": 0.7018, "step": 13600 }, { "epoch": 2.93, "eval_loss": 0.712183952331543, "eval_runtime": 25.5697, "eval_samples_per_second": 78.218, "eval_steps_per_second": 1.251, "step": 13600 } ], "max_steps": 13905, "num_train_epochs": 3, "total_flos": 6.876240637723253e+19, "trial_name": null, "trial_params": null }