diff --git "a/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15200/trainer_state.json" "b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15200/trainer_state.json" new file mode 100644--- /dev/null +++ "b/alpaca-lora-based-origin-llama7b/lora-alpaca-cn-remote-1M/checkpoint-15200/trainer_state.json" @@ -0,0 +1,5184 @@ +{ + "best_metric": 0.6878132224082947, + "best_model_checkpoint": "lora-alpaca/checkpoint-15200", + "epoch": 1.9494677440041042, + "global_step": 15200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.9999999999999995e-05, + "loss": 0.8988, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011999999999999999, + "loss": 0.7184, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017999999999999998, + "loss": 0.7227, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 0.00023999999999999998, + "loss": 0.7244, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003, + "loss": 0.7225, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002996127533238673, + "loss": 0.7183, + "step": 120 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002992255066477346, + "loss": 0.7246, + "step": 140 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029883825997160186, + "loss": 0.7334, + "step": 160 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002984510132954692, + "loss": 0.7224, + "step": 180 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002980637666193365, + "loss": 0.7212, + "step": 200 + }, + { + "epoch": 0.03, + "eval_loss": 0.7327759861946106, + "eval_runtime": 178.5232, + "eval_samples_per_second": 11.203, + "eval_steps_per_second": 1.4, + "step": 200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002976765199432038, + "loss": 0.7298, + "step": 220 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002972892732670711, + "loss": 0.7275, + "step": 240 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002969020265909384, + "loss": 0.7265, + "step": 260 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002965147799148057, + "loss": 0.7285, + "step": 280 + }, + { + "epoch": 0.04, + "learning_rate": 0.000296127533238673, + "loss": 0.7218, + "step": 300 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002957402865625403, + "loss": 0.715, + "step": 320 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029535303988640764, + "loss": 0.7347, + "step": 340 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002949657932102749, + "loss": 0.7228, + "step": 360 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029457854653414225, + "loss": 0.7198, + "step": 380 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029419129985800953, + "loss": 0.7196, + "step": 400 + }, + { + "epoch": 0.05, + "eval_loss": 0.7325090765953064, + "eval_runtime": 178.6018, + "eval_samples_per_second": 11.198, + "eval_steps_per_second": 1.4, + "step": 400 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002938040531818768, + "loss": 0.7233, + "step": 420 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029341680650574414, + "loss": 0.7272, + "step": 440 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002930295598296114, + "loss": 0.7272, + "step": 460 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029264231315347876, + "loss": 0.7281, + "step": 480 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029225506647734603, + "loss": 0.7289, + "step": 500 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002918678198012133, + "loss": 0.7215, + "step": 520 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029148057312508065, + "loss": 0.7234, + "step": 540 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002910933264489479, + "loss": 0.7229, + "step": 560 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029070607977281526, + "loss": 0.7277, + "step": 580 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002903188330966826, + "loss": 0.7275, + "step": 600 + }, + { + "epoch": 0.08, + "eval_loss": 0.7319443821907043, + "eval_runtime": 178.6932, + "eval_samples_per_second": 11.192, + "eval_steps_per_second": 1.399, + "step": 600 + }, + { + "epoch": 0.08, + "learning_rate": 0.00028993158642054987, + "loss": 0.7195, + "step": 620 + }, + { + "epoch": 0.08, + "learning_rate": 0.00028954433974441715, + "loss": 0.723, + "step": 640 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002891570930682845, + "loss": 0.7337, + "step": 660 + }, + { + "epoch": 0.09, + "learning_rate": 0.00028876984639215176, + "loss": 0.7249, + "step": 680 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002883825997160191, + "loss": 0.7374, + "step": 700 + }, + { + "epoch": 0.09, + "learning_rate": 0.00028799535303988637, + "loss": 0.7144, + "step": 720 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002876081063637537, + "loss": 0.7213, + "step": 740 + }, + { + "epoch": 0.1, + "learning_rate": 0.000287220859687621, + "loss": 0.7215, + "step": 760 + }, + { + "epoch": 0.1, + "learning_rate": 0.00028683361301148826, + "loss": 0.7275, + "step": 780 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002864463663353556, + "loss": 0.7247, + "step": 800 + }, + { + "epoch": 0.1, + "eval_loss": 0.730565071105957, + "eval_runtime": 178.4749, + "eval_samples_per_second": 11.206, + "eval_steps_per_second": 1.401, + "step": 800 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002860591196592229, + "loss": 0.717, + "step": 820 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002856718729830902, + "loss": 0.7263, + "step": 840 + }, + { + "epoch": 0.11, + "learning_rate": 0.00028528462630695754, + "loss": 0.7188, + "step": 860 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002848973796308248, + "loss": 0.7241, + "step": 880 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002845101329546921, + "loss": 0.7257, + "step": 900 + }, + { + "epoch": 0.12, + "learning_rate": 0.00028412288627855943, + "loss": 0.7315, + "step": 920 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002837356396024267, + "loss": 0.7239, + "step": 940 + }, + { + "epoch": 0.12, + "learning_rate": 0.00028334839292629404, + "loss": 0.7219, + "step": 960 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002829611462501613, + "loss": 0.7257, + "step": 980 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002825738995740286, + "loss": 0.7287, + "step": 1000 + }, + { + "epoch": 0.13, + "eval_loss": 0.7289888858795166, + "eval_runtime": 178.5088, + "eval_samples_per_second": 11.204, + "eval_steps_per_second": 1.4, + "step": 1000 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028218665289789593, + "loss": 0.7173, + "step": 1020 + }, + { + "epoch": 0.13, + "learning_rate": 0.00028179940622176326, + "loss": 0.7185, + "step": 1040 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028141215954563054, + "loss": 0.7204, + "step": 1060 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002810249128694979, + "loss": 0.7301, + "step": 1080 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028063766619336515, + "loss": 0.7254, + "step": 1100 + }, + { + "epoch": 0.14, + "learning_rate": 0.00028025041951723243, + "loss": 0.7212, + "step": 1120 + }, + { + "epoch": 0.15, + "learning_rate": 0.00027986317284109976, + "loss": 0.7265, + "step": 1140 + }, + { + "epoch": 0.15, + "learning_rate": 0.00027947592616496704, + "loss": 0.7201, + "step": 1160 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002790886794888344, + "loss": 0.7273, + "step": 1180 + }, + { + "epoch": 0.15, + "learning_rate": 0.00027870143281270165, + "loss": 0.72, + "step": 1200 + }, + { + "epoch": 0.15, + "eval_loss": 0.727615237236023, + "eval_runtime": 178.7045, + "eval_samples_per_second": 11.192, + "eval_steps_per_second": 1.399, + "step": 1200 + }, + { + "epoch": 0.16, + "learning_rate": 0.000278314186136569, + "loss": 0.7307, + "step": 1220 + }, + { + "epoch": 0.16, + "learning_rate": 0.00027792693946043627, + "loss": 0.7164, + "step": 1240 + }, + { + "epoch": 0.16, + "learning_rate": 0.00027753969278430354, + "loss": 0.7163, + "step": 1260 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002771524461081709, + "loss": 0.7127, + "step": 1280 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002767651994320382, + "loss": 0.7123, + "step": 1300 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002763779527559055, + "loss": 0.728, + "step": 1320 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002759907060797728, + "loss": 0.7263, + "step": 1340 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002756034594036401, + "loss": 0.7188, + "step": 1360 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002752162127275074, + "loss": 0.7142, + "step": 1380 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002748289660513747, + "loss": 0.7215, + "step": 1400 + }, + { + "epoch": 0.18, + "eval_loss": 0.7257346510887146, + "eval_runtime": 178.9403, + "eval_samples_per_second": 11.177, + "eval_steps_per_second": 1.397, + "step": 1400 + }, + { + "epoch": 0.18, + "learning_rate": 0.000274441719375242, + "loss": 0.7262, + "step": 1420 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002740544726991093, + "loss": 0.7185, + "step": 1440 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002736672260229766, + "loss": 0.7093, + "step": 1460 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002732799793468439, + "loss": 0.7183, + "step": 1480 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002728927326707112, + "loss": 0.7006, + "step": 1500 + }, + { + "epoch": 0.19, + "learning_rate": 0.00027250548599457855, + "loss": 0.7168, + "step": 1520 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002721182393184458, + "loss": 0.7225, + "step": 1540 + }, + { + "epoch": 0.2, + "learning_rate": 0.00027173099264231316, + "loss": 0.7248, + "step": 1560 + }, + { + "epoch": 0.2, + "learning_rate": 0.00027134374596618044, + "loss": 0.7215, + "step": 1580 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002709564992900477, + "loss": 0.7179, + "step": 1600 + }, + { + "epoch": 0.21, + "eval_loss": 0.7245064377784729, + "eval_runtime": 177.8967, + "eval_samples_per_second": 11.242, + "eval_steps_per_second": 1.405, + "step": 1600 + }, + { + "epoch": 0.21, + "learning_rate": 0.00027056925261391505, + "loss": 0.7208, + "step": 1620 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002701820059377823, + "loss": 0.7213, + "step": 1640 + }, + { + "epoch": 0.21, + "learning_rate": 0.00026979475926164966, + "loss": 0.7221, + "step": 1660 + }, + { + "epoch": 0.22, + "learning_rate": 0.00026940751258551694, + "loss": 0.7207, + "step": 1680 + }, + { + "epoch": 0.22, + "learning_rate": 0.00026902026590938427, + "loss": 0.7218, + "step": 1700 + }, + { + "epoch": 0.22, + "learning_rate": 0.00026863301923325155, + "loss": 0.718, + "step": 1720 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002682457725571189, + "loss": 0.7235, + "step": 1740 + }, + { + "epoch": 0.23, + "learning_rate": 0.00026785852588098616, + "loss": 0.7138, + "step": 1760 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002674712792048535, + "loss": 0.7178, + "step": 1780 + }, + { + "epoch": 0.23, + "learning_rate": 0.00026708403252872077, + "loss": 0.7127, + "step": 1800 + }, + { + "epoch": 0.23, + "eval_loss": 0.7234225869178772, + "eval_runtime": 177.3731, + "eval_samples_per_second": 11.276, + "eval_steps_per_second": 1.409, + "step": 1800 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002666967858525881, + "loss": 0.7214, + "step": 1820 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002663095391764554, + "loss": 0.7164, + "step": 1840 + }, + { + "epoch": 0.24, + "learning_rate": 0.00026592229250032266, + "loss": 0.7132, + "step": 1860 + }, + { + "epoch": 0.24, + "learning_rate": 0.00026553504582419, + "loss": 0.718, + "step": 1880 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002651477991480573, + "loss": 0.7088, + "step": 1900 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002647605524719246, + "loss": 0.7145, + "step": 1920 + }, + { + "epoch": 0.25, + "learning_rate": 0.00026437330579579194, + "loss": 0.7166, + "step": 1940 + }, + { + "epoch": 0.25, + "learning_rate": 0.00026398605911965916, + "loss": 0.7141, + "step": 1960 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002635988124435265, + "loss": 0.708, + "step": 1980 + }, + { + "epoch": 0.26, + "learning_rate": 0.00026321156576739383, + "loss": 0.7162, + "step": 2000 + }, + { + "epoch": 0.26, + "eval_loss": 0.7224385142326355, + "eval_runtime": 177.124, + "eval_samples_per_second": 11.292, + "eval_steps_per_second": 1.411, + "step": 2000 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002628243190912611, + "loss": 0.7091, + "step": 2020 + }, + { + "epoch": 0.26, + "learning_rate": 0.00026243707241512844, + "loss": 0.7133, + "step": 2040 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002620498257389957, + "loss": 0.717, + "step": 2060 + }, + { + "epoch": 0.27, + "learning_rate": 0.000261662579062863, + "loss": 0.7246, + "step": 2080 + }, + { + "epoch": 0.27, + "learning_rate": 0.00026127533238673033, + "loss": 0.7169, + "step": 2100 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002608880857105976, + "loss": 0.7121, + "step": 2120 + }, + { + "epoch": 0.27, + "learning_rate": 0.00026050083903446494, + "loss": 0.719, + "step": 2140 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002601135923583322, + "loss": 0.7236, + "step": 2160 + }, + { + "epoch": 0.28, + "learning_rate": 0.00025972634568219955, + "loss": 0.7154, + "step": 2180 + }, + { + "epoch": 0.28, + "learning_rate": 0.00025933909900606683, + "loss": 0.7148, + "step": 2200 + }, + { + "epoch": 0.28, + "eval_loss": 0.7211937308311462, + "eval_runtime": 177.2195, + "eval_samples_per_second": 11.285, + "eval_steps_per_second": 1.411, + "step": 2200 + }, + { + "epoch": 0.28, + "learning_rate": 0.00025895185232993416, + "loss": 0.7193, + "step": 2220 + }, + { + "epoch": 0.29, + "learning_rate": 0.00025856460565380144, + "loss": 0.7046, + "step": 2240 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002581773589776688, + "loss": 0.7231, + "step": 2260 + }, + { + "epoch": 0.29, + "learning_rate": 0.00025779011230153605, + "loss": 0.719, + "step": 2280 + }, + { + "epoch": 0.29, + "learning_rate": 0.00025740286562540333, + "loss": 0.707, + "step": 2300 + }, + { + "epoch": 0.3, + "learning_rate": 0.00025701561894927067, + "loss": 0.7176, + "step": 2320 + }, + { + "epoch": 0.3, + "learning_rate": 0.00025662837227313794, + "loss": 0.7205, + "step": 2340 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002562411255970053, + "loss": 0.7162, + "step": 2360 + }, + { + "epoch": 0.31, + "learning_rate": 0.00025585387892087256, + "loss": 0.7223, + "step": 2380 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002554666322447399, + "loss": 0.7158, + "step": 2400 + }, + { + "epoch": 0.31, + "eval_loss": 0.7201904654502869, + "eval_runtime": 177.1891, + "eval_samples_per_second": 11.287, + "eval_steps_per_second": 1.411, + "step": 2400 + }, + { + "epoch": 0.31, + "learning_rate": 0.00025507938556860717, + "loss": 0.7155, + "step": 2420 + }, + { + "epoch": 0.31, + "learning_rate": 0.00025469213889247445, + "loss": 0.7141, + "step": 2440 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002543048922163418, + "loss": 0.7206, + "step": 2460 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002539176455402091, + "loss": 0.7147, + "step": 2480 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002535303988640764, + "loss": 0.7116, + "step": 2500 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002531431521879437, + "loss": 0.7161, + "step": 2520 + }, + { + "epoch": 0.33, + "learning_rate": 0.000252755905511811, + "loss": 0.7173, + "step": 2540 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002523686588356783, + "loss": 0.7159, + "step": 2560 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002519814121595456, + "loss": 0.7182, + "step": 2580 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002515941654834129, + "loss": 0.723, + "step": 2600 + }, + { + "epoch": 0.33, + "eval_loss": 0.7193037867546082, + "eval_runtime": 177.0591, + "eval_samples_per_second": 11.296, + "eval_steps_per_second": 1.412, + "step": 2600 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002512069188072802, + "loss": 0.717, + "step": 2620 + }, + { + "epoch": 0.34, + "learning_rate": 0.00025081967213114756, + "loss": 0.7164, + "step": 2640 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002504324254550148, + "loss": 0.7173, + "step": 2660 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002500451787788821, + "loss": 0.7154, + "step": 2680 + }, + { + "epoch": 0.35, + "learning_rate": 0.00024965793210274945, + "loss": 0.7101, + "step": 2700 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002492706854266167, + "loss": 0.7088, + "step": 2720 + }, + { + "epoch": 0.35, + "learning_rate": 0.00024888343875048406, + "loss": 0.715, + "step": 2740 + }, + { + "epoch": 0.35, + "learning_rate": 0.00024849619207435134, + "loss": 0.7165, + "step": 2760 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002481089453982186, + "loss": 0.7094, + "step": 2780 + }, + { + "epoch": 0.36, + "learning_rate": 0.00024772169872208595, + "loss": 0.7116, + "step": 2800 + }, + { + "epoch": 0.36, + "eval_loss": 0.7178795337677002, + "eval_runtime": 177.1788, + "eval_samples_per_second": 11.288, + "eval_steps_per_second": 1.411, + "step": 2800 + }, + { + "epoch": 0.36, + "learning_rate": 0.00024733445204595323, + "loss": 0.7121, + "step": 2820 + }, + { + "epoch": 0.36, + "learning_rate": 0.00024694720536982056, + "loss": 0.7197, + "step": 2840 + }, + { + "epoch": 0.37, + "learning_rate": 0.00024655995869368784, + "loss": 0.7133, + "step": 2860 + }, + { + "epoch": 0.37, + "learning_rate": 0.00024617271201755517, + "loss": 0.7124, + "step": 2880 + }, + { + "epoch": 0.37, + "learning_rate": 0.00024578546534142245, + "loss": 0.7081, + "step": 2900 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002453982186652898, + "loss": 0.7105, + "step": 2920 + }, + { + "epoch": 0.38, + "learning_rate": 0.00024501097198915706, + "loss": 0.7005, + "step": 2940 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002446237253130244, + "loss": 0.7111, + "step": 2960 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002442364786368917, + "loss": 0.7035, + "step": 2980 + }, + { + "epoch": 0.38, + "learning_rate": 0.000243849231960759, + "loss": 0.7125, + "step": 3000 + }, + { + "epoch": 0.38, + "eval_loss": 0.7173203229904175, + "eval_runtime": 176.9402, + "eval_samples_per_second": 11.303, + "eval_steps_per_second": 1.413, + "step": 3000 + }, + { + "epoch": 0.39, + "learning_rate": 0.00024346198528462626, + "loss": 0.7143, + "step": 3020 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002430747386084936, + "loss": 0.7121, + "step": 3040 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002426874919323609, + "loss": 0.7093, + "step": 3060 + }, + { + "epoch": 0.4, + "learning_rate": 0.0002423002452562282, + "loss": 0.711, + "step": 3080 + }, + { + "epoch": 0.4, + "learning_rate": 0.0002419129985800955, + "loss": 0.7239, + "step": 3100 + }, + { + "epoch": 0.4, + "learning_rate": 0.00024152575190396281, + "loss": 0.7183, + "step": 3120 + }, + { + "epoch": 0.4, + "learning_rate": 0.0002411385052278301, + "loss": 0.7056, + "step": 3140 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002407512585516974, + "loss": 0.7109, + "step": 3160 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002403640118755647, + "loss": 0.7183, + "step": 3180 + }, + { + "epoch": 0.41, + "learning_rate": 0.000239976765199432, + "loss": 0.7135, + "step": 3200 + }, + { + "epoch": 0.41, + "eval_loss": 0.7156603932380676, + "eval_runtime": 178.5954, + "eval_samples_per_second": 11.198, + "eval_steps_per_second": 1.4, + "step": 3200 + }, + { + "epoch": 0.41, + "learning_rate": 0.00023958951852329932, + "loss": 0.7022, + "step": 3220 + }, + { + "epoch": 0.42, + "learning_rate": 0.00023920227184716665, + "loss": 0.7155, + "step": 3240 + }, + { + "epoch": 0.42, + "learning_rate": 0.0002388150251710339, + "loss": 0.7072, + "step": 3260 + }, + { + "epoch": 0.42, + "learning_rate": 0.00023842777849490123, + "loss": 0.7151, + "step": 3280 + }, + { + "epoch": 0.42, + "learning_rate": 0.00023804053181876854, + "loss": 0.7044, + "step": 3300 + }, + { + "epoch": 0.43, + "learning_rate": 0.00023765328514263584, + "loss": 0.7141, + "step": 3320 + }, + { + "epoch": 0.43, + "learning_rate": 0.00023726603846650315, + "loss": 0.7033, + "step": 3340 + }, + { + "epoch": 0.43, + "learning_rate": 0.00023687879179037046, + "loss": 0.7137, + "step": 3360 + }, + { + "epoch": 0.43, + "learning_rate": 0.00023649154511423773, + "loss": 0.7042, + "step": 3380 + }, + { + "epoch": 0.44, + "learning_rate": 0.00023610429843810504, + "loss": 0.7156, + "step": 3400 + }, + { + "epoch": 0.44, + "eval_loss": 0.7149476408958435, + "eval_runtime": 178.5798, + "eval_samples_per_second": 11.199, + "eval_steps_per_second": 1.4, + "step": 3400 + }, + { + "epoch": 0.44, + "learning_rate": 0.00023571705176197235, + "loss": 0.7045, + "step": 3420 + }, + { + "epoch": 0.44, + "learning_rate": 0.00023532980508583965, + "loss": 0.7021, + "step": 3440 + }, + { + "epoch": 0.44, + "learning_rate": 0.00023494255840970698, + "loss": 0.7092, + "step": 3460 + }, + { + "epoch": 0.45, + "learning_rate": 0.0002345553117335743, + "loss": 0.7213, + "step": 3480 + }, + { + "epoch": 0.45, + "learning_rate": 0.00023416806505744157, + "loss": 0.7046, + "step": 3500 + }, + { + "epoch": 0.45, + "learning_rate": 0.00023378081838130887, + "loss": 0.7076, + "step": 3520 + }, + { + "epoch": 0.45, + "learning_rate": 0.00023339357170517618, + "loss": 0.7107, + "step": 3540 + }, + { + "epoch": 0.46, + "learning_rate": 0.00023300632502904349, + "loss": 0.7087, + "step": 3560 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002326190783529108, + "loss": 0.7005, + "step": 3580 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002322318316767781, + "loss": 0.7064, + "step": 3600 + }, + { + "epoch": 0.46, + "eval_loss": 0.7140311002731323, + "eval_runtime": 179.0415, + "eval_samples_per_second": 11.171, + "eval_steps_per_second": 1.396, + "step": 3600 + }, + { + "epoch": 0.46, + "learning_rate": 0.00023184458500064538, + "loss": 0.714, + "step": 3620 + }, + { + "epoch": 0.47, + "learning_rate": 0.00023145733832451268, + "loss": 0.7102, + "step": 3640 + }, + { + "epoch": 0.47, + "learning_rate": 0.00023107009164838, + "loss": 0.7202, + "step": 3660 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002306828449722473, + "loss": 0.7016, + "step": 3680 + }, + { + "epoch": 0.47, + "learning_rate": 0.00023029559829611463, + "loss": 0.7126, + "step": 3700 + }, + { + "epoch": 0.48, + "learning_rate": 0.00022990835161998193, + "loss": 0.7055, + "step": 3720 + }, + { + "epoch": 0.48, + "learning_rate": 0.0002295211049438492, + "loss": 0.7118, + "step": 3740 + }, + { + "epoch": 0.48, + "learning_rate": 0.00022913385826771652, + "loss": 0.707, + "step": 3760 + }, + { + "epoch": 0.48, + "learning_rate": 0.00022874661159158382, + "loss": 0.7119, + "step": 3780 + }, + { + "epoch": 0.49, + "learning_rate": 0.00022835936491545113, + "loss": 0.7023, + "step": 3800 + }, + { + "epoch": 0.49, + "eval_loss": 0.7134947776794434, + "eval_runtime": 179.7115, + "eval_samples_per_second": 11.129, + "eval_steps_per_second": 1.391, + "step": 3800 + }, + { + "epoch": 0.49, + "learning_rate": 0.00022797211823931843, + "loss": 0.6967, + "step": 3820 + }, + { + "epoch": 0.49, + "learning_rate": 0.00022758487156318574, + "loss": 0.7172, + "step": 3840 + }, + { + "epoch": 0.5, + "learning_rate": 0.00022719762488705302, + "loss": 0.7137, + "step": 3860 + }, + { + "epoch": 0.5, + "learning_rate": 0.00022681037821092032, + "loss": 0.7164, + "step": 3880 + }, + { + "epoch": 0.5, + "learning_rate": 0.00022642313153478763, + "loss": 0.7099, + "step": 3900 + }, + { + "epoch": 0.5, + "learning_rate": 0.00022603588485865494, + "loss": 0.7119, + "step": 3920 + }, + { + "epoch": 0.51, + "learning_rate": 0.00022564863818252227, + "loss": 0.7098, + "step": 3940 + }, + { + "epoch": 0.51, + "learning_rate": 0.00022526139150638957, + "loss": 0.7067, + "step": 3960 + }, + { + "epoch": 0.51, + "learning_rate": 0.00022487414483025685, + "loss": 0.705, + "step": 3980 + }, + { + "epoch": 0.51, + "learning_rate": 0.00022448689815412416, + "loss": 0.7125, + "step": 4000 + }, + { + "epoch": 0.51, + "eval_loss": 0.7128713130950928, + "eval_runtime": 178.8128, + "eval_samples_per_second": 11.185, + "eval_steps_per_second": 1.398, + "step": 4000 + }, + { + "epoch": 0.52, + "learning_rate": 0.00022409965147799146, + "loss": 0.7098, + "step": 4020 + }, + { + "epoch": 0.52, + "learning_rate": 0.00022371240480185877, + "loss": 0.7081, + "step": 4040 + }, + { + "epoch": 0.52, + "learning_rate": 0.00022332515812572608, + "loss": 0.6982, + "step": 4060 + }, + { + "epoch": 0.52, + "learning_rate": 0.00022293791144959338, + "loss": 0.7122, + "step": 4080 + }, + { + "epoch": 0.53, + "learning_rate": 0.00022255066477346066, + "loss": 0.6974, + "step": 4100 + }, + { + "epoch": 0.53, + "learning_rate": 0.00022216341809732797, + "loss": 0.7018, + "step": 4120 + }, + { + "epoch": 0.53, + "learning_rate": 0.00022177617142119527, + "loss": 0.7075, + "step": 4140 + }, + { + "epoch": 0.53, + "learning_rate": 0.00022138892474506258, + "loss": 0.7013, + "step": 4160 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002210016780689299, + "loss": 0.7103, + "step": 4180 + }, + { + "epoch": 0.54, + "learning_rate": 0.00022061443139279722, + "loss": 0.7005, + "step": 4200 + }, + { + "epoch": 0.54, + "eval_loss": 0.7116231918334961, + "eval_runtime": 179.0816, + "eval_samples_per_second": 11.168, + "eval_steps_per_second": 1.396, + "step": 4200 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002202271847166645, + "loss": 0.7071, + "step": 4220 + }, + { + "epoch": 0.54, + "learning_rate": 0.0002198399380405318, + "loss": 0.7114, + "step": 4240 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002194526913643991, + "loss": 0.705, + "step": 4260 + }, + { + "epoch": 0.55, + "learning_rate": 0.0002190654446882664, + "loss": 0.706, + "step": 4280 + }, + { + "epoch": 0.55, + "learning_rate": 0.00021867819801213372, + "loss": 0.7016, + "step": 4300 + }, + { + "epoch": 0.55, + "learning_rate": 0.00021829095133600102, + "loss": 0.7084, + "step": 4320 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002179037046598683, + "loss": 0.7187, + "step": 4340 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002175164579837356, + "loss": 0.7044, + "step": 4360 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002171292113076029, + "loss": 0.7068, + "step": 4380 + }, + { + "epoch": 0.56, + "learning_rate": 0.00021674196463147025, + "loss": 0.7082, + "step": 4400 + }, + { + "epoch": 0.56, + "eval_loss": 0.7112064957618713, + "eval_runtime": 178.8847, + "eval_samples_per_second": 11.18, + "eval_steps_per_second": 1.398, + "step": 4400 + }, + { + "epoch": 0.57, + "learning_rate": 0.00021635471795533755, + "loss": 0.705, + "step": 4420 + }, + { + "epoch": 0.57, + "learning_rate": 0.00021596747127920486, + "loss": 0.704, + "step": 4440 + }, + { + "epoch": 0.57, + "learning_rate": 0.00021558022460307214, + "loss": 0.7071, + "step": 4460 + }, + { + "epoch": 0.57, + "learning_rate": 0.00021519297792693944, + "loss": 0.708, + "step": 4480 + }, + { + "epoch": 0.58, + "learning_rate": 0.00021480573125080675, + "loss": 0.705, + "step": 4500 + }, + { + "epoch": 0.58, + "learning_rate": 0.00021441848457467405, + "loss": 0.7061, + "step": 4520 + }, + { + "epoch": 0.58, + "learning_rate": 0.00021403123789854136, + "loss": 0.7074, + "step": 4540 + }, + { + "epoch": 0.58, + "learning_rate": 0.00021364399122240864, + "loss": 0.7148, + "step": 4560 + }, + { + "epoch": 0.59, + "learning_rate": 0.00021325674454627594, + "loss": 0.7091, + "step": 4580 + }, + { + "epoch": 0.59, + "learning_rate": 0.00021286949787014325, + "loss": 0.7103, + "step": 4600 + }, + { + "epoch": 0.59, + "eval_loss": 0.7104864716529846, + "eval_runtime": 178.4457, + "eval_samples_per_second": 11.208, + "eval_steps_per_second": 1.401, + "step": 4600 + }, + { + "epoch": 0.59, + "learning_rate": 0.00021248225119401055, + "loss": 0.706, + "step": 4620 + }, + { + "epoch": 0.6, + "learning_rate": 0.0002120950045178779, + "loss": 0.6966, + "step": 4640 + }, + { + "epoch": 0.6, + "learning_rate": 0.0002117077578417452, + "loss": 0.6991, + "step": 4660 + }, + { + "epoch": 0.6, + "learning_rate": 0.00021132051116561247, + "loss": 0.7039, + "step": 4680 + }, + { + "epoch": 0.6, + "learning_rate": 0.00021093326448947978, + "loss": 0.7059, + "step": 4700 + }, + { + "epoch": 0.61, + "learning_rate": 0.00021054601781334708, + "loss": 0.7122, + "step": 4720 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002101587711372144, + "loss": 0.7099, + "step": 4740 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002097715244610817, + "loss": 0.6998, + "step": 4760 + }, + { + "epoch": 0.61, + "learning_rate": 0.000209384277784949, + "loss": 0.7048, + "step": 4780 + }, + { + "epoch": 0.62, + "learning_rate": 0.00020899703110881628, + "loss": 0.7077, + "step": 4800 + }, + { + "epoch": 0.62, + "eval_loss": 0.7102417945861816, + "eval_runtime": 178.4169, + "eval_samples_per_second": 11.21, + "eval_steps_per_second": 1.401, + "step": 4800 + }, + { + "epoch": 0.62, + "learning_rate": 0.00020860978443268359, + "loss": 0.7172, + "step": 4820 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002082225377565509, + "loss": 0.7084, + "step": 4840 + }, + { + "epoch": 0.62, + "learning_rate": 0.0002078352910804182, + "loss": 0.7058, + "step": 4860 + }, + { + "epoch": 0.63, + "learning_rate": 0.00020744804440428553, + "loss": 0.6988, + "step": 4880 + }, + { + "epoch": 0.63, + "learning_rate": 0.00020706079772815284, + "loss": 0.7008, + "step": 4900 + }, + { + "epoch": 0.63, + "learning_rate": 0.00020667355105202011, + "loss": 0.6986, + "step": 4920 + }, + { + "epoch": 0.63, + "learning_rate": 0.00020628630437588742, + "loss": 0.7042, + "step": 4940 + }, + { + "epoch": 0.64, + "learning_rate": 0.00020589905769975473, + "loss": 0.7139, + "step": 4960 + }, + { + "epoch": 0.64, + "learning_rate": 0.00020551181102362203, + "loss": 0.7094, + "step": 4980 + }, + { + "epoch": 0.64, + "learning_rate": 0.00020512456434748934, + "loss": 0.7059, + "step": 5000 + }, + { + "epoch": 0.64, + "eval_loss": 0.7092374563217163, + "eval_runtime": 177.3439, + "eval_samples_per_second": 11.278, + "eval_steps_per_second": 1.41, + "step": 5000 + }, + { + "epoch": 0.64, + "learning_rate": 0.00020473731767135664, + "loss": 0.7042, + "step": 5020 + }, + { + "epoch": 0.65, + "learning_rate": 0.00020435007099522392, + "loss": 0.6964, + "step": 5040 + }, + { + "epoch": 0.65, + "learning_rate": 0.00020396282431909123, + "loss": 0.7041, + "step": 5060 + }, + { + "epoch": 0.65, + "learning_rate": 0.00020357557764295853, + "loss": 0.6972, + "step": 5080 + }, + { + "epoch": 0.65, + "learning_rate": 0.00020318833096682587, + "loss": 0.7011, + "step": 5100 + }, + { + "epoch": 0.66, + "learning_rate": 0.00020280108429069317, + "loss": 0.7073, + "step": 5120 + }, + { + "epoch": 0.66, + "learning_rate": 0.00020241383761456048, + "loss": 0.706, + "step": 5140 + }, + { + "epoch": 0.66, + "learning_rate": 0.00020202659093842776, + "loss": 0.6949, + "step": 5160 + }, + { + "epoch": 0.66, + "learning_rate": 0.00020163934426229506, + "loss": 0.703, + "step": 5180 + }, + { + "epoch": 0.67, + "learning_rate": 0.00020125209758616237, + "loss": 0.7058, + "step": 5200 + }, + { + "epoch": 0.67, + "eval_loss": 0.7084789276123047, + "eval_runtime": 177.3051, + "eval_samples_per_second": 11.28, + "eval_steps_per_second": 1.41, + "step": 5200 + }, + { + "epoch": 0.67, + "learning_rate": 0.00020086485091002967, + "loss": 0.7032, + "step": 5220 + }, + { + "epoch": 0.67, + "learning_rate": 0.00020047760423389698, + "loss": 0.7045, + "step": 5240 + }, + { + "epoch": 0.67, + "learning_rate": 0.00020009035755776428, + "loss": 0.7069, + "step": 5260 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019970311088163156, + "loss": 0.6961, + "step": 5280 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019931586420549887, + "loss": 0.6981, + "step": 5300 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019892861752936617, + "loss": 0.701, + "step": 5320 + }, + { + "epoch": 0.68, + "learning_rate": 0.0001985413708532335, + "loss": 0.6979, + "step": 5340 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001981541241771008, + "loss": 0.6975, + "step": 5360 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019776687750096812, + "loss": 0.6923, + "step": 5380 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001973796308248354, + "loss": 0.7089, + "step": 5400 + }, + { + "epoch": 0.69, + "eval_loss": 0.7074704170227051, + "eval_runtime": 177.1889, + "eval_samples_per_second": 11.287, + "eval_steps_per_second": 1.411, + "step": 5400 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001969923841487027, + "loss": 0.6986, + "step": 5420 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019660513747257, + "loss": 0.6991, + "step": 5440 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019621789079643731, + "loss": 0.6946, + "step": 5460 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019583064412030462, + "loss": 0.7057, + "step": 5480 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019544339744417193, + "loss": 0.694, + "step": 5500 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001950561507680392, + "loss": 0.7046, + "step": 5520 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001946689040919065, + "loss": 0.6998, + "step": 5540 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019428165741577382, + "loss": 0.6995, + "step": 5560 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019389441073964115, + "loss": 0.7081, + "step": 5580 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019350716406350845, + "loss": 0.6974, + "step": 5600 + }, + { + "epoch": 0.72, + "eval_loss": 0.7067714333534241, + "eval_runtime": 177.0363, + "eval_samples_per_second": 11.297, + "eval_steps_per_second": 1.412, + "step": 5600 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019311991738737576, + "loss": 0.7019, + "step": 5620 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019273267071124304, + "loss": 0.7003, + "step": 5640 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019234542403511034, + "loss": 0.6966, + "step": 5660 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019195817735897765, + "loss": 0.7055, + "step": 5680 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019157093068284496, + "loss": 0.7069, + "step": 5700 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019118368400671226, + "loss": 0.6981, + "step": 5720 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019079643733057957, + "loss": 0.7005, + "step": 5740 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019040919065444685, + "loss": 0.7033, + "step": 5760 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019002194397831415, + "loss": 0.7009, + "step": 5780 + }, + { + "epoch": 0.74, + "learning_rate": 0.00018963469730218146, + "loss": 0.7001, + "step": 5800 + }, + { + "epoch": 0.74, + "eval_loss": 0.7066617608070374, + "eval_runtime": 177.2571, + "eval_samples_per_second": 11.283, + "eval_steps_per_second": 1.41, + "step": 5800 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001892474506260488, + "loss": 0.7048, + "step": 5820 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001888602039499161, + "loss": 0.698, + "step": 5840 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001884729572737834, + "loss": 0.7035, + "step": 5860 + }, + { + "epoch": 0.75, + "learning_rate": 0.00018808571059765068, + "loss": 0.6997, + "step": 5880 + }, + { + "epoch": 0.76, + "learning_rate": 0.00018769846392151799, + "loss": 0.7053, + "step": 5900 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001873112172453853, + "loss": 0.6951, + "step": 5920 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001869239705692526, + "loss": 0.701, + "step": 5940 + }, + { + "epoch": 0.76, + "learning_rate": 0.0001865367238931199, + "loss": 0.7032, + "step": 5960 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001861494772169872, + "loss": 0.7005, + "step": 5980 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001857622305408545, + "loss": 0.7013, + "step": 6000 + }, + { + "epoch": 0.77, + "eval_loss": 0.705744743347168, + "eval_runtime": 177.421, + "eval_samples_per_second": 11.273, + "eval_steps_per_second": 1.409, + "step": 6000 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001853749838647218, + "loss": 0.6923, + "step": 6020 + }, + { + "epoch": 0.77, + "learning_rate": 0.00018498773718858913, + "loss": 0.701, + "step": 6040 + }, + { + "epoch": 0.78, + "learning_rate": 0.00018460049051245643, + "loss": 0.7027, + "step": 6060 + }, + { + "epoch": 0.78, + "learning_rate": 0.00018421324383632374, + "loss": 0.6973, + "step": 6080 + }, + { + "epoch": 0.78, + "learning_rate": 0.00018382599716019104, + "loss": 0.6982, + "step": 6100 + }, + { + "epoch": 0.78, + "learning_rate": 0.00018343875048405832, + "loss": 0.7024, + "step": 6120 + }, + { + "epoch": 0.79, + "learning_rate": 0.00018305150380792563, + "loss": 0.6996, + "step": 6140 + }, + { + "epoch": 0.79, + "learning_rate": 0.00018266425713179293, + "loss": 0.7063, + "step": 6160 + }, + { + "epoch": 0.79, + "learning_rate": 0.00018227701045566024, + "loss": 0.7005, + "step": 6180 + }, + { + "epoch": 0.8, + "learning_rate": 0.00018188976377952755, + "loss": 0.6913, + "step": 6200 + }, + { + "epoch": 0.8, + "eval_loss": 0.7049428224563599, + "eval_runtime": 180.0706, + "eval_samples_per_second": 11.107, + "eval_steps_per_second": 1.388, + "step": 6200 + }, + { + "epoch": 0.8, + "learning_rate": 0.00018150251710339485, + "loss": 0.6998, + "step": 6220 + }, + { + "epoch": 0.8, + "learning_rate": 0.00018111527042726213, + "loss": 0.7044, + "step": 6240 + }, + { + "epoch": 0.8, + "learning_rate": 0.00018072802375112944, + "loss": 0.6988, + "step": 6260 + }, + { + "epoch": 0.81, + "learning_rate": 0.00018034077707499677, + "loss": 0.6979, + "step": 6280 + }, + { + "epoch": 0.81, + "learning_rate": 0.00017995353039886407, + "loss": 0.7048, + "step": 6300 + }, + { + "epoch": 0.81, + "learning_rate": 0.00017956628372273138, + "loss": 0.6941, + "step": 6320 + }, + { + "epoch": 0.81, + "learning_rate": 0.00017917903704659869, + "loss": 0.6963, + "step": 6340 + }, + { + "epoch": 0.82, + "learning_rate": 0.00017879179037046596, + "loss": 0.6954, + "step": 6360 + }, + { + "epoch": 0.82, + "learning_rate": 0.00017840454369433327, + "loss": 0.6953, + "step": 6380 + }, + { + "epoch": 0.82, + "learning_rate": 0.00017801729701820058, + "loss": 0.693, + "step": 6400 + }, + { + "epoch": 0.82, + "eval_loss": 0.7036707997322083, + "eval_runtime": 178.6236, + "eval_samples_per_second": 11.197, + "eval_steps_per_second": 1.4, + "step": 6400 + }, + { + "epoch": 0.82, + "learning_rate": 0.00017763005034206788, + "loss": 0.7041, + "step": 6420 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001772428036659352, + "loss": 0.6908, + "step": 6440 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001768555569898025, + "loss": 0.6961, + "step": 6460 + }, + { + "epoch": 0.83, + "learning_rate": 0.00017646831031366977, + "loss": 0.6967, + "step": 6480 + }, + { + "epoch": 0.83, + "learning_rate": 0.00017608106363753708, + "loss": 0.7019, + "step": 6500 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001756938169614044, + "loss": 0.7036, + "step": 6520 + }, + { + "epoch": 0.84, + "learning_rate": 0.00017530657028527172, + "loss": 0.6941, + "step": 6540 + }, + { + "epoch": 0.84, + "learning_rate": 0.00017491932360913902, + "loss": 0.6995, + "step": 6560 + }, + { + "epoch": 0.84, + "learning_rate": 0.00017453207693300633, + "loss": 0.6962, + "step": 6580 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001741448302568736, + "loss": 0.6963, + "step": 6600 + }, + { + "epoch": 0.85, + "eval_loss": 0.7036789655685425, + "eval_runtime": 177.1424, + "eval_samples_per_second": 11.29, + "eval_steps_per_second": 1.411, + "step": 6600 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001737575835807409, + "loss": 0.7009, + "step": 6620 + }, + { + "epoch": 0.85, + "learning_rate": 0.00017337033690460822, + "loss": 0.6964, + "step": 6640 + }, + { + "epoch": 0.85, + "learning_rate": 0.00017298309022847552, + "loss": 0.6974, + "step": 6660 + }, + { + "epoch": 0.86, + "learning_rate": 0.00017259584355234283, + "loss": 0.6964, + "step": 6680 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001722085968762101, + "loss": 0.6966, + "step": 6700 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001718213502000774, + "loss": 0.7016, + "step": 6720 + }, + { + "epoch": 0.86, + "learning_rate": 0.00017143410352394475, + "loss": 0.6996, + "step": 6740 + }, + { + "epoch": 0.87, + "learning_rate": 0.00017104685684781205, + "loss": 0.6985, + "step": 6760 + }, + { + "epoch": 0.87, + "learning_rate": 0.00017065961017167936, + "loss": 0.7, + "step": 6780 + }, + { + "epoch": 0.87, + "learning_rate": 0.00017027236349554666, + "loss": 0.6846, + "step": 6800 + }, + { + "epoch": 0.87, + "eval_loss": 0.7028091549873352, + "eval_runtime": 177.0434, + "eval_samples_per_second": 11.297, + "eval_steps_per_second": 1.412, + "step": 6800 + }, + { + "epoch": 0.87, + "learning_rate": 0.00016988511681941394, + "loss": 0.6994, + "step": 6820 + }, + { + "epoch": 0.88, + "learning_rate": 0.00016949787014328125, + "loss": 0.6995, + "step": 6840 + }, + { + "epoch": 0.88, + "learning_rate": 0.00016911062346714855, + "loss": 0.6949, + "step": 6860 + }, + { + "epoch": 0.88, + "learning_rate": 0.00016872337679101586, + "loss": 0.6903, + "step": 6880 + }, + { + "epoch": 0.88, + "learning_rate": 0.00016833613011488316, + "loss": 0.6983, + "step": 6900 + }, + { + "epoch": 0.89, + "learning_rate": 0.00016794888343875047, + "loss": 0.6979, + "step": 6920 + }, + { + "epoch": 0.89, + "learning_rate": 0.00016756163676261775, + "loss": 0.6963, + "step": 6940 + }, + { + "epoch": 0.89, + "learning_rate": 0.00016717439008648505, + "loss": 0.6963, + "step": 6960 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001667871434103524, + "loss": 0.7109, + "step": 6980 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001663998967342197, + "loss": 0.6996, + "step": 7000 + }, + { + "epoch": 0.9, + "eval_loss": 0.7027884721755981, + "eval_runtime": 177.0464, + "eval_samples_per_second": 11.296, + "eval_steps_per_second": 1.412, + "step": 7000 + }, + { + "epoch": 0.9, + "learning_rate": 0.000166012650058087, + "loss": 0.6953, + "step": 7020 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001656254033819543, + "loss": 0.701, + "step": 7040 + }, + { + "epoch": 0.91, + "learning_rate": 0.00016523815670582158, + "loss": 0.6941, + "step": 7060 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001648509100296889, + "loss": 0.6946, + "step": 7080 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001644636633535562, + "loss": 0.6905, + "step": 7100 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001640764166774235, + "loss": 0.6938, + "step": 7120 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001636891700012908, + "loss": 0.6964, + "step": 7140 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001633019233251581, + "loss": 0.6979, + "step": 7160 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001629146766490254, + "loss": 0.6909, + "step": 7180 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001625274299728927, + "loss": 0.7017, + "step": 7200 + }, + { + "epoch": 0.92, + "eval_loss": 0.7013801336288452, + "eval_runtime": 177.3532, + "eval_samples_per_second": 11.277, + "eval_steps_per_second": 1.41, + "step": 7200 + }, + { + "epoch": 0.93, + "learning_rate": 0.00016214018329676003, + "loss": 0.6986, + "step": 7220 + }, + { + "epoch": 0.93, + "learning_rate": 0.00016175293662062734, + "loss": 0.6985, + "step": 7240 + }, + { + "epoch": 0.93, + "learning_rate": 0.00016136568994449464, + "loss": 0.694, + "step": 7260 + }, + { + "epoch": 0.93, + "learning_rate": 0.00016097844326836195, + "loss": 0.7124, + "step": 7280 + }, + { + "epoch": 0.94, + "learning_rate": 0.00016059119659222923, + "loss": 0.6936, + "step": 7300 + }, + { + "epoch": 0.94, + "learning_rate": 0.00016020394991609653, + "loss": 0.6904, + "step": 7320 + }, + { + "epoch": 0.94, + "learning_rate": 0.00015981670323996384, + "loss": 0.6994, + "step": 7340 + }, + { + "epoch": 0.94, + "learning_rate": 0.00015942945656383114, + "loss": 0.7046, + "step": 7360 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015904220988769845, + "loss": 0.6999, + "step": 7380 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015865496321156575, + "loss": 0.6952, + "step": 7400 + }, + { + "epoch": 0.95, + "eval_loss": 0.7015686631202698, + "eval_runtime": 178.4607, + "eval_samples_per_second": 11.207, + "eval_steps_per_second": 1.401, + "step": 7400 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015826771653543303, + "loss": 0.7001, + "step": 7420 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015788046985930034, + "loss": 0.6948, + "step": 7440 + }, + { + "epoch": 0.96, + "learning_rate": 0.00015749322318316767, + "loss": 0.702, + "step": 7460 + }, + { + "epoch": 0.96, + "learning_rate": 0.00015710597650703498, + "loss": 0.695, + "step": 7480 + }, + { + "epoch": 0.96, + "learning_rate": 0.00015671872983090228, + "loss": 0.7053, + "step": 7500 + }, + { + "epoch": 0.96, + "learning_rate": 0.0001563314831547696, + "loss": 0.7041, + "step": 7520 + }, + { + "epoch": 0.97, + "learning_rate": 0.00015594423647863687, + "loss": 0.6975, + "step": 7540 + }, + { + "epoch": 0.97, + "learning_rate": 0.00015555698980250417, + "loss": 0.6914, + "step": 7560 + }, + { + "epoch": 0.97, + "learning_rate": 0.00015516974312637148, + "loss": 0.6961, + "step": 7580 + }, + { + "epoch": 0.97, + "learning_rate": 0.00015478249645023878, + "loss": 0.6968, + "step": 7600 + }, + { + "epoch": 0.97, + "eval_loss": 0.7004283666610718, + "eval_runtime": 178.8057, + "eval_samples_per_second": 11.185, + "eval_steps_per_second": 1.398, + "step": 7600 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001543952497741061, + "loss": 0.6847, + "step": 7620 + }, + { + "epoch": 0.98, + "learning_rate": 0.00015400800309797342, + "loss": 0.6954, + "step": 7640 + }, + { + "epoch": 0.98, + "learning_rate": 0.00015362075642184067, + "loss": 0.6967, + "step": 7660 + }, + { + "epoch": 0.98, + "learning_rate": 0.000153233509745708, + "loss": 0.6941, + "step": 7680 + }, + { + "epoch": 0.99, + "learning_rate": 0.0001528462630695753, + "loss": 0.6928, + "step": 7700 + }, + { + "epoch": 0.99, + "learning_rate": 0.00015245901639344262, + "loss": 0.7035, + "step": 7720 + }, + { + "epoch": 0.99, + "learning_rate": 0.00015207176971730992, + "loss": 0.6918, + "step": 7740 + }, + { + "epoch": 1.0, + "learning_rate": 0.00015168452304117723, + "loss": 0.6996, + "step": 7760 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001512972763650445, + "loss": 0.6975, + "step": 7780 + }, + { + "epoch": 1.0, + "learning_rate": 0.00015091002968891181, + "loss": 0.7022, + "step": 7800 + }, + { + "epoch": 1.0, + "eval_loss": 0.6998333930969238, + "eval_runtime": 178.2222, + "eval_samples_per_second": 11.222, + "eval_steps_per_second": 1.403, + "step": 7800 + }, + { + "epoch": 1.0, + "learning_rate": 0.00015052278301277912, + "loss": 0.6854, + "step": 7820 + }, + { + "epoch": 1.01, + "learning_rate": 0.00015013553633664643, + "loss": 0.6911, + "step": 7840 + }, + { + "epoch": 1.01, + "learning_rate": 0.00014974828966051373, + "loss": 0.6846, + "step": 7860 + }, + { + "epoch": 1.01, + "learning_rate": 0.00014936104298438104, + "loss": 0.6859, + "step": 7880 + }, + { + "epoch": 1.01, + "learning_rate": 0.00014897379630824834, + "loss": 0.6802, + "step": 7900 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014858654963211565, + "loss": 0.6891, + "step": 7920 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014819930295598295, + "loss": 0.6833, + "step": 7940 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014781205627985026, + "loss": 0.6866, + "step": 7960 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014742480960371754, + "loss": 0.6863, + "step": 7980 + }, + { + "epoch": 1.03, + "learning_rate": 0.00014703756292758484, + "loss": 0.6898, + "step": 8000 + }, + { + "epoch": 1.03, + "eval_loss": 0.699661135673523, + "eval_runtime": 177.9496, + "eval_samples_per_second": 11.239, + "eval_steps_per_second": 1.405, + "step": 8000 + }, + { + "epoch": 1.03, + "learning_rate": 0.00014665031625145218, + "loss": 0.6881, + "step": 8020 + }, + { + "epoch": 1.03, + "learning_rate": 0.00014626306957531946, + "loss": 0.6894, + "step": 8040 + }, + { + "epoch": 1.03, + "learning_rate": 0.00014587582289918676, + "loss": 0.685, + "step": 8060 + }, + { + "epoch": 1.04, + "learning_rate": 0.00014548857622305407, + "loss": 0.6837, + "step": 8080 + }, + { + "epoch": 1.04, + "learning_rate": 0.00014510132954692137, + "loss": 0.6944, + "step": 8100 + }, + { + "epoch": 1.04, + "learning_rate": 0.00014471408287078868, + "loss": 0.6883, + "step": 8120 + }, + { + "epoch": 1.04, + "learning_rate": 0.00014432683619465598, + "loss": 0.6874, + "step": 8140 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001439395895185233, + "loss": 0.6867, + "step": 8160 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001435523428423906, + "loss": 0.6875, + "step": 8180 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001431650961662579, + "loss": 0.7005, + "step": 8200 + }, + { + "epoch": 1.05, + "eval_loss": 0.699004590511322, + "eval_runtime": 177.1183, + "eval_samples_per_second": 11.292, + "eval_steps_per_second": 1.411, + "step": 8200 + }, + { + "epoch": 1.05, + "learning_rate": 0.00014277784949012518, + "loss": 0.6968, + "step": 8220 + }, + { + "epoch": 1.06, + "learning_rate": 0.00014239060281399249, + "loss": 0.6884, + "step": 8240 + }, + { + "epoch": 1.06, + "learning_rate": 0.00014200335613785982, + "loss": 0.6808, + "step": 8260 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001416161094617271, + "loss": 0.6851, + "step": 8280 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001412288627855944, + "loss": 0.6917, + "step": 8300 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001408416161094617, + "loss": 0.6944, + "step": 8320 + }, + { + "epoch": 1.07, + "learning_rate": 0.00014045436943332901, + "loss": 0.6851, + "step": 8340 + }, + { + "epoch": 1.07, + "learning_rate": 0.00014006712275719632, + "loss": 0.6829, + "step": 8360 + }, + { + "epoch": 1.07, + "learning_rate": 0.00013967987608106363, + "loss": 0.6872, + "step": 8380 + }, + { + "epoch": 1.08, + "learning_rate": 0.00013929262940493093, + "loss": 0.6909, + "step": 8400 + }, + { + "epoch": 1.08, + "eval_loss": 0.6990391612052917, + "eval_runtime": 177.2945, + "eval_samples_per_second": 11.281, + "eval_steps_per_second": 1.41, + "step": 8400 + }, + { + "epoch": 1.08, + "learning_rate": 0.00013890538272879824, + "loss": 0.6924, + "step": 8420 + }, + { + "epoch": 1.08, + "learning_rate": 0.00013851813605266554, + "loss": 0.6747, + "step": 8440 + }, + { + "epoch": 1.09, + "learning_rate": 0.00013813088937653282, + "loss": 0.6932, + "step": 8460 + }, + { + "epoch": 1.09, + "learning_rate": 0.00013774364270040016, + "loss": 0.6892, + "step": 8480 + }, + { + "epoch": 1.09, + "learning_rate": 0.00013735639602426746, + "loss": 0.6868, + "step": 8500 + }, + { + "epoch": 1.09, + "learning_rate": 0.00013696914934813474, + "loss": 0.6898, + "step": 8520 + }, + { + "epoch": 1.1, + "learning_rate": 0.00013658190267200205, + "loss": 0.6896, + "step": 8540 + }, + { + "epoch": 1.1, + "learning_rate": 0.00013619465599586935, + "loss": 0.6848, + "step": 8560 + }, + { + "epoch": 1.1, + "learning_rate": 0.00013580740931973666, + "loss": 0.6801, + "step": 8580 + }, + { + "epoch": 1.1, + "learning_rate": 0.00013542016264360396, + "loss": 0.6927, + "step": 8600 + }, + { + "epoch": 1.1, + "eval_loss": 0.6984953880310059, + "eval_runtime": 177.3114, + "eval_samples_per_second": 11.28, + "eval_steps_per_second": 1.41, + "step": 8600 + }, + { + "epoch": 1.11, + "learning_rate": 0.00013503291596747127, + "loss": 0.688, + "step": 8620 + }, + { + "epoch": 1.11, + "learning_rate": 0.00013464566929133857, + "loss": 0.693, + "step": 8640 + }, + { + "epoch": 1.11, + "learning_rate": 0.00013425842261520588, + "loss": 0.6815, + "step": 8660 + }, + { + "epoch": 1.11, + "learning_rate": 0.00013387117593907319, + "loss": 0.6828, + "step": 8680 + }, + { + "epoch": 1.12, + "learning_rate": 0.00013348392926294046, + "loss": 0.6834, + "step": 8700 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001330966825868078, + "loss": 0.6839, + "step": 8720 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001327094359106751, + "loss": 0.6838, + "step": 8740 + }, + { + "epoch": 1.12, + "learning_rate": 0.00013232218923454238, + "loss": 0.6847, + "step": 8760 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001319349425584097, + "loss": 0.6894, + "step": 8780 + }, + { + "epoch": 1.13, + "learning_rate": 0.000131547695882277, + "loss": 0.6922, + "step": 8800 + }, + { + "epoch": 1.13, + "eval_loss": 0.698199450969696, + "eval_runtime": 177.0428, + "eval_samples_per_second": 11.297, + "eval_steps_per_second": 1.412, + "step": 8800 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001311604492061443, + "loss": 0.6904, + "step": 8820 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001307732025300116, + "loss": 0.6854, + "step": 8840 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001303859558538789, + "loss": 0.6877, + "step": 8860 + }, + { + "epoch": 1.14, + "learning_rate": 0.00012999870917774622, + "loss": 0.6857, + "step": 8880 + }, + { + "epoch": 1.14, + "learning_rate": 0.00012961146250161352, + "loss": 0.6856, + "step": 8900 + }, + { + "epoch": 1.14, + "learning_rate": 0.00012922421582548083, + "loss": 0.687, + "step": 8920 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001288369691493481, + "loss": 0.6905, + "step": 8940 + }, + { + "epoch": 1.15, + "learning_rate": 0.00012844972247321544, + "loss": 0.6865, + "step": 8960 + }, + { + "epoch": 1.15, + "learning_rate": 0.00012806247579708274, + "loss": 0.6829, + "step": 8980 + }, + { + "epoch": 1.15, + "learning_rate": 0.00012767522912095002, + "loss": 0.696, + "step": 9000 + }, + { + "epoch": 1.15, + "eval_loss": 0.6971157789230347, + "eval_runtime": 177.1296, + "eval_samples_per_second": 11.291, + "eval_steps_per_second": 1.411, + "step": 9000 + }, + { + "epoch": 1.16, + "learning_rate": 0.00012728798244481733, + "loss": 0.6825, + "step": 9020 + }, + { + "epoch": 1.16, + "learning_rate": 0.00012690073576868463, + "loss": 0.6844, + "step": 9040 + }, + { + "epoch": 1.16, + "learning_rate": 0.00012651348909255194, + "loss": 0.6853, + "step": 9060 + }, + { + "epoch": 1.16, + "learning_rate": 0.00012612624241641925, + "loss": 0.6889, + "step": 9080 + }, + { + "epoch": 1.17, + "learning_rate": 0.00012573899574028655, + "loss": 0.6848, + "step": 9100 + }, + { + "epoch": 1.17, + "learning_rate": 0.00012535174906415386, + "loss": 0.6953, + "step": 9120 + }, + { + "epoch": 1.17, + "learning_rate": 0.00012496450238802116, + "loss": 0.6944, + "step": 9140 + }, + { + "epoch": 1.17, + "learning_rate": 0.00012457725571188847, + "loss": 0.6893, + "step": 9160 + }, + { + "epoch": 1.18, + "learning_rate": 0.00012419000903575575, + "loss": 0.6831, + "step": 9180 + }, + { + "epoch": 1.18, + "learning_rate": 0.00012380276235962308, + "loss": 0.683, + "step": 9200 + }, + { + "epoch": 1.18, + "eval_loss": 0.6971254944801331, + "eval_runtime": 177.2118, + "eval_samples_per_second": 11.286, + "eval_steps_per_second": 1.411, + "step": 9200 + }, + { + "epoch": 1.18, + "learning_rate": 0.00012341551568349039, + "loss": 0.6782, + "step": 9220 + }, + { + "epoch": 1.19, + "learning_rate": 0.00012302826900735766, + "loss": 0.6962, + "step": 9240 + }, + { + "epoch": 1.19, + "learning_rate": 0.00012264102233122497, + "loss": 0.6808, + "step": 9260 + }, + { + "epoch": 1.19, + "learning_rate": 0.0001222537756550923, + "loss": 0.6931, + "step": 9280 + }, + { + "epoch": 1.19, + "learning_rate": 0.00012186652897895958, + "loss": 0.6878, + "step": 9300 + }, + { + "epoch": 1.2, + "learning_rate": 0.00012147928230282689, + "loss": 0.6855, + "step": 9320 + }, + { + "epoch": 1.2, + "learning_rate": 0.00012109203562669421, + "loss": 0.6865, + "step": 9340 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001207047889505615, + "loss": 0.6945, + "step": 9360 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001203175422744288, + "loss": 0.6862, + "step": 9380 + }, + { + "epoch": 1.21, + "learning_rate": 0.00011993029559829611, + "loss": 0.6932, + "step": 9400 + }, + { + "epoch": 1.21, + "eval_loss": 0.6963634490966797, + "eval_runtime": 177.2999, + "eval_samples_per_second": 11.28, + "eval_steps_per_second": 1.41, + "step": 9400 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001195430489221634, + "loss": 0.6793, + "step": 9420 + }, + { + "epoch": 1.21, + "learning_rate": 0.00011915580224603071, + "loss": 0.6827, + "step": 9440 + }, + { + "epoch": 1.21, + "learning_rate": 0.00011876855556989803, + "loss": 0.6836, + "step": 9460 + }, + { + "epoch": 1.22, + "learning_rate": 0.00011838130889376532, + "loss": 0.6876, + "step": 9480 + }, + { + "epoch": 1.22, + "learning_rate": 0.00011799406221763263, + "loss": 0.6949, + "step": 9500 + }, + { + "epoch": 1.22, + "learning_rate": 0.00011760681554149993, + "loss": 0.6809, + "step": 9520 + }, + { + "epoch": 1.22, + "learning_rate": 0.00011721956886536722, + "loss": 0.6923, + "step": 9540 + }, + { + "epoch": 1.23, + "learning_rate": 0.00011683232218923453, + "loss": 0.6975, + "step": 9560 + }, + { + "epoch": 1.23, + "learning_rate": 0.00011644507551310185, + "loss": 0.6981, + "step": 9580 + }, + { + "epoch": 1.23, + "learning_rate": 0.00011605782883696914, + "loss": 0.6843, + "step": 9600 + }, + { + "epoch": 1.23, + "eval_loss": 0.6958213448524475, + "eval_runtime": 177.1541, + "eval_samples_per_second": 11.29, + "eval_steps_per_second": 1.411, + "step": 9600 + }, + { + "epoch": 1.23, + "learning_rate": 0.00011567058216083645, + "loss": 0.6904, + "step": 9620 + }, + { + "epoch": 1.24, + "learning_rate": 0.00011528333548470375, + "loss": 0.6761, + "step": 9640 + }, + { + "epoch": 1.24, + "learning_rate": 0.00011489608880857104, + "loss": 0.6933, + "step": 9660 + }, + { + "epoch": 1.24, + "learning_rate": 0.00011450884213243835, + "loss": 0.6913, + "step": 9680 + }, + { + "epoch": 1.24, + "learning_rate": 0.00011412159545630567, + "loss": 0.6958, + "step": 9700 + }, + { + "epoch": 1.25, + "learning_rate": 0.00011373434878017296, + "loss": 0.6902, + "step": 9720 + }, + { + "epoch": 1.25, + "learning_rate": 0.00011334710210404027, + "loss": 0.6796, + "step": 9740 + }, + { + "epoch": 1.25, + "learning_rate": 0.00011295985542790757, + "loss": 0.6906, + "step": 9760 + }, + { + "epoch": 1.25, + "learning_rate": 0.00011257260875177487, + "loss": 0.6882, + "step": 9780 + }, + { + "epoch": 1.26, + "learning_rate": 0.00011218536207564217, + "loss": 0.6856, + "step": 9800 + }, + { + "epoch": 1.26, + "eval_loss": 0.6958709359169006, + "eval_runtime": 177.1197, + "eval_samples_per_second": 11.292, + "eval_steps_per_second": 1.411, + "step": 9800 + }, + { + "epoch": 1.26, + "learning_rate": 0.00011179811539950949, + "loss": 0.681, + "step": 9820 + }, + { + "epoch": 1.26, + "learning_rate": 0.00011141086872337678, + "loss": 0.6824, + "step": 9840 + }, + { + "epoch": 1.26, + "learning_rate": 0.00011102362204724409, + "loss": 0.6921, + "step": 9860 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001106363753711114, + "loss": 0.6862, + "step": 9880 + }, + { + "epoch": 1.27, + "learning_rate": 0.00011024912869497869, + "loss": 0.6869, + "step": 9900 + }, + { + "epoch": 1.27, + "learning_rate": 0.00010986188201884599, + "loss": 0.6867, + "step": 9920 + }, + { + "epoch": 1.27, + "learning_rate": 0.00010947463534271328, + "loss": 0.6885, + "step": 9940 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001090873886665806, + "loss": 0.6801, + "step": 9960 + }, + { + "epoch": 1.28, + "learning_rate": 0.00010870014199044791, + "loss": 0.684, + "step": 9980 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001083128953143152, + "loss": 0.6885, + "step": 10000 + }, + { + "epoch": 1.28, + "eval_loss": 0.6948391795158386, + "eval_runtime": 184.1668, + "eval_samples_per_second": 10.86, + "eval_steps_per_second": 1.357, + "step": 10000 + }, + { + "epoch": 1.29, + "learning_rate": 0.00010792564863818251, + "loss": 0.685, + "step": 10020 + }, + { + "epoch": 1.29, + "learning_rate": 0.00010753840196204981, + "loss": 0.6849, + "step": 10040 + }, + { + "epoch": 1.29, + "learning_rate": 0.00010715115528591712, + "loss": 0.6901, + "step": 10060 + }, + { + "epoch": 1.29, + "learning_rate": 0.00010678327094359106, + "loss": 0.6926, + "step": 10080 + }, + { + "epoch": 1.3, + "learning_rate": 0.00010639602426745837, + "loss": 0.6863, + "step": 10100 + }, + { + "epoch": 1.3, + "learning_rate": 0.00010600877759132567, + "loss": 0.6846, + "step": 10120 + }, + { + "epoch": 1.3, + "learning_rate": 0.00010562153091519296, + "loss": 0.6859, + "step": 10140 + }, + { + "epoch": 1.3, + "learning_rate": 0.00010523428423906027, + "loss": 0.6887, + "step": 10160 + }, + { + "epoch": 1.31, + "learning_rate": 0.00010484703756292759, + "loss": 0.689, + "step": 10180 + }, + { + "epoch": 1.31, + "learning_rate": 0.00010445979088679488, + "loss": 0.6795, + "step": 10200 + }, + { + "epoch": 1.31, + "eval_loss": 0.6949850916862488, + "eval_runtime": 177.1947, + "eval_samples_per_second": 11.287, + "eval_steps_per_second": 1.411, + "step": 10200 + }, + { + "epoch": 1.31, + "learning_rate": 0.00010407254421066219, + "loss": 0.6874, + "step": 10220 + }, + { + "epoch": 1.31, + "learning_rate": 0.00010368529753452949, + "loss": 0.6875, + "step": 10240 + }, + { + "epoch": 1.32, + "learning_rate": 0.00010329805085839678, + "loss": 0.6842, + "step": 10260 + }, + { + "epoch": 1.32, + "learning_rate": 0.00010291080418226409, + "loss": 0.6842, + "step": 10280 + }, + { + "epoch": 1.32, + "learning_rate": 0.00010252355750613141, + "loss": 0.6834, + "step": 10300 + }, + { + "epoch": 1.32, + "learning_rate": 0.0001021363108299987, + "loss": 0.6894, + "step": 10320 + }, + { + "epoch": 1.33, + "learning_rate": 0.00010174906415386601, + "loss": 0.6814, + "step": 10340 + }, + { + "epoch": 1.33, + "learning_rate": 0.00010136181747773331, + "loss": 0.6731, + "step": 10360 + }, + { + "epoch": 1.33, + "learning_rate": 0.0001009745708016006, + "loss": 0.6955, + "step": 10380 + }, + { + "epoch": 1.33, + "learning_rate": 0.00010058732412546791, + "loss": 0.6884, + "step": 10400 + }, + { + "epoch": 1.33, + "eval_loss": 0.6947998404502869, + "eval_runtime": 183.6797, + "eval_samples_per_second": 10.889, + "eval_steps_per_second": 1.361, + "step": 10400 + }, + { + "epoch": 1.34, + "learning_rate": 0.00010020007744933523, + "loss": 0.6844, + "step": 10420 + }, + { + "epoch": 1.34, + "learning_rate": 9.981283077320252e-05, + "loss": 0.6795, + "step": 10440 + }, + { + "epoch": 1.34, + "learning_rate": 9.942558409706983e-05, + "loss": 0.6783, + "step": 10460 + }, + { + "epoch": 1.34, + "learning_rate": 9.903833742093713e-05, + "loss": 0.6951, + "step": 10480 + }, + { + "epoch": 1.35, + "learning_rate": 9.865109074480443e-05, + "loss": 0.6891, + "step": 10500 + }, + { + "epoch": 1.35, + "learning_rate": 9.826384406867173e-05, + "loss": 0.6899, + "step": 10520 + }, + { + "epoch": 1.35, + "learning_rate": 9.787659739253905e-05, + "loss": 0.6905, + "step": 10540 + }, + { + "epoch": 1.35, + "learning_rate": 9.748935071640634e-05, + "loss": 0.6786, + "step": 10560 + }, + { + "epoch": 1.36, + "learning_rate": 9.710210404027365e-05, + "loss": 0.6884, + "step": 10580 + }, + { + "epoch": 1.36, + "learning_rate": 9.671485736414096e-05, + "loss": 0.6757, + "step": 10600 + }, + { + "epoch": 1.36, + "eval_loss": 0.6941512227058411, + "eval_runtime": 179.4312, + "eval_samples_per_second": 11.146, + "eval_steps_per_second": 1.393, + "step": 10600 + }, + { + "epoch": 1.36, + "learning_rate": 9.632761068800825e-05, + "loss": 0.6844, + "step": 10620 + }, + { + "epoch": 1.36, + "learning_rate": 9.594036401187555e-05, + "loss": 0.6829, + "step": 10640 + }, + { + "epoch": 1.37, + "learning_rate": 9.555311733574287e-05, + "loss": 0.7007, + "step": 10660 + }, + { + "epoch": 1.37, + "learning_rate": 9.516587065961016e-05, + "loss": 0.6989, + "step": 10680 + }, + { + "epoch": 1.37, + "learning_rate": 9.477862398347747e-05, + "loss": 0.6783, + "step": 10700 + }, + { + "epoch": 1.37, + "learning_rate": 9.439137730734476e-05, + "loss": 0.6783, + "step": 10720 + }, + { + "epoch": 1.38, + "learning_rate": 9.400413063121207e-05, + "loss": 0.6879, + "step": 10740 + }, + { + "epoch": 1.38, + "learning_rate": 9.361688395507937e-05, + "loss": 0.6801, + "step": 10760 + }, + { + "epoch": 1.38, + "learning_rate": 9.322963727894667e-05, + "loss": 0.6786, + "step": 10780 + }, + { + "epoch": 1.39, + "learning_rate": 9.284239060281399e-05, + "loss": 0.6882, + "step": 10800 + }, + { + "epoch": 1.39, + "eval_loss": 0.6937060356140137, + "eval_runtime": 178.6714, + "eval_samples_per_second": 11.194, + "eval_steps_per_second": 1.399, + "step": 10800 + }, + { + "epoch": 1.39, + "learning_rate": 9.245514392668129e-05, + "loss": 0.6796, + "step": 10820 + }, + { + "epoch": 1.39, + "learning_rate": 9.206789725054858e-05, + "loss": 0.6886, + "step": 10840 + }, + { + "epoch": 1.39, + "learning_rate": 9.168065057441589e-05, + "loss": 0.683, + "step": 10860 + }, + { + "epoch": 1.4, + "learning_rate": 9.129340389828321e-05, + "loss": 0.6786, + "step": 10880 + }, + { + "epoch": 1.4, + "learning_rate": 9.09061572221505e-05, + "loss": 0.6843, + "step": 10900 + }, + { + "epoch": 1.4, + "learning_rate": 9.05189105460178e-05, + "loss": 0.6795, + "step": 10920 + }, + { + "epoch": 1.4, + "learning_rate": 9.013166386988511e-05, + "loss": 0.6813, + "step": 10940 + }, + { + "epoch": 1.41, + "learning_rate": 8.97444171937524e-05, + "loss": 0.687, + "step": 10960 + }, + { + "epoch": 1.41, + "learning_rate": 8.935717051761971e-05, + "loss": 0.6872, + "step": 10980 + }, + { + "epoch": 1.41, + "learning_rate": 8.896992384148703e-05, + "loss": 0.6746, + "step": 11000 + }, + { + "epoch": 1.41, + "eval_loss": 0.6936533451080322, + "eval_runtime": 178.4177, + "eval_samples_per_second": 11.21, + "eval_steps_per_second": 1.401, + "step": 11000 + }, + { + "epoch": 1.41, + "learning_rate": 8.858267716535432e-05, + "loss": 0.6747, + "step": 11020 + }, + { + "epoch": 1.42, + "learning_rate": 8.819543048922163e-05, + "loss": 0.6975, + "step": 11040 + }, + { + "epoch": 1.42, + "learning_rate": 8.780818381308893e-05, + "loss": 0.6806, + "step": 11060 + }, + { + "epoch": 1.42, + "learning_rate": 8.742093713695623e-05, + "loss": 0.6789, + "step": 11080 + }, + { + "epoch": 1.42, + "learning_rate": 8.703369046082353e-05, + "loss": 0.6851, + "step": 11100 + }, + { + "epoch": 1.43, + "learning_rate": 8.664644378469085e-05, + "loss": 0.6836, + "step": 11120 + }, + { + "epoch": 1.43, + "learning_rate": 8.625919710855814e-05, + "loss": 0.6891, + "step": 11140 + }, + { + "epoch": 1.43, + "learning_rate": 8.587195043242545e-05, + "loss": 0.6898, + "step": 11160 + }, + { + "epoch": 1.43, + "learning_rate": 8.548470375629275e-05, + "loss": 0.6798, + "step": 11180 + }, + { + "epoch": 1.44, + "learning_rate": 8.509745708016005e-05, + "loss": 0.6822, + "step": 11200 + }, + { + "epoch": 1.44, + "eval_loss": 0.6932746767997742, + "eval_runtime": 178.4279, + "eval_samples_per_second": 11.209, + "eval_steps_per_second": 1.401, + "step": 11200 + }, + { + "epoch": 1.44, + "learning_rate": 8.471021040402735e-05, + "loss": 0.6776, + "step": 11220 + }, + { + "epoch": 1.44, + "learning_rate": 8.432296372789467e-05, + "loss": 0.6835, + "step": 11240 + }, + { + "epoch": 1.44, + "learning_rate": 8.393571705176196e-05, + "loss": 0.6852, + "step": 11260 + }, + { + "epoch": 1.45, + "learning_rate": 8.354847037562927e-05, + "loss": 0.686, + "step": 11280 + }, + { + "epoch": 1.45, + "learning_rate": 8.316122369949657e-05, + "loss": 0.6884, + "step": 11300 + }, + { + "epoch": 1.45, + "learning_rate": 8.277397702336387e-05, + "loss": 0.6811, + "step": 11320 + }, + { + "epoch": 1.45, + "learning_rate": 8.238673034723117e-05, + "loss": 0.6751, + "step": 11340 + }, + { + "epoch": 1.46, + "learning_rate": 8.199948367109849e-05, + "loss": 0.6837, + "step": 11360 + }, + { + "epoch": 1.46, + "learning_rate": 8.161223699496578e-05, + "loss": 0.6839, + "step": 11380 + }, + { + "epoch": 1.46, + "learning_rate": 8.122499031883309e-05, + "loss": 0.6804, + "step": 11400 + }, + { + "epoch": 1.46, + "eval_loss": 0.6925450563430786, + "eval_runtime": 177.1737, + "eval_samples_per_second": 11.288, + "eval_steps_per_second": 1.411, + "step": 11400 + }, + { + "epoch": 1.46, + "learning_rate": 8.08377436427004e-05, + "loss": 0.6885, + "step": 11420 + }, + { + "epoch": 1.47, + "learning_rate": 8.045049696656769e-05, + "loss": 0.6907, + "step": 11440 + }, + { + "epoch": 1.47, + "learning_rate": 8.0063250290435e-05, + "loss": 0.6868, + "step": 11460 + }, + { + "epoch": 1.47, + "learning_rate": 7.967600361430231e-05, + "loss": 0.6945, + "step": 11480 + }, + { + "epoch": 1.47, + "learning_rate": 7.92887569381696e-05, + "loss": 0.6851, + "step": 11500 + }, + { + "epoch": 1.48, + "learning_rate": 7.890151026203691e-05, + "loss": 0.6878, + "step": 11520 + }, + { + "epoch": 1.48, + "learning_rate": 7.851426358590422e-05, + "loss": 0.6955, + "step": 11540 + }, + { + "epoch": 1.48, + "learning_rate": 7.812701690977151e-05, + "loss": 0.6783, + "step": 11560 + }, + { + "epoch": 1.49, + "learning_rate": 7.773977023363881e-05, + "loss": 0.6857, + "step": 11580 + }, + { + "epoch": 1.49, + "learning_rate": 7.735252355750613e-05, + "loss": 0.6828, + "step": 11600 + }, + { + "epoch": 1.49, + "eval_loss": 0.6924574971199036, + "eval_runtime": 177.3841, + "eval_samples_per_second": 11.275, + "eval_steps_per_second": 1.409, + "step": 11600 + }, + { + "epoch": 1.49, + "learning_rate": 7.696527688137343e-05, + "loss": 0.6796, + "step": 11620 + }, + { + "epoch": 1.49, + "learning_rate": 7.657803020524073e-05, + "loss": 0.6823, + "step": 11640 + }, + { + "epoch": 1.5, + "learning_rate": 7.619078352910804e-05, + "loss": 0.6854, + "step": 11660 + }, + { + "epoch": 1.5, + "learning_rate": 7.580353685297533e-05, + "loss": 0.6797, + "step": 11680 + }, + { + "epoch": 1.5, + "learning_rate": 7.541629017684265e-05, + "loss": 0.6811, + "step": 11700 + }, + { + "epoch": 1.5, + "learning_rate": 7.502904350070995e-05, + "loss": 0.6775, + "step": 11720 + }, + { + "epoch": 1.51, + "learning_rate": 7.464179682457725e-05, + "loss": 0.687, + "step": 11740 + }, + { + "epoch": 1.51, + "learning_rate": 7.425455014844455e-05, + "loss": 0.6859, + "step": 11760 + }, + { + "epoch": 1.51, + "learning_rate": 7.386730347231186e-05, + "loss": 0.683, + "step": 11780 + }, + { + "epoch": 1.51, + "learning_rate": 7.348005679617916e-05, + "loss": 0.6812, + "step": 11800 + }, + { + "epoch": 1.51, + "eval_loss": 0.692126452922821, + "eval_runtime": 182.938, + "eval_samples_per_second": 10.933, + "eval_steps_per_second": 1.367, + "step": 11800 + }, + { + "epoch": 1.52, + "learning_rate": 7.309281012004647e-05, + "loss": 0.688, + "step": 11820 + }, + { + "epoch": 1.52, + "learning_rate": 7.270556344391376e-05, + "loss": 0.6774, + "step": 11840 + }, + { + "epoch": 1.52, + "learning_rate": 7.231831676778107e-05, + "loss": 0.6806, + "step": 11860 + }, + { + "epoch": 1.52, + "learning_rate": 7.193107009164837e-05, + "loss": 0.6756, + "step": 11880 + }, + { + "epoch": 1.53, + "learning_rate": 7.154382341551568e-05, + "loss": 0.6856, + "step": 11900 + }, + { + "epoch": 1.53, + "learning_rate": 7.115657673938298e-05, + "loss": 0.6822, + "step": 11920 + }, + { + "epoch": 1.53, + "learning_rate": 7.076933006325029e-05, + "loss": 0.6769, + "step": 11940 + }, + { + "epoch": 1.53, + "learning_rate": 7.038208338711758e-05, + "loss": 0.6759, + "step": 11960 + }, + { + "epoch": 1.54, + "learning_rate": 6.999483671098489e-05, + "loss": 0.6854, + "step": 11980 + }, + { + "epoch": 1.54, + "learning_rate": 6.96075900348522e-05, + "loss": 0.6855, + "step": 12000 + }, + { + "epoch": 1.54, + "eval_loss": 0.6914573907852173, + "eval_runtime": 177.6919, + "eval_samples_per_second": 11.255, + "eval_steps_per_second": 1.407, + "step": 12000 + }, + { + "epoch": 1.54, + "learning_rate": 6.92203433587195e-05, + "loss": 0.6816, + "step": 12020 + }, + { + "epoch": 1.54, + "learning_rate": 6.88330966825868e-05, + "loss": 0.6801, + "step": 12040 + }, + { + "epoch": 1.55, + "learning_rate": 6.844585000645411e-05, + "loss": 0.6818, + "step": 12060 + }, + { + "epoch": 1.55, + "learning_rate": 6.80586033303214e-05, + "loss": 0.6808, + "step": 12080 + }, + { + "epoch": 1.55, + "learning_rate": 6.767135665418872e-05, + "loss": 0.6849, + "step": 12100 + }, + { + "epoch": 1.55, + "learning_rate": 6.728410997805601e-05, + "loss": 0.6902, + "step": 12120 + }, + { + "epoch": 1.56, + "learning_rate": 6.689686330192332e-05, + "loss": 0.6795, + "step": 12140 + }, + { + "epoch": 1.56, + "learning_rate": 6.652897895959726e-05, + "loss": 0.6783, + "step": 12160 + }, + { + "epoch": 1.56, + "learning_rate": 6.614173228346455e-05, + "loss": 0.6839, + "step": 12180 + }, + { + "epoch": 1.56, + "learning_rate": 6.575448560733186e-05, + "loss": 0.6875, + "step": 12200 + }, + { + "epoch": 1.56, + "eval_loss": 0.6915743947029114, + "eval_runtime": 177.7146, + "eval_samples_per_second": 11.254, + "eval_steps_per_second": 1.407, + "step": 12200 + }, + { + "epoch": 1.57, + "learning_rate": 6.536723893119917e-05, + "loss": 0.687, + "step": 12220 + }, + { + "epoch": 1.57, + "learning_rate": 6.497999225506647e-05, + "loss": 0.683, + "step": 12240 + }, + { + "epoch": 1.57, + "learning_rate": 6.459274557893378e-05, + "loss": 0.6796, + "step": 12260 + }, + { + "epoch": 1.57, + "learning_rate": 6.420549890280108e-05, + "loss": 0.6757, + "step": 12280 + }, + { + "epoch": 1.58, + "learning_rate": 6.381825222666838e-05, + "loss": 0.6901, + "step": 12300 + }, + { + "epoch": 1.58, + "learning_rate": 6.343100555053568e-05, + "loss": 0.6734, + "step": 12320 + }, + { + "epoch": 1.58, + "learning_rate": 6.304375887440299e-05, + "loss": 0.6863, + "step": 12340 + }, + { + "epoch": 1.59, + "learning_rate": 6.265651219827029e-05, + "loss": 0.6769, + "step": 12360 + }, + { + "epoch": 1.59, + "learning_rate": 6.22692655221376e-05, + "loss": 0.678, + "step": 12380 + }, + { + "epoch": 1.59, + "learning_rate": 6.18820188460049e-05, + "loss": 0.6934, + "step": 12400 + }, + { + "epoch": 1.59, + "eval_loss": 0.6910441517829895, + "eval_runtime": 177.091, + "eval_samples_per_second": 11.294, + "eval_steps_per_second": 1.412, + "step": 12400 + }, + { + "epoch": 1.59, + "learning_rate": 6.14947721698722e-05, + "loss": 0.6925, + "step": 12420 + }, + { + "epoch": 1.6, + "learning_rate": 6.11075254937395e-05, + "loss": 0.6797, + "step": 12440 + }, + { + "epoch": 1.6, + "learning_rate": 6.0720278817606814e-05, + "loss": 0.6839, + "step": 12460 + }, + { + "epoch": 1.6, + "learning_rate": 6.0333032141474113e-05, + "loss": 0.6825, + "step": 12480 + }, + { + "epoch": 1.6, + "learning_rate": 5.994578546534141e-05, + "loss": 0.6816, + "step": 12500 + }, + { + "epoch": 1.61, + "learning_rate": 5.9558538789208725e-05, + "loss": 0.6844, + "step": 12520 + }, + { + "epoch": 1.61, + "learning_rate": 5.9171292113076024e-05, + "loss": 0.6833, + "step": 12540 + }, + { + "epoch": 1.61, + "learning_rate": 5.878404543694332e-05, + "loss": 0.6827, + "step": 12560 + }, + { + "epoch": 1.61, + "learning_rate": 5.8396798760810635e-05, + "loss": 0.6802, + "step": 12580 + }, + { + "epoch": 1.62, + "learning_rate": 5.8009552084677934e-05, + "loss": 0.6837, + "step": 12600 + }, + { + "epoch": 1.62, + "eval_loss": 0.690994381904602, + "eval_runtime": 177.3691, + "eval_samples_per_second": 11.276, + "eval_steps_per_second": 1.409, + "step": 12600 + }, + { + "epoch": 1.62, + "learning_rate": 5.762230540854523e-05, + "loss": 0.6781, + "step": 12620 + }, + { + "epoch": 1.62, + "learning_rate": 5.7235058732412546e-05, + "loss": 0.6816, + "step": 12640 + }, + { + "epoch": 1.62, + "learning_rate": 5.6847812056279845e-05, + "loss": 0.6823, + "step": 12660 + }, + { + "epoch": 1.63, + "learning_rate": 5.6460565380147144e-05, + "loss": 0.6824, + "step": 12680 + }, + { + "epoch": 1.63, + "learning_rate": 5.6073318704014456e-05, + "loss": 0.6786, + "step": 12700 + }, + { + "epoch": 1.63, + "learning_rate": 5.5686072027881755e-05, + "loss": 0.6822, + "step": 12720 + }, + { + "epoch": 1.63, + "learning_rate": 5.5298825351749054e-05, + "loss": 0.6792, + "step": 12740 + }, + { + "epoch": 1.64, + "learning_rate": 5.491157867561637e-05, + "loss": 0.6797, + "step": 12760 + }, + { + "epoch": 1.64, + "learning_rate": 5.4524331999483666e-05, + "loss": 0.6814, + "step": 12780 + }, + { + "epoch": 1.64, + "learning_rate": 5.4137085323350965e-05, + "loss": 0.6827, + "step": 12800 + }, + { + "epoch": 1.64, + "eval_loss": 0.6906899809837341, + "eval_runtime": 177.4169, + "eval_samples_per_second": 11.273, + "eval_steps_per_second": 1.409, + "step": 12800 + }, + { + "epoch": 1.64, + "learning_rate": 5.374983864721828e-05, + "loss": 0.6776, + "step": 12820 + }, + { + "epoch": 1.65, + "learning_rate": 5.3362591971085576e-05, + "loss": 0.6877, + "step": 12840 + }, + { + "epoch": 1.65, + "learning_rate": 5.297534529495288e-05, + "loss": 0.6786, + "step": 12860 + }, + { + "epoch": 1.65, + "learning_rate": 5.258809861882019e-05, + "loss": 0.6853, + "step": 12880 + }, + { + "epoch": 1.65, + "learning_rate": 5.2200851942687486e-05, + "loss": 0.6843, + "step": 12900 + }, + { + "epoch": 1.66, + "learning_rate": 5.181360526655479e-05, + "loss": 0.6872, + "step": 12920 + }, + { + "epoch": 1.66, + "learning_rate": 5.14263585904221e-05, + "loss": 0.6862, + "step": 12940 + }, + { + "epoch": 1.66, + "learning_rate": 5.10391119142894e-05, + "loss": 0.6804, + "step": 12960 + }, + { + "epoch": 1.66, + "learning_rate": 5.06518652381567e-05, + "loss": 0.6786, + "step": 12980 + }, + { + "epoch": 1.67, + "learning_rate": 5.026461856202401e-05, + "loss": 0.6839, + "step": 13000 + }, + { + "epoch": 1.67, + "eval_loss": 0.6902768015861511, + "eval_runtime": 177.1355, + "eval_samples_per_second": 11.291, + "eval_steps_per_second": 1.411, + "step": 13000 + }, + { + "epoch": 1.67, + "learning_rate": 4.987737188589131e-05, + "loss": 0.682, + "step": 13020 + }, + { + "epoch": 1.67, + "learning_rate": 4.949012520975861e-05, + "loss": 0.6835, + "step": 13040 + }, + { + "epoch": 1.68, + "learning_rate": 4.910287853362592e-05, + "loss": 0.6822, + "step": 13060 + }, + { + "epoch": 1.68, + "learning_rate": 4.871563185749322e-05, + "loss": 0.6827, + "step": 13080 + }, + { + "epoch": 1.68, + "learning_rate": 4.8328385181360523e-05, + "loss": 0.6744, + "step": 13100 + }, + { + "epoch": 1.68, + "learning_rate": 4.794113850522782e-05, + "loss": 0.6817, + "step": 13120 + }, + { + "epoch": 1.69, + "learning_rate": 4.755389182909513e-05, + "loss": 0.6765, + "step": 13140 + }, + { + "epoch": 1.69, + "learning_rate": 4.7166645152962434e-05, + "loss": 0.682, + "step": 13160 + }, + { + "epoch": 1.69, + "learning_rate": 4.677939847682973e-05, + "loss": 0.6849, + "step": 13180 + }, + { + "epoch": 1.69, + "learning_rate": 4.639215180069704e-05, + "loss": 0.686, + "step": 13200 + }, + { + "epoch": 1.69, + "eval_loss": 0.6899891495704651, + "eval_runtime": 183.0495, + "eval_samples_per_second": 10.926, + "eval_steps_per_second": 1.366, + "step": 13200 + }, + { + "epoch": 1.7, + "learning_rate": 4.6004905124564344e-05, + "loss": 0.6787, + "step": 13220 + }, + { + "epoch": 1.7, + "learning_rate": 4.561765844843164e-05, + "loss": 0.6786, + "step": 13240 + }, + { + "epoch": 1.7, + "learning_rate": 4.523041177229895e-05, + "loss": 0.6913, + "step": 13260 + }, + { + "epoch": 1.7, + "learning_rate": 4.4843165096166255e-05, + "loss": 0.6721, + "step": 13280 + }, + { + "epoch": 1.71, + "learning_rate": 4.4455918420033554e-05, + "loss": 0.6783, + "step": 13300 + }, + { + "epoch": 1.71, + "learning_rate": 4.4068671743900866e-05, + "loss": 0.687, + "step": 13320 + }, + { + "epoch": 1.71, + "learning_rate": 4.3681425067768165e-05, + "loss": 0.6815, + "step": 13340 + }, + { + "epoch": 1.71, + "learning_rate": 4.3294178391635464e-05, + "loss": 0.6786, + "step": 13360 + }, + { + "epoch": 1.72, + "learning_rate": 4.290693171550278e-05, + "loss": 0.6845, + "step": 13380 + }, + { + "epoch": 1.72, + "learning_rate": 4.2519685039370076e-05, + "loss": 0.6817, + "step": 13400 + }, + { + "epoch": 1.72, + "eval_loss": 0.6897545456886292, + "eval_runtime": 177.4673, + "eval_samples_per_second": 11.27, + "eval_steps_per_second": 1.409, + "step": 13400 + }, + { + "epoch": 1.72, + "learning_rate": 4.2132438363237375e-05, + "loss": 0.6783, + "step": 13420 + }, + { + "epoch": 1.72, + "learning_rate": 4.174519168710469e-05, + "loss": 0.6775, + "step": 13440 + }, + { + "epoch": 1.73, + "learning_rate": 4.1357945010971986e-05, + "loss": 0.6752, + "step": 13460 + }, + { + "epoch": 1.73, + "learning_rate": 4.0970698334839285e-05, + "loss": 0.6732, + "step": 13480 + }, + { + "epoch": 1.73, + "learning_rate": 4.05834516587066e-05, + "loss": 0.6854, + "step": 13500 + }, + { + "epoch": 1.73, + "learning_rate": 4.0196204982573896e-05, + "loss": 0.6814, + "step": 13520 + }, + { + "epoch": 1.74, + "learning_rate": 3.9808958306441195e-05, + "loss": 0.6832, + "step": 13540 + }, + { + "epoch": 1.74, + "learning_rate": 3.942171163030851e-05, + "loss": 0.6764, + "step": 13560 + }, + { + "epoch": 1.74, + "learning_rate": 3.903446495417581e-05, + "loss": 0.6797, + "step": 13580 + }, + { + "epoch": 1.74, + "learning_rate": 3.8647218278043106e-05, + "loss": 0.6786, + "step": 13600 + }, + { + "epoch": 1.74, + "eval_loss": 0.6894007325172424, + "eval_runtime": 177.6931, + "eval_samples_per_second": 11.255, + "eval_steps_per_second": 1.407, + "step": 13600 + }, + { + "epoch": 1.75, + "learning_rate": 3.825997160191042e-05, + "loss": 0.6878, + "step": 13620 + }, + { + "epoch": 1.75, + "learning_rate": 3.787272492577772e-05, + "loss": 0.6847, + "step": 13640 + }, + { + "epoch": 1.75, + "learning_rate": 3.748547824964502e-05, + "loss": 0.6773, + "step": 13660 + }, + { + "epoch": 1.75, + "learning_rate": 3.709823157351232e-05, + "loss": 0.6723, + "step": 13680 + }, + { + "epoch": 1.76, + "learning_rate": 3.671098489737963e-05, + "loss": 0.6849, + "step": 13700 + }, + { + "epoch": 1.76, + "learning_rate": 3.6323738221246933e-05, + "loss": 0.6788, + "step": 13720 + }, + { + "epoch": 1.76, + "learning_rate": 3.593649154511423e-05, + "loss": 0.6851, + "step": 13740 + }, + { + "epoch": 1.76, + "learning_rate": 3.554924486898154e-05, + "loss": 0.6842, + "step": 13760 + }, + { + "epoch": 1.77, + "learning_rate": 3.5161998192848844e-05, + "loss": 0.6763, + "step": 13780 + }, + { + "epoch": 1.77, + "learning_rate": 3.477475151671614e-05, + "loss": 0.6795, + "step": 13800 + }, + { + "epoch": 1.77, + "eval_loss": 0.6892591714859009, + "eval_runtime": 178.329, + "eval_samples_per_second": 11.215, + "eval_steps_per_second": 1.402, + "step": 13800 + }, + { + "epoch": 1.77, + "learning_rate": 3.438750484058345e-05, + "loss": 0.6804, + "step": 13820 + }, + { + "epoch": 1.78, + "learning_rate": 3.4000258164450754e-05, + "loss": 0.6873, + "step": 13840 + }, + { + "epoch": 1.78, + "learning_rate": 3.361301148831805e-05, + "loss": 0.6783, + "step": 13860 + }, + { + "epoch": 1.78, + "learning_rate": 3.322576481218536e-05, + "loss": 0.6843, + "step": 13880 + }, + { + "epoch": 1.78, + "learning_rate": 3.2838518136052665e-05, + "loss": 0.6755, + "step": 13900 + }, + { + "epoch": 1.79, + "learning_rate": 3.2451271459919964e-05, + "loss": 0.684, + "step": 13920 + }, + { + "epoch": 1.79, + "learning_rate": 3.206402478378727e-05, + "loss": 0.6828, + "step": 13940 + }, + { + "epoch": 1.79, + "learning_rate": 3.167677810765457e-05, + "loss": 0.6771, + "step": 13960 + }, + { + "epoch": 1.79, + "learning_rate": 3.1289531431521874e-05, + "loss": 0.6818, + "step": 13980 + }, + { + "epoch": 1.8, + "learning_rate": 3.090228475538918e-05, + "loss": 0.6751, + "step": 14000 + }, + { + "epoch": 1.8, + "eval_loss": 0.6889638304710388, + "eval_runtime": 178.8668, + "eval_samples_per_second": 11.182, + "eval_steps_per_second": 1.398, + "step": 14000 + }, + { + "epoch": 1.8, + "learning_rate": 3.0515038079256482e-05, + "loss": 0.6794, + "step": 14020 + }, + { + "epoch": 1.8, + "learning_rate": 3.0127791403123788e-05, + "loss": 0.6652, + "step": 14040 + }, + { + "epoch": 1.8, + "learning_rate": 2.974054472699109e-05, + "loss": 0.6746, + "step": 14060 + }, + { + "epoch": 1.81, + "learning_rate": 2.9353298050858393e-05, + "loss": 0.6847, + "step": 14080 + }, + { + "epoch": 1.81, + "learning_rate": 2.89660513747257e-05, + "loss": 0.6733, + "step": 14100 + }, + { + "epoch": 1.81, + "learning_rate": 2.8578804698593004e-05, + "loss": 0.6794, + "step": 14120 + }, + { + "epoch": 1.81, + "learning_rate": 2.8191558022460303e-05, + "loss": 0.6748, + "step": 14140 + }, + { + "epoch": 1.82, + "learning_rate": 2.780431134632761e-05, + "loss": 0.6668, + "step": 14160 + }, + { + "epoch": 1.82, + "learning_rate": 2.7417064670194915e-05, + "loss": 0.6845, + "step": 14180 + }, + { + "epoch": 1.82, + "learning_rate": 2.7029817994062214e-05, + "loss": 0.6819, + "step": 14200 + }, + { + "epoch": 1.82, + "eval_loss": 0.68879234790802, + "eval_runtime": 178.3854, + "eval_samples_per_second": 11.212, + "eval_steps_per_second": 1.401, + "step": 14200 + }, + { + "epoch": 1.82, + "learning_rate": 2.6661933651736152e-05, + "loss": 0.6857, + "step": 14220 + }, + { + "epoch": 1.83, + "learning_rate": 2.6274686975603458e-05, + "loss": 0.6837, + "step": 14240 + }, + { + "epoch": 1.83, + "learning_rate": 2.588744029947076e-05, + "loss": 0.679, + "step": 14260 + }, + { + "epoch": 1.83, + "learning_rate": 2.5500193623338062e-05, + "loss": 0.6809, + "step": 14280 + }, + { + "epoch": 1.83, + "learning_rate": 2.5112946947205368e-05, + "loss": 0.683, + "step": 14300 + }, + { + "epoch": 1.84, + "learning_rate": 2.4725700271072674e-05, + "loss": 0.6787, + "step": 14320 + }, + { + "epoch": 1.84, + "learning_rate": 2.4338453594939973e-05, + "loss": 0.6842, + "step": 14340 + }, + { + "epoch": 1.84, + "learning_rate": 2.395120691880728e-05, + "loss": 0.682, + "step": 14360 + }, + { + "epoch": 1.84, + "learning_rate": 2.3563960242674584e-05, + "loss": 0.6751, + "step": 14380 + }, + { + "epoch": 1.85, + "learning_rate": 2.3176713566541883e-05, + "loss": 0.682, + "step": 14400 + }, + { + "epoch": 1.85, + "eval_loss": 0.6884602308273315, + "eval_runtime": 180.582, + "eval_samples_per_second": 11.075, + "eval_steps_per_second": 1.384, + "step": 14400 + }, + { + "epoch": 1.85, + "learning_rate": 2.278946689040919e-05, + "loss": 0.6728, + "step": 14420 + }, + { + "epoch": 1.85, + "learning_rate": 2.2402220214276495e-05, + "loss": 0.6839, + "step": 14440 + }, + { + "epoch": 1.85, + "learning_rate": 2.2014973538143794e-05, + "loss": 0.6828, + "step": 14460 + }, + { + "epoch": 1.86, + "learning_rate": 2.16277268620111e-05, + "loss": 0.6752, + "step": 14480 + }, + { + "epoch": 1.86, + "learning_rate": 2.1240480185878405e-05, + "loss": 0.682, + "step": 14500 + }, + { + "epoch": 1.86, + "learning_rate": 2.0853233509745704e-05, + "loss": 0.6802, + "step": 14520 + }, + { + "epoch": 1.86, + "learning_rate": 2.046598683361301e-05, + "loss": 0.6809, + "step": 14540 + }, + { + "epoch": 1.87, + "learning_rate": 2.0078740157480316e-05, + "loss": 0.6802, + "step": 14560 + }, + { + "epoch": 1.87, + "learning_rate": 1.9691493481347615e-05, + "loss": 0.6763, + "step": 14580 + }, + { + "epoch": 1.87, + "learning_rate": 1.930424680521492e-05, + "loss": 0.6876, + "step": 14600 + }, + { + "epoch": 1.87, + "eval_loss": 0.6882807612419128, + "eval_runtime": 177.6338, + "eval_samples_per_second": 11.259, + "eval_steps_per_second": 1.407, + "step": 14600 + }, + { + "epoch": 1.88, + "learning_rate": 1.8917000129082226e-05, + "loss": 0.6789, + "step": 14620 + }, + { + "epoch": 1.88, + "learning_rate": 1.8529753452949525e-05, + "loss": 0.6772, + "step": 14640 + }, + { + "epoch": 1.88, + "learning_rate": 1.814250677681683e-05, + "loss": 0.6783, + "step": 14660 + }, + { + "epoch": 1.88, + "learning_rate": 1.7755260100684133e-05, + "loss": 0.6829, + "step": 14680 + }, + { + "epoch": 1.89, + "learning_rate": 1.736801342455144e-05, + "loss": 0.6809, + "step": 14700 + }, + { + "epoch": 1.89, + "learning_rate": 1.698076674841874e-05, + "loss": 0.6766, + "step": 14720 + }, + { + "epoch": 1.89, + "learning_rate": 1.6593520072286043e-05, + "loss": 0.6813, + "step": 14740 + }, + { + "epoch": 1.89, + "learning_rate": 1.620627339615335e-05, + "loss": 0.6793, + "step": 14760 + }, + { + "epoch": 1.9, + "learning_rate": 1.581902672002065e-05, + "loss": 0.6736, + "step": 14780 + }, + { + "epoch": 1.9, + "learning_rate": 1.5431780043887957e-05, + "loss": 0.6842, + "step": 14800 + }, + { + "epoch": 1.9, + "eval_loss": 0.6880614757537842, + "eval_runtime": 177.7541, + "eval_samples_per_second": 11.251, + "eval_steps_per_second": 1.406, + "step": 14800 + }, + { + "epoch": 1.9, + "learning_rate": 1.5044533367755258e-05, + "loss": 0.682, + "step": 14820 + }, + { + "epoch": 1.9, + "learning_rate": 1.4657286691622562e-05, + "loss": 0.6776, + "step": 14840 + }, + { + "epoch": 1.91, + "learning_rate": 1.4270040015489866e-05, + "loss": 0.6776, + "step": 14860 + }, + { + "epoch": 1.91, + "learning_rate": 1.3882793339357168e-05, + "loss": 0.6717, + "step": 14880 + }, + { + "epoch": 1.91, + "learning_rate": 1.3495546663224472e-05, + "loss": 0.6871, + "step": 14900 + }, + { + "epoch": 1.91, + "learning_rate": 1.3108299987091776e-05, + "loss": 0.6872, + "step": 14920 + }, + { + "epoch": 1.92, + "learning_rate": 1.272105331095908e-05, + "loss": 0.6833, + "step": 14940 + }, + { + "epoch": 1.92, + "learning_rate": 1.2333806634826383e-05, + "loss": 0.6926, + "step": 14960 + }, + { + "epoch": 1.92, + "learning_rate": 1.1946559958693687e-05, + "loss": 0.6741, + "step": 14980 + }, + { + "epoch": 1.92, + "learning_rate": 1.1559313282560991e-05, + "loss": 0.6756, + "step": 15000 + }, + { + "epoch": 1.92, + "eval_loss": 0.6879639625549316, + "eval_runtime": 177.6222, + "eval_samples_per_second": 11.26, + "eval_steps_per_second": 1.407, + "step": 15000 + }, + { + "epoch": 1.93, + "learning_rate": 1.1172066606428293e-05, + "loss": 0.6803, + "step": 15020 + }, + { + "epoch": 1.93, + "learning_rate": 1.0784819930295599e-05, + "loss": 0.6753, + "step": 15040 + }, + { + "epoch": 1.93, + "learning_rate": 1.0397573254162901e-05, + "loss": 0.6749, + "step": 15060 + }, + { + "epoch": 1.93, + "learning_rate": 1.0010326578030204e-05, + "loss": 0.6792, + "step": 15080 + }, + { + "epoch": 1.94, + "learning_rate": 9.623079901897508e-06, + "loss": 0.6798, + "step": 15100 + }, + { + "epoch": 1.94, + "learning_rate": 9.235833225764812e-06, + "loss": 0.6838, + "step": 15120 + }, + { + "epoch": 1.94, + "learning_rate": 8.848586549632114e-06, + "loss": 0.6785, + "step": 15140 + }, + { + "epoch": 1.94, + "learning_rate": 8.461339873499418e-06, + "loss": 0.6858, + "step": 15160 + }, + { + "epoch": 1.95, + "learning_rate": 8.074093197366722e-06, + "loss": 0.6768, + "step": 15180 + }, + { + "epoch": 1.95, + "learning_rate": 7.686846521234025e-06, + "loss": 0.6891, + "step": 15200 + }, + { + "epoch": 1.95, + "eval_loss": 0.6878132224082947, + "eval_runtime": 177.6408, + "eval_samples_per_second": 11.259, + "eval_steps_per_second": 1.407, + "step": 15200 + } + ], + "max_steps": 15594, + "num_train_epochs": 2, + "total_flos": 1.975803247494955e+19, + "trial_name": null, + "trial_params": null +}