diff --git "a/checkpoint-36300/trainer_state.json" "b/checkpoint-36300/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-36300/trainer_state.json" @@ -0,0 +1,16714 @@ +{ + "best_metric": 1.391266107559204, + "best_model_checkpoint": "/mnt/data1/sheshuaijie/Output/CoT/Trained/chinese-llama-plus-13b_chinese-cot+belle_data1M+alpaca_gpt4+instinwild_ch+HC3_huma+HC3_chatGPT_0.0002/lora/checkpoint-36300", + "epoch": 1.9395688066041517, + "global_step": 36300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 2.1378941742383755e-06, + "loss": 1.4783, + "step": 20 + }, + { + "epoch": 0.0, + "learning_rate": 4.275788348476751e-06, + "loss": 1.4068, + "step": 40 + }, + { + "epoch": 0.0, + "eval_loss": 1.4569315910339355, + "eval_runtime": 49.7589, + "eval_samples_per_second": 60.291, + "eval_steps_per_second": 1.889, + "step": 50 + }, + { + "epoch": 0.0, + "learning_rate": 6.413682522715125e-06, + "loss": 1.4169, + "step": 60 + }, + { + "epoch": 0.0, + "learning_rate": 8.551576696953502e-06, + "loss": 1.4038, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 1.0689470871191876e-05, + "loss": 1.3771, + "step": 100 + }, + { + "epoch": 0.01, + "eval_loss": 1.4323030710220337, + "eval_runtime": 49.9207, + "eval_samples_per_second": 60.095, + "eval_steps_per_second": 1.883, + "step": 100 + }, + { + "epoch": 0.01, + "learning_rate": 1.282736504543025e-05, + "loss": 1.3784, + "step": 120 + }, + { + "epoch": 0.01, + "learning_rate": 1.4965259219668627e-05, + "loss": 1.4263, + "step": 140 + }, + { + "epoch": 0.01, + "eval_loss": 1.427394151687622, + "eval_runtime": 49.9421, + "eval_samples_per_second": 60.07, + "eval_steps_per_second": 1.882, + "step": 150 + }, + { + "epoch": 0.01, + "learning_rate": 1.7103153393907004e-05, + "loss": 1.3669, + "step": 160 + }, + { + "epoch": 0.01, + "learning_rate": 1.924104756814538e-05, + "loss": 1.4239, + "step": 180 + }, + { + "epoch": 0.01, + "learning_rate": 2.1378941742383753e-05, + "loss": 1.3658, + "step": 200 + }, + { + "epoch": 0.01, + "eval_loss": 1.4236927032470703, + "eval_runtime": 49.8789, + "eval_samples_per_second": 60.146, + "eval_steps_per_second": 1.885, + "step": 200 + }, + { + "epoch": 0.01, + "learning_rate": 2.351683591662213e-05, + "loss": 1.3947, + "step": 220 + }, + { + "epoch": 0.01, + "learning_rate": 2.56547300908605e-05, + "loss": 1.3993, + "step": 240 + }, + { + "epoch": 0.01, + "eval_loss": 1.4210280179977417, + "eval_runtime": 49.958, + "eval_samples_per_second": 60.05, + "eval_steps_per_second": 1.882, + "step": 250 + }, + { + "epoch": 0.01, + "learning_rate": 2.7792624265098877e-05, + "loss": 1.3769, + "step": 260 + }, + { + "epoch": 0.01, + "learning_rate": 2.9930518439337253e-05, + "loss": 1.3947, + "step": 280 + }, + { + "epoch": 0.02, + "learning_rate": 3.206841261357563e-05, + "loss": 1.4067, + "step": 300 + }, + { + "epoch": 0.02, + "eval_loss": 1.4191926717758179, + "eval_runtime": 49.9003, + "eval_samples_per_second": 60.12, + "eval_steps_per_second": 1.884, + "step": 300 + }, + { + "epoch": 0.02, + "learning_rate": 3.420630678781401e-05, + "loss": 1.3541, + "step": 320 + }, + { + "epoch": 0.02, + "learning_rate": 3.634420096205238e-05, + "loss": 1.3919, + "step": 340 + }, + { + "epoch": 0.02, + "eval_loss": 1.4174752235412598, + "eval_runtime": 49.9563, + "eval_samples_per_second": 60.053, + "eval_steps_per_second": 1.882, + "step": 350 + }, + { + "epoch": 0.02, + "learning_rate": 3.848209513629076e-05, + "loss": 1.3776, + "step": 360 + }, + { + "epoch": 0.02, + "learning_rate": 4.061998931052913e-05, + "loss": 1.4259, + "step": 380 + }, + { + "epoch": 0.02, + "learning_rate": 4.2757883484767506e-05, + "loss": 1.3679, + "step": 400 + }, + { + "epoch": 0.02, + "eval_loss": 1.4158518314361572, + "eval_runtime": 49.8513, + "eval_samples_per_second": 60.179, + "eval_steps_per_second": 1.886, + "step": 400 + }, + { + "epoch": 0.02, + "learning_rate": 4.4895777659005885e-05, + "loss": 1.3791, + "step": 420 + }, + { + "epoch": 0.02, + "learning_rate": 4.703367183324426e-05, + "loss": 1.3695, + "step": 440 + }, + { + "epoch": 0.02, + "eval_loss": 1.4145797491073608, + "eval_runtime": 49.8906, + "eval_samples_per_second": 60.132, + "eval_steps_per_second": 1.884, + "step": 450 + }, + { + "epoch": 0.02, + "learning_rate": 4.917156600748263e-05, + "loss": 1.3826, + "step": 460 + }, + { + "epoch": 0.03, + "learning_rate": 5.1309460181721e-05, + "loss": 1.3851, + "step": 480 + }, + { + "epoch": 0.03, + "learning_rate": 5.3447354355959376e-05, + "loss": 1.3888, + "step": 500 + }, + { + "epoch": 0.03, + "eval_loss": 1.4140371084213257, + "eval_runtime": 49.9418, + "eval_samples_per_second": 60.07, + "eval_steps_per_second": 1.882, + "step": 500 + }, + { + "epoch": 0.03, + "learning_rate": 5.5585248530197755e-05, + "loss": 1.3633, + "step": 520 + }, + { + "epoch": 0.03, + "learning_rate": 5.772314270443613e-05, + "loss": 1.4141, + "step": 540 + }, + { + "epoch": 0.03, + "eval_loss": 1.4142972230911255, + "eval_runtime": 49.9711, + "eval_samples_per_second": 60.035, + "eval_steps_per_second": 1.881, + "step": 550 + }, + { + "epoch": 0.03, + "learning_rate": 5.986103687867451e-05, + "loss": 1.3633, + "step": 560 + }, + { + "epoch": 0.03, + "learning_rate": 6.199893105291288e-05, + "loss": 1.3659, + "step": 580 + }, + { + "epoch": 0.03, + "learning_rate": 6.413682522715126e-05, + "loss": 1.4105, + "step": 600 + }, + { + "epoch": 0.03, + "eval_loss": 1.4129570722579956, + "eval_runtime": 49.9129, + "eval_samples_per_second": 60.105, + "eval_steps_per_second": 1.883, + "step": 600 + }, + { + "epoch": 0.03, + "learning_rate": 6.627471940138962e-05, + "loss": 1.3381, + "step": 620 + }, + { + "epoch": 0.03, + "learning_rate": 6.841261357562802e-05, + "loss": 1.3776, + "step": 640 + }, + { + "epoch": 0.03, + "eval_loss": 1.4125068187713623, + "eval_runtime": 49.9267, + "eval_samples_per_second": 60.088, + "eval_steps_per_second": 1.883, + "step": 650 + }, + { + "epoch": 0.04, + "learning_rate": 7.05505077498664e-05, + "loss": 1.3704, + "step": 660 + }, + { + "epoch": 0.04, + "learning_rate": 7.268840192410476e-05, + "loss": 1.3755, + "step": 680 + }, + { + "epoch": 0.04, + "learning_rate": 7.482629609834314e-05, + "loss": 1.3866, + "step": 700 + }, + { + "epoch": 0.04, + "eval_loss": 1.411845088005066, + "eval_runtime": 49.8796, + "eval_samples_per_second": 60.145, + "eval_steps_per_second": 1.885, + "step": 700 + }, + { + "epoch": 0.04, + "learning_rate": 7.696419027258152e-05, + "loss": 1.3516, + "step": 720 + }, + { + "epoch": 0.04, + "learning_rate": 7.910208444681989e-05, + "loss": 1.3781, + "step": 740 + }, + { + "epoch": 0.04, + "eval_loss": 1.4121510982513428, + "eval_runtime": 49.9262, + "eval_samples_per_second": 60.089, + "eval_steps_per_second": 1.883, + "step": 750 + }, + { + "epoch": 0.04, + "learning_rate": 8.123997862105827e-05, + "loss": 1.3963, + "step": 760 + }, + { + "epoch": 0.04, + "learning_rate": 8.337787279529665e-05, + "loss": 1.3803, + "step": 780 + }, + { + "epoch": 0.04, + "learning_rate": 8.551576696953501e-05, + "loss": 1.3752, + "step": 800 + }, + { + "epoch": 0.04, + "eval_loss": 1.4118362665176392, + "eval_runtime": 49.9268, + "eval_samples_per_second": 60.088, + "eval_steps_per_second": 1.883, + "step": 800 + }, + { + "epoch": 0.04, + "learning_rate": 8.765366114377339e-05, + "loss": 1.401, + "step": 820 + }, + { + "epoch": 0.04, + "learning_rate": 8.979155531801177e-05, + "loss": 1.3748, + "step": 840 + }, + { + "epoch": 0.05, + "eval_loss": 1.4110814332962036, + "eval_runtime": 49.9302, + "eval_samples_per_second": 60.084, + "eval_steps_per_second": 1.883, + "step": 850 + }, + { + "epoch": 0.05, + "learning_rate": 9.192944949225014e-05, + "loss": 1.3708, + "step": 860 + }, + { + "epoch": 0.05, + "learning_rate": 9.406734366648852e-05, + "loss": 1.408, + "step": 880 + }, + { + "epoch": 0.05, + "learning_rate": 9.62052378407269e-05, + "loss": 1.4088, + "step": 900 + }, + { + "epoch": 0.05, + "eval_loss": 1.4106462001800537, + "eval_runtime": 49.927, + "eval_samples_per_second": 60.088, + "eval_steps_per_second": 1.883, + "step": 900 + }, + { + "epoch": 0.05, + "learning_rate": 9.834313201496526e-05, + "loss": 1.4126, + "step": 920 + }, + { + "epoch": 0.05, + "learning_rate": 0.00010048102618920363, + "loss": 1.3812, + "step": 940 + }, + { + "epoch": 0.05, + "eval_loss": 1.4106441736221313, + "eval_runtime": 49.9373, + "eval_samples_per_second": 60.075, + "eval_steps_per_second": 1.882, + "step": 950 + }, + { + "epoch": 0.05, + "learning_rate": 0.000102618920363442, + "loss": 1.3703, + "step": 960 + }, + { + "epoch": 0.05, + "learning_rate": 0.00010475681453768039, + "loss": 1.3686, + "step": 980 + }, + { + "epoch": 0.05, + "learning_rate": 0.00010689470871191875, + "loss": 1.3659, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_loss": 1.4101723432540894, + "eval_runtime": 49.9126, + "eval_samples_per_second": 60.105, + "eval_steps_per_second": 1.883, + "step": 1000 + }, + { + "epoch": 0.05, + "learning_rate": 0.00010903260288615713, + "loss": 1.4065, + "step": 1020 + }, + { + "epoch": 0.06, + "learning_rate": 0.00011117049706039551, + "loss": 1.3392, + "step": 1040 + }, + { + "epoch": 0.06, + "eval_loss": 1.410170555114746, + "eval_runtime": 49.9495, + "eval_samples_per_second": 60.061, + "eval_steps_per_second": 1.882, + "step": 1050 + }, + { + "epoch": 0.06, + "learning_rate": 0.00011330839123463388, + "loss": 1.3471, + "step": 1060 + }, + { + "epoch": 0.06, + "learning_rate": 0.00011544628540887226, + "loss": 1.3389, + "step": 1080 + }, + { + "epoch": 0.06, + "learning_rate": 0.00011758417958311063, + "loss": 1.4075, + "step": 1100 + }, + { + "epoch": 0.06, + "eval_loss": 1.4099453687667847, + "eval_runtime": 49.7526, + "eval_samples_per_second": 60.298, + "eval_steps_per_second": 1.889, + "step": 1100 + }, + { + "epoch": 0.06, + "learning_rate": 0.00011972207375734901, + "loss": 1.3654, + "step": 1120 + }, + { + "epoch": 0.06, + "learning_rate": 0.00012185996793158738, + "loss": 1.3927, + "step": 1140 + }, + { + "epoch": 0.06, + "eval_loss": 1.409054160118103, + "eval_runtime": 49.9406, + "eval_samples_per_second": 60.071, + "eval_steps_per_second": 1.882, + "step": 1150 + }, + { + "epoch": 0.06, + "learning_rate": 0.00012399786210582576, + "loss": 1.3952, + "step": 1160 + }, + { + "epoch": 0.06, + "learning_rate": 0.00012613575628006414, + "loss": 1.361, + "step": 1180 + }, + { + "epoch": 0.06, + "learning_rate": 0.00012827365045430252, + "loss": 1.3464, + "step": 1200 + }, + { + "epoch": 0.06, + "eval_loss": 1.408968448638916, + "eval_runtime": 49.8975, + "eval_samples_per_second": 60.123, + "eval_steps_per_second": 1.884, + "step": 1200 + }, + { + "epoch": 0.07, + "learning_rate": 0.00013041154462854087, + "loss": 1.4216, + "step": 1220 + }, + { + "epoch": 0.07, + "learning_rate": 0.00013254943880277925, + "loss": 1.3608, + "step": 1240 + }, + { + "epoch": 0.07, + "eval_loss": 1.4092673063278198, + "eval_runtime": 49.923, + "eval_samples_per_second": 60.093, + "eval_steps_per_second": 1.883, + "step": 1250 + }, + { + "epoch": 0.07, + "learning_rate": 0.00013468733297701766, + "loss": 1.375, + "step": 1260 + }, + { + "epoch": 0.07, + "learning_rate": 0.00013682522715125604, + "loss": 1.359, + "step": 1280 + }, + { + "epoch": 0.07, + "learning_rate": 0.00013896312132549441, + "loss": 1.354, + "step": 1300 + }, + { + "epoch": 0.07, + "eval_loss": 1.4099416732788086, + "eval_runtime": 49.8935, + "eval_samples_per_second": 60.128, + "eval_steps_per_second": 1.884, + "step": 1300 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001411010154997328, + "loss": 1.3792, + "step": 1320 + }, + { + "epoch": 0.07, + "learning_rate": 0.00014323890967397115, + "loss": 1.3666, + "step": 1340 + }, + { + "epoch": 0.07, + "eval_loss": 1.4085068702697754, + "eval_runtime": 49.9387, + "eval_samples_per_second": 60.074, + "eval_steps_per_second": 1.882, + "step": 1350 + }, + { + "epoch": 0.07, + "learning_rate": 0.00014537680384820953, + "loss": 1.3648, + "step": 1360 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001475146980224479, + "loss": 1.3749, + "step": 1380 + }, + { + "epoch": 0.07, + "learning_rate": 0.00014965259219668628, + "loss": 1.3646, + "step": 1400 + }, + { + "epoch": 0.07, + "eval_loss": 1.4088290929794312, + "eval_runtime": 49.9166, + "eval_samples_per_second": 60.1, + "eval_steps_per_second": 1.883, + "step": 1400 + }, + { + "epoch": 0.08, + "learning_rate": 0.00015179048637092466, + "loss": 1.3961, + "step": 1420 + }, + { + "epoch": 0.08, + "learning_rate": 0.00015392838054516304, + "loss": 1.3936, + "step": 1440 + }, + { + "epoch": 0.08, + "eval_loss": 1.408731460571289, + "eval_runtime": 49.9201, + "eval_samples_per_second": 60.096, + "eval_steps_per_second": 1.883, + "step": 1450 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001560662747194014, + "loss": 1.3706, + "step": 1460 + }, + { + "epoch": 0.08, + "learning_rate": 0.00015820416889363977, + "loss": 1.3484, + "step": 1480 + }, + { + "epoch": 0.08, + "learning_rate": 0.00016034206306787815, + "loss": 1.3926, + "step": 1500 + }, + { + "epoch": 0.08, + "eval_loss": 1.4091012477874756, + "eval_runtime": 49.8678, + "eval_samples_per_second": 60.159, + "eval_steps_per_second": 1.885, + "step": 1500 + }, + { + "epoch": 0.08, + "learning_rate": 0.00016247995724211653, + "loss": 1.3689, + "step": 1520 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001646178514163549, + "loss": 1.3209, + "step": 1540 + }, + { + "epoch": 0.08, + "eval_loss": 1.4094237089157104, + "eval_runtime": 49.9507, + "eval_samples_per_second": 60.059, + "eval_steps_per_second": 1.882, + "step": 1550 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001667557455905933, + "loss": 1.36, + "step": 1560 + }, + { + "epoch": 0.08, + "learning_rate": 0.00016889363976483164, + "loss": 1.3839, + "step": 1580 + }, + { + "epoch": 0.09, + "learning_rate": 0.00017103153393907002, + "loss": 1.3898, + "step": 1600 + }, + { + "epoch": 0.09, + "eval_loss": 1.4086543321609497, + "eval_runtime": 49.8979, + "eval_samples_per_second": 60.123, + "eval_steps_per_second": 1.884, + "step": 1600 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001731694281133084, + "loss": 1.359, + "step": 1620 + }, + { + "epoch": 0.09, + "learning_rate": 0.00017530732228754678, + "loss": 1.3937, + "step": 1640 + }, + { + "epoch": 0.09, + "eval_loss": 1.4081257581710815, + "eval_runtime": 49.9291, + "eval_samples_per_second": 60.085, + "eval_steps_per_second": 1.883, + "step": 1650 + }, + { + "epoch": 0.09, + "learning_rate": 0.00017744521646178516, + "loss": 1.3744, + "step": 1660 + }, + { + "epoch": 0.09, + "learning_rate": 0.00017958311063602354, + "loss": 1.3738, + "step": 1680 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018172100481026192, + "loss": 1.3589, + "step": 1700 + }, + { + "epoch": 0.09, + "eval_loss": 1.4081943035125732, + "eval_runtime": 49.8275, + "eval_samples_per_second": 60.208, + "eval_steps_per_second": 1.887, + "step": 1700 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018385889898450027, + "loss": 1.396, + "step": 1720 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018599679315873865, + "loss": 1.3503, + "step": 1740 + }, + { + "epoch": 0.09, + "eval_loss": 1.4091119766235352, + "eval_runtime": 49.9328, + "eval_samples_per_second": 60.081, + "eval_steps_per_second": 1.883, + "step": 1750 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018813468733297703, + "loss": 1.3775, + "step": 1760 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001902725815072154, + "loss": 1.4194, + "step": 1780 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001924104756814538, + "loss": 1.3852, + "step": 1800 + }, + { + "epoch": 0.1, + "eval_loss": 1.4091593027114868, + "eval_runtime": 49.8852, + "eval_samples_per_second": 60.138, + "eval_steps_per_second": 1.884, + "step": 1800 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019454836985569217, + "loss": 1.389, + "step": 1820 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019668626402993052, + "loss": 1.3605, + "step": 1840 + }, + { + "epoch": 0.1, + "eval_loss": 1.409116268157959, + "eval_runtime": 49.9249, + "eval_samples_per_second": 60.09, + "eval_steps_per_second": 1.883, + "step": 1850 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001988241582041689, + "loss": 1.3712, + "step": 1860 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019999516686401974, + "loss": 1.3866, + "step": 1880 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019998442656184132, + "loss": 1.4269, + "step": 1900 + }, + { + "epoch": 0.1, + "eval_loss": 1.4094537496566772, + "eval_runtime": 49.8922, + "eval_samples_per_second": 60.13, + "eval_steps_per_second": 1.884, + "step": 1900 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019997368625966294, + "loss": 1.3613, + "step": 1920 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019996294595748452, + "loss": 1.3664, + "step": 1940 + }, + { + "epoch": 0.1, + "eval_loss": 1.409331202507019, + "eval_runtime": 49.9367, + "eval_samples_per_second": 60.076, + "eval_steps_per_second": 1.882, + "step": 1950 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001999522056553061, + "loss": 1.3729, + "step": 1960 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019994146535312772, + "loss": 1.4009, + "step": 1980 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001999307250509493, + "loss": 1.373, + "step": 2000 + }, + { + "epoch": 0.11, + "eval_loss": 1.4088104963302612, + "eval_runtime": 49.91, + "eval_samples_per_second": 60.108, + "eval_steps_per_second": 1.883, + "step": 2000 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019991998474877092, + "loss": 1.4176, + "step": 2020 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001999092444465925, + "loss": 1.3791, + "step": 2040 + }, + { + "epoch": 0.11, + "eval_loss": 1.4090261459350586, + "eval_runtime": 49.9322, + "eval_samples_per_second": 60.082, + "eval_steps_per_second": 1.883, + "step": 2050 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019989850414441412, + "loss": 1.3044, + "step": 2060 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001998877638422357, + "loss": 1.3906, + "step": 2080 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001998770235400573, + "loss": 1.3836, + "step": 2100 + }, + { + "epoch": 0.11, + "eval_loss": 1.4082199335098267, + "eval_runtime": 49.9048, + "eval_samples_per_second": 60.114, + "eval_steps_per_second": 1.884, + "step": 2100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001998662832378789, + "loss": 1.3652, + "step": 2120 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001998555429357005, + "loss": 1.3387, + "step": 2140 + }, + { + "epoch": 0.11, + "eval_loss": 1.40771484375, + "eval_runtime": 49.9276, + "eval_samples_per_second": 60.087, + "eval_steps_per_second": 1.883, + "step": 2150 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001998448026335221, + "loss": 1.3131, + "step": 2160 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001998340623313437, + "loss": 1.3854, + "step": 2180 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019982332202916528, + "loss": 1.3528, + "step": 2200 + }, + { + "epoch": 0.12, + "eval_loss": 1.4079679250717163, + "eval_runtime": 49.8878, + "eval_samples_per_second": 60.135, + "eval_steps_per_second": 1.884, + "step": 2200 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001998125817269869, + "loss": 1.3839, + "step": 2220 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019980184142480848, + "loss": 1.3316, + "step": 2240 + }, + { + "epoch": 0.12, + "eval_loss": 1.4077861309051514, + "eval_runtime": 49.923, + "eval_samples_per_second": 60.093, + "eval_steps_per_second": 1.883, + "step": 2250 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001997911011226301, + "loss": 1.4085, + "step": 2260 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019978036082045168, + "loss": 1.4099, + "step": 2280 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019976962051827327, + "loss": 1.3541, + "step": 2300 + }, + { + "epoch": 0.12, + "eval_loss": 1.4079844951629639, + "eval_runtime": 49.8734, + "eval_samples_per_second": 60.152, + "eval_steps_per_second": 1.885, + "step": 2300 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019975888021609488, + "loss": 1.3947, + "step": 2320 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019974813991391647, + "loss": 1.3833, + "step": 2340 + }, + { + "epoch": 0.13, + "eval_loss": 1.4079506397247314, + "eval_runtime": 49.9735, + "eval_samples_per_second": 60.032, + "eval_steps_per_second": 1.881, + "step": 2350 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019973739961173808, + "loss": 1.3379, + "step": 2360 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001997266593095597, + "loss": 1.3665, + "step": 2380 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019971591900738128, + "loss": 1.3738, + "step": 2400 + }, + { + "epoch": 0.13, + "eval_loss": 1.4071325063705444, + "eval_runtime": 49.8925, + "eval_samples_per_second": 60.129, + "eval_steps_per_second": 1.884, + "step": 2400 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001997051787052029, + "loss": 1.3443, + "step": 2420 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019969443840302448, + "loss": 1.3573, + "step": 2440 + }, + { + "epoch": 0.13, + "eval_loss": 1.4071837663650513, + "eval_runtime": 49.9695, + "eval_samples_per_second": 60.037, + "eval_steps_per_second": 1.881, + "step": 2450 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019968369810084607, + "loss": 1.3498, + "step": 2460 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019967295779866768, + "loss": 1.3629, + "step": 2480 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019966221749648927, + "loss": 1.3734, + "step": 2500 + }, + { + "epoch": 0.13, + "eval_loss": 1.407920241355896, + "eval_runtime": 49.8461, + "eval_samples_per_second": 60.185, + "eval_steps_per_second": 1.886, + "step": 2500 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019965147719431088, + "loss": 1.3626, + "step": 2520 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019964073689213247, + "loss": 1.3672, + "step": 2540 + }, + { + "epoch": 0.14, + "eval_loss": 1.4076189994812012, + "eval_runtime": 49.9079, + "eval_samples_per_second": 60.111, + "eval_steps_per_second": 1.883, + "step": 2550 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019962999658995408, + "loss": 1.395, + "step": 2560 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019961925628777567, + "loss": 1.3628, + "step": 2580 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019960851598559726, + "loss": 1.3804, + "step": 2600 + }, + { + "epoch": 0.14, + "eval_loss": 1.4080127477645874, + "eval_runtime": 49.891, + "eval_samples_per_second": 60.131, + "eval_steps_per_second": 1.884, + "step": 2600 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019959777568341887, + "loss": 1.3555, + "step": 2620 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019958703538124046, + "loss": 1.3725, + "step": 2640 + }, + { + "epoch": 0.14, + "eval_loss": 1.4076472520828247, + "eval_runtime": 49.9139, + "eval_samples_per_second": 60.104, + "eval_steps_per_second": 1.883, + "step": 2650 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019957629507906207, + "loss": 1.3865, + "step": 2660 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019956555477688366, + "loss": 1.3725, + "step": 2680 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019955481447470524, + "loss": 1.3654, + "step": 2700 + }, + { + "epoch": 0.14, + "eval_loss": 1.4083791971206665, + "eval_runtime": 49.871, + "eval_samples_per_second": 60.155, + "eval_steps_per_second": 1.885, + "step": 2700 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019954407417252686, + "loss": 1.4114, + "step": 2720 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019953333387034844, + "loss": 1.3674, + "step": 2740 + }, + { + "epoch": 0.15, + "eval_loss": 1.407729148864746, + "eval_runtime": 49.8891, + "eval_samples_per_second": 60.133, + "eval_steps_per_second": 1.884, + "step": 2750 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019952259356817006, + "loss": 1.3409, + "step": 2760 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019951185326599164, + "loss": 1.3983, + "step": 2780 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019950111296381323, + "loss": 1.4033, + "step": 2800 + }, + { + "epoch": 0.15, + "eval_loss": 1.4079221487045288, + "eval_runtime": 49.8679, + "eval_samples_per_second": 60.159, + "eval_steps_per_second": 1.885, + "step": 2800 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019949037266163484, + "loss": 1.3811, + "step": 2820 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019947963235945643, + "loss": 1.3869, + "step": 2840 + }, + { + "epoch": 0.15, + "eval_loss": 1.4070159196853638, + "eval_runtime": 49.8875, + "eval_samples_per_second": 60.135, + "eval_steps_per_second": 1.884, + "step": 2850 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019946889205727804, + "loss": 1.3576, + "step": 2860 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019945815175509963, + "loss": 1.3872, + "step": 2880 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019944741145292122, + "loss": 1.3753, + "step": 2900 + }, + { + "epoch": 0.15, + "eval_loss": 1.4078199863433838, + "eval_runtime": 49.9188, + "eval_samples_per_second": 60.098, + "eval_steps_per_second": 1.883, + "step": 2900 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019943667115074283, + "loss": 1.357, + "step": 2920 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019942646786367337, + "loss": 1.3749, + "step": 2940 + }, + { + "epoch": 0.16, + "eval_loss": 1.4070653915405273, + "eval_runtime": 49.9325, + "eval_samples_per_second": 60.081, + "eval_steps_per_second": 1.883, + "step": 2950 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019941572756149495, + "loss": 1.3302, + "step": 2960 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019940498725931657, + "loss": 1.3303, + "step": 2980 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019939424695713815, + "loss": 1.3634, + "step": 3000 + }, + { + "epoch": 0.16, + "eval_loss": 1.4064579010009766, + "eval_runtime": 49.8564, + "eval_samples_per_second": 60.173, + "eval_steps_per_second": 1.885, + "step": 3000 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019938350665495974, + "loss": 1.3598, + "step": 3020 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019937276635278135, + "loss": 1.3385, + "step": 3040 + }, + { + "epoch": 0.16, + "eval_loss": 1.4077314138412476, + "eval_runtime": 49.9, + "eval_samples_per_second": 60.12, + "eval_steps_per_second": 1.884, + "step": 3050 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019936202605060294, + "loss": 1.3799, + "step": 3060 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019935128574842455, + "loss": 1.3893, + "step": 3080 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019934054544624614, + "loss": 1.3544, + "step": 3100 + }, + { + "epoch": 0.17, + "eval_loss": 1.4063854217529297, + "eval_runtime": 49.8543, + "eval_samples_per_second": 60.175, + "eval_steps_per_second": 1.885, + "step": 3100 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019932980514406772, + "loss": 1.3689, + "step": 3120 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019931906484188934, + "loss": 1.3925, + "step": 3140 + }, + { + "epoch": 0.17, + "eval_loss": 1.4067975282669067, + "eval_runtime": 49.9106, + "eval_samples_per_second": 60.107, + "eval_steps_per_second": 1.883, + "step": 3150 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019930832453971092, + "loss": 1.3536, + "step": 3160 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019929758423753254, + "loss": 1.3813, + "step": 3180 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019928684393535412, + "loss": 1.357, + "step": 3200 + }, + { + "epoch": 0.17, + "eval_loss": 1.4065169095993042, + "eval_runtime": 49.8559, + "eval_samples_per_second": 60.173, + "eval_steps_per_second": 1.885, + "step": 3200 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019927610363317574, + "loss": 1.3642, + "step": 3220 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019926536333099732, + "loss": 1.4115, + "step": 3240 + }, + { + "epoch": 0.17, + "eval_loss": 1.4065427780151367, + "eval_runtime": 49.9229, + "eval_samples_per_second": 60.093, + "eval_steps_per_second": 1.883, + "step": 3250 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001992546230288189, + "loss": 1.37, + "step": 3260 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019924388272664052, + "loss": 1.3934, + "step": 3280 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001992331424244621, + "loss": 1.358, + "step": 3300 + }, + { + "epoch": 0.18, + "eval_loss": 1.4061527252197266, + "eval_runtime": 49.8613, + "eval_samples_per_second": 60.167, + "eval_steps_per_second": 1.885, + "step": 3300 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019922240212228372, + "loss": 1.4362, + "step": 3320 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001992116618201053, + "loss": 1.3564, + "step": 3340 + }, + { + "epoch": 0.18, + "eval_loss": 1.4055869579315186, + "eval_runtime": 49.9231, + "eval_samples_per_second": 60.092, + "eval_steps_per_second": 1.883, + "step": 3350 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001992009215179269, + "loss": 1.3623, + "step": 3360 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001991901812157485, + "loss": 1.3585, + "step": 3380 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001991794409135701, + "loss": 1.3873, + "step": 3400 + }, + { + "epoch": 0.18, + "eval_loss": 1.4059674739837646, + "eval_runtime": 49.8966, + "eval_samples_per_second": 60.124, + "eval_steps_per_second": 1.884, + "step": 3400 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001991687006113917, + "loss": 1.3368, + "step": 3420 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001991579603092133, + "loss": 1.3468, + "step": 3440 + }, + { + "epoch": 0.18, + "eval_loss": 1.405554175376892, + "eval_runtime": 49.9447, + "eval_samples_per_second": 60.066, + "eval_steps_per_second": 1.882, + "step": 3450 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001991472200070349, + "loss": 1.3725, + "step": 3460 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019913647970485652, + "loss": 1.3846, + "step": 3480 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001991257394026781, + "loss": 1.4053, + "step": 3500 + }, + { + "epoch": 0.19, + "eval_loss": 1.405627727508545, + "eval_runtime": 49.8796, + "eval_samples_per_second": 60.145, + "eval_steps_per_second": 1.885, + "step": 3500 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001991149991004997, + "loss": 1.3681, + "step": 3520 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001991042587983213, + "loss": 1.3415, + "step": 3540 + }, + { + "epoch": 0.19, + "eval_loss": 1.4059925079345703, + "eval_runtime": 49.9311, + "eval_samples_per_second": 60.083, + "eval_steps_per_second": 1.883, + "step": 3550 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001990935184961429, + "loss": 1.3109, + "step": 3560 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001990827781939645, + "loss": 1.38, + "step": 3580 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001990720378917861, + "loss": 1.3641, + "step": 3600 + }, + { + "epoch": 0.19, + "eval_loss": 1.4058605432510376, + "eval_runtime": 49.9312, + "eval_samples_per_second": 60.083, + "eval_steps_per_second": 1.883, + "step": 3600 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001990612975896077, + "loss": 1.3655, + "step": 3620 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001990505572874293, + "loss": 1.3605, + "step": 3640 + }, + { + "epoch": 0.2, + "eval_loss": 1.4055465459823608, + "eval_runtime": 49.9566, + "eval_samples_per_second": 60.052, + "eval_steps_per_second": 1.882, + "step": 3650 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019903981698525088, + "loss": 1.386, + "step": 3660 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001990290766830725, + "loss": 1.3529, + "step": 3680 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019901833638089408, + "loss": 1.3937, + "step": 3700 + }, + { + "epoch": 0.2, + "eval_loss": 1.4052441120147705, + "eval_runtime": 49.9264, + "eval_samples_per_second": 60.088, + "eval_steps_per_second": 1.883, + "step": 3700 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001990075960787157, + "loss": 1.3474, + "step": 3720 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019899685577653728, + "loss": 1.3422, + "step": 3740 + }, + { + "epoch": 0.2, + "eval_loss": 1.404908537864685, + "eval_runtime": 49.9154, + "eval_samples_per_second": 60.102, + "eval_steps_per_second": 1.883, + "step": 3750 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019898611547435887, + "loss": 1.341, + "step": 3760 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019897537517218048, + "loss": 1.3448, + "step": 3780 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019896463487000207, + "loss": 1.3649, + "step": 3800 + }, + { + "epoch": 0.2, + "eval_loss": 1.4054372310638428, + "eval_runtime": 49.8885, + "eval_samples_per_second": 60.134, + "eval_steps_per_second": 1.884, + "step": 3800 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019895389456782368, + "loss": 1.3718, + "step": 3820 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019894315426564527, + "loss": 1.3241, + "step": 3840 + }, + { + "epoch": 0.21, + "eval_loss": 1.4053229093551636, + "eval_runtime": 49.9033, + "eval_samples_per_second": 60.116, + "eval_steps_per_second": 1.884, + "step": 3850 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019893241396346686, + "loss": 1.3723, + "step": 3860 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019892167366128847, + "loss": 1.4031, + "step": 3880 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019891093335911006, + "loss": 1.3731, + "step": 3900 + }, + { + "epoch": 0.21, + "eval_loss": 1.405073881149292, + "eval_runtime": 49.8804, + "eval_samples_per_second": 60.144, + "eval_steps_per_second": 1.885, + "step": 3900 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019890019305693167, + "loss": 1.3892, + "step": 3920 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019888945275475326, + "loss": 1.3396, + "step": 3940 + }, + { + "epoch": 0.21, + "eval_loss": 1.4052538871765137, + "eval_runtime": 49.9053, + "eval_samples_per_second": 60.114, + "eval_steps_per_second": 1.884, + "step": 3950 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019887871245257484, + "loss": 1.3367, + "step": 3960 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019886797215039646, + "loss": 1.3634, + "step": 3980 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019885723184821804, + "loss": 1.3573, + "step": 4000 + }, + { + "epoch": 0.21, + "eval_loss": 1.4054239988327026, + "eval_runtime": 49.8607, + "eval_samples_per_second": 60.168, + "eval_steps_per_second": 1.885, + "step": 4000 + }, + { + "epoch": 0.21, + "learning_rate": 0.00019884649154603966, + "loss": 1.374, + "step": 4020 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019883575124386124, + "loss": 1.3274, + "step": 4040 + }, + { + "epoch": 0.22, + "eval_loss": 1.4054758548736572, + "eval_runtime": 49.9644, + "eval_samples_per_second": 60.043, + "eval_steps_per_second": 1.881, + "step": 4050 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019882501094168283, + "loss": 1.3344, + "step": 4060 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019881427063950447, + "loss": 1.4042, + "step": 4080 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019880353033732606, + "loss": 1.3657, + "step": 4100 + }, + { + "epoch": 0.22, + "eval_loss": 1.405276894569397, + "eval_runtime": 49.8923, + "eval_samples_per_second": 60.129, + "eval_steps_per_second": 1.884, + "step": 4100 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019879279003514767, + "loss": 1.4087, + "step": 4120 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019878204973296926, + "loss": 1.3755, + "step": 4140 + }, + { + "epoch": 0.22, + "eval_loss": 1.405236005783081, + "eval_runtime": 49.9488, + "eval_samples_per_second": 60.061, + "eval_steps_per_second": 1.882, + "step": 4150 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019877130943079084, + "loss": 1.3296, + "step": 4160 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019876056912861246, + "loss": 1.3761, + "step": 4180 + }, + { + "epoch": 0.22, + "learning_rate": 0.00019874982882643404, + "loss": 1.3692, + "step": 4200 + }, + { + "epoch": 0.22, + "eval_loss": 1.4048938751220703, + "eval_runtime": 49.8671, + "eval_samples_per_second": 60.16, + "eval_steps_per_second": 1.885, + "step": 4200 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019873908852425566, + "loss": 1.4076, + "step": 4220 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019872834822207724, + "loss": 1.3583, + "step": 4240 + }, + { + "epoch": 0.23, + "eval_loss": 1.405096411705017, + "eval_runtime": 49.9436, + "eval_samples_per_second": 60.068, + "eval_steps_per_second": 1.882, + "step": 4250 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019871760791989883, + "loss": 1.3269, + "step": 4260 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019870686761772044, + "loss": 1.3954, + "step": 4280 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019869612731554203, + "loss": 1.3728, + "step": 4300 + }, + { + "epoch": 0.23, + "eval_loss": 1.4051318168640137, + "eval_runtime": 49.9242, + "eval_samples_per_second": 60.091, + "eval_steps_per_second": 1.883, + "step": 4300 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019868538701336364, + "loss": 1.3818, + "step": 4320 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019867464671118523, + "loss": 1.3199, + "step": 4340 + }, + { + "epoch": 0.23, + "eval_loss": 1.40473473072052, + "eval_runtime": 49.9828, + "eval_samples_per_second": 60.021, + "eval_steps_per_second": 1.881, + "step": 4350 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019866390640900682, + "loss": 1.3595, + "step": 4360 + }, + { + "epoch": 0.23, + "learning_rate": 0.00019865316610682843, + "loss": 1.3777, + "step": 4380 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019864242580465002, + "loss": 1.3655, + "step": 4400 + }, + { + "epoch": 0.24, + "eval_loss": 1.4049779176712036, + "eval_runtime": 49.9218, + "eval_samples_per_second": 60.094, + "eval_steps_per_second": 1.883, + "step": 4400 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019863168550247163, + "loss": 1.4016, + "step": 4420 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019862094520029322, + "loss": 1.3786, + "step": 4440 + }, + { + "epoch": 0.24, + "eval_loss": 1.4047762155532837, + "eval_runtime": 49.9779, + "eval_samples_per_second": 60.027, + "eval_steps_per_second": 1.881, + "step": 4450 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001986102048981148, + "loss": 1.3883, + "step": 4460 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019859946459593642, + "loss": 1.3878, + "step": 4480 + }, + { + "epoch": 0.24, + "learning_rate": 0.000198588724293758, + "loss": 1.3631, + "step": 4500 + }, + { + "epoch": 0.24, + "eval_loss": 1.40459406375885, + "eval_runtime": 49.974, + "eval_samples_per_second": 60.031, + "eval_steps_per_second": 1.881, + "step": 4500 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019857798399157962, + "loss": 1.358, + "step": 4520 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001985672436894012, + "loss": 1.3183, + "step": 4540 + }, + { + "epoch": 0.24, + "eval_loss": 1.4047383069992065, + "eval_runtime": 49.97, + "eval_samples_per_second": 60.036, + "eval_steps_per_second": 1.881, + "step": 4550 + }, + { + "epoch": 0.24, + "learning_rate": 0.00019855650338722282, + "loss": 1.3327, + "step": 4560 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001985457630850444, + "loss": 1.341, + "step": 4580 + }, + { + "epoch": 0.25, + "learning_rate": 0.000198535022782866, + "loss": 1.3578, + "step": 4600 + }, + { + "epoch": 0.25, + "eval_loss": 1.4043097496032715, + "eval_runtime": 49.9216, + "eval_samples_per_second": 60.094, + "eval_steps_per_second": 1.883, + "step": 4600 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001985242824806876, + "loss": 1.3672, + "step": 4620 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001985135421785092, + "loss": 1.3401, + "step": 4640 + }, + { + "epoch": 0.25, + "eval_loss": 1.4051434993743896, + "eval_runtime": 49.9422, + "eval_samples_per_second": 60.069, + "eval_steps_per_second": 1.882, + "step": 4650 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001985028018763308, + "loss": 1.3944, + "step": 4660 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001984920615741524, + "loss": 1.3479, + "step": 4680 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019848132127197398, + "loss": 1.3385, + "step": 4700 + }, + { + "epoch": 0.25, + "eval_loss": 1.404536485671997, + "eval_runtime": 49.9602, + "eval_samples_per_second": 60.048, + "eval_steps_per_second": 1.881, + "step": 4700 + }, + { + "epoch": 0.25, + "learning_rate": 0.00019847058096979562, + "loss": 1.3895, + "step": 4720 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001984598406676172, + "loss": 1.3638, + "step": 4740 + }, + { + "epoch": 0.25, + "eval_loss": 1.4047491550445557, + "eval_runtime": 49.9567, + "eval_samples_per_second": 60.052, + "eval_steps_per_second": 1.882, + "step": 4750 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001984491003654388, + "loss": 1.3291, + "step": 4760 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001984383600632604, + "loss": 1.3697, + "step": 4780 + }, + { + "epoch": 0.26, + "learning_rate": 0.000198427619761082, + "loss": 1.3794, + "step": 4800 + }, + { + "epoch": 0.26, + "eval_loss": 1.4042659997940063, + "eval_runtime": 49.9395, + "eval_samples_per_second": 60.073, + "eval_steps_per_second": 1.882, + "step": 4800 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001984168794589036, + "loss": 1.3412, + "step": 4820 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001984061391567252, + "loss": 1.3181, + "step": 4840 + }, + { + "epoch": 0.26, + "eval_loss": 1.4045872688293457, + "eval_runtime": 49.9714, + "eval_samples_per_second": 60.034, + "eval_steps_per_second": 1.881, + "step": 4850 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019839539885454678, + "loss": 1.3461, + "step": 4860 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001983846585523684, + "loss": 1.3832, + "step": 4880 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019837391825018998, + "loss": 1.3673, + "step": 4900 + }, + { + "epoch": 0.26, + "eval_loss": 1.4039243459701538, + "eval_runtime": 49.9353, + "eval_samples_per_second": 60.078, + "eval_steps_per_second": 1.882, + "step": 4900 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001983631779480116, + "loss": 1.3735, + "step": 4920 + }, + { + "epoch": 0.26, + "learning_rate": 0.00019835243764583318, + "loss": 1.3978, + "step": 4940 + }, + { + "epoch": 0.26, + "eval_loss": 1.403692364692688, + "eval_runtime": 49.9655, + "eval_samples_per_second": 60.041, + "eval_steps_per_second": 1.881, + "step": 4950 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019834169734365476, + "loss": 1.3966, + "step": 4960 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019833095704147638, + "loss": 1.3589, + "step": 4980 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019832021673929796, + "loss": 1.3525, + "step": 5000 + }, + { + "epoch": 0.27, + "eval_loss": 1.4040664434432983, + "eval_runtime": 49.9201, + "eval_samples_per_second": 60.096, + "eval_steps_per_second": 1.883, + "step": 5000 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019830947643711958, + "loss": 1.3461, + "step": 5020 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019829873613494116, + "loss": 1.3542, + "step": 5040 + }, + { + "epoch": 0.27, + "eval_loss": 1.4037916660308838, + "eval_runtime": 49.9879, + "eval_samples_per_second": 60.015, + "eval_steps_per_second": 1.88, + "step": 5050 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019828799583276278, + "loss": 1.332, + "step": 5060 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019827725553058436, + "loss": 1.3698, + "step": 5080 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019826651522840595, + "loss": 1.3322, + "step": 5100 + }, + { + "epoch": 0.27, + "eval_loss": 1.403656244277954, + "eval_runtime": 49.9297, + "eval_samples_per_second": 60.085, + "eval_steps_per_second": 1.883, + "step": 5100 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019825577492622756, + "loss": 1.3376, + "step": 5120 + }, + { + "epoch": 0.27, + "learning_rate": 0.00019824503462404915, + "loss": 1.356, + "step": 5140 + }, + { + "epoch": 0.28, + "eval_loss": 1.4035804271697998, + "eval_runtime": 49.9959, + "eval_samples_per_second": 60.005, + "eval_steps_per_second": 1.88, + "step": 5150 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019823429432187076, + "loss": 1.3228, + "step": 5160 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019822355401969235, + "loss": 1.3784, + "step": 5180 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019821281371751394, + "loss": 1.3658, + "step": 5200 + }, + { + "epoch": 0.28, + "eval_loss": 1.403926134109497, + "eval_runtime": 49.9437, + "eval_samples_per_second": 60.068, + "eval_steps_per_second": 1.882, + "step": 5200 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019820207341533555, + "loss": 1.362, + "step": 5220 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019819133311315714, + "loss": 1.3549, + "step": 5240 + }, + { + "epoch": 0.28, + "eval_loss": 1.4029345512390137, + "eval_runtime": 49.9979, + "eval_samples_per_second": 60.003, + "eval_steps_per_second": 1.88, + "step": 5250 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019818059281097875, + "loss": 1.3809, + "step": 5260 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019816985250880034, + "loss": 1.3573, + "step": 5280 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019815911220662192, + "loss": 1.361, + "step": 5300 + }, + { + "epoch": 0.28, + "eval_loss": 1.4024875164031982, + "eval_runtime": 49.9453, + "eval_samples_per_second": 60.066, + "eval_steps_per_second": 1.882, + "step": 5300 + }, + { + "epoch": 0.28, + "learning_rate": 0.00019814837190444354, + "loss": 1.3267, + "step": 5320 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019813763160226515, + "loss": 1.3765, + "step": 5340 + }, + { + "epoch": 0.29, + "eval_loss": 1.4023877382278442, + "eval_runtime": 49.9712, + "eval_samples_per_second": 60.035, + "eval_steps_per_second": 1.881, + "step": 5350 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019812689130008674, + "loss": 1.3563, + "step": 5360 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019811615099790835, + "loss": 1.3212, + "step": 5380 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019810541069572994, + "loss": 1.3078, + "step": 5400 + }, + { + "epoch": 0.29, + "eval_loss": 1.4039820432662964, + "eval_runtime": 49.8899, + "eval_samples_per_second": 60.132, + "eval_steps_per_second": 1.884, + "step": 5400 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019809467039355155, + "loss": 1.3441, + "step": 5420 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019808393009137314, + "loss": 1.3772, + "step": 5440 + }, + { + "epoch": 0.29, + "eval_loss": 1.402974247932434, + "eval_runtime": 49.9826, + "eval_samples_per_second": 60.021, + "eval_steps_per_second": 1.881, + "step": 5450 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019807318978919475, + "loss": 1.3703, + "step": 5460 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019806244948701634, + "loss": 1.3942, + "step": 5480 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019805170918483792, + "loss": 1.3586, + "step": 5500 + }, + { + "epoch": 0.29, + "eval_loss": 1.4036117792129517, + "eval_runtime": 49.921, + "eval_samples_per_second": 60.095, + "eval_steps_per_second": 1.883, + "step": 5500 + }, + { + "epoch": 0.29, + "learning_rate": 0.00019804096888265954, + "loss": 1.3272, + "step": 5520 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019803022858048112, + "loss": 1.3751, + "step": 5540 + }, + { + "epoch": 0.3, + "eval_loss": 1.4024755954742432, + "eval_runtime": 49.9843, + "eval_samples_per_second": 60.019, + "eval_steps_per_second": 1.881, + "step": 5550 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019801948827830274, + "loss": 1.3445, + "step": 5560 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019800874797612432, + "loss": 1.3859, + "step": 5580 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001979980076739459, + "loss": 1.3665, + "step": 5600 + }, + { + "epoch": 0.3, + "eval_loss": 1.4023313522338867, + "eval_runtime": 49.929, + "eval_samples_per_second": 60.085, + "eval_steps_per_second": 1.883, + "step": 5600 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019798726737176752, + "loss": 1.3446, + "step": 5620 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001979765270695891, + "loss": 1.3938, + "step": 5640 + }, + { + "epoch": 0.3, + "eval_loss": 1.4025416374206543, + "eval_runtime": 49.9691, + "eval_samples_per_second": 60.037, + "eval_steps_per_second": 1.881, + "step": 5650 + }, + { + "epoch": 0.3, + "learning_rate": 0.00019796578676741072, + "loss": 1.3895, + "step": 5660 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001979550464652323, + "loss": 1.4207, + "step": 5680 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001979443061630539, + "loss": 1.3675, + "step": 5700 + }, + { + "epoch": 0.3, + "eval_loss": 1.4026676416397095, + "eval_runtime": 49.9822, + "eval_samples_per_second": 60.021, + "eval_steps_per_second": 1.881, + "step": 5700 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001979335658608755, + "loss": 1.3534, + "step": 5720 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001979228255586971, + "loss": 1.3145, + "step": 5740 + }, + { + "epoch": 0.31, + "eval_loss": 1.4028642177581787, + "eval_runtime": 50.0235, + "eval_samples_per_second": 59.972, + "eval_steps_per_second": 1.879, + "step": 5750 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001979120852565187, + "loss": 1.328, + "step": 5760 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001979013449543403, + "loss": 1.3546, + "step": 5780 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019789060465216188, + "loss": 1.3228, + "step": 5800 + }, + { + "epoch": 0.31, + "eval_loss": 1.4030640125274658, + "eval_runtime": 49.9206, + "eval_samples_per_second": 60.095, + "eval_steps_per_second": 1.883, + "step": 5800 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001978798643499835, + "loss": 1.334, + "step": 5820 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019786912404780508, + "loss": 1.3502, + "step": 5840 + }, + { + "epoch": 0.31, + "eval_loss": 1.402596354484558, + "eval_runtime": 49.9769, + "eval_samples_per_second": 60.028, + "eval_steps_per_second": 1.881, + "step": 5850 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001978583837456267, + "loss": 1.4087, + "step": 5860 + }, + { + "epoch": 0.31, + "learning_rate": 0.00019784764344344828, + "loss": 1.3516, + "step": 5880 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019783690314126987, + "loss": 1.3684, + "step": 5900 + }, + { + "epoch": 0.32, + "eval_loss": 1.4028434753417969, + "eval_runtime": 49.9166, + "eval_samples_per_second": 60.1, + "eval_steps_per_second": 1.883, + "step": 5900 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019782616283909148, + "loss": 1.3747, + "step": 5920 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019781542253691307, + "loss": 1.3603, + "step": 5940 + }, + { + "epoch": 0.32, + "eval_loss": 1.4027584791183472, + "eval_runtime": 50.4225, + "eval_samples_per_second": 59.497, + "eval_steps_per_second": 1.864, + "step": 5950 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019780468223473468, + "loss": 1.3718, + "step": 5960 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001977939419325563, + "loss": 1.3695, + "step": 5980 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019778320163037788, + "loss": 1.3384, + "step": 6000 + }, + { + "epoch": 0.32, + "eval_loss": 1.4024461507797241, + "eval_runtime": 51.9592, + "eval_samples_per_second": 57.738, + "eval_steps_per_second": 1.809, + "step": 6000 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001977724613281995, + "loss": 1.3768, + "step": 6020 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019776172102602108, + "loss": 1.3948, + "step": 6040 + }, + { + "epoch": 0.32, + "eval_loss": 1.4024426937103271, + "eval_runtime": 51.9479, + "eval_samples_per_second": 57.75, + "eval_steps_per_second": 1.81, + "step": 6050 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001977509807238427, + "loss": 1.3148, + "step": 6060 + }, + { + "epoch": 0.32, + "learning_rate": 0.00019774024042166428, + "loss": 1.321, + "step": 6080 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019772950011948587, + "loss": 1.3558, + "step": 6100 + }, + { + "epoch": 0.33, + "eval_loss": 1.4021098613739014, + "eval_runtime": 49.9338, + "eval_samples_per_second": 60.08, + "eval_steps_per_second": 1.882, + "step": 6100 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019771875981730748, + "loss": 1.356, + "step": 6120 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019770801951512907, + "loss": 1.3726, + "step": 6140 + }, + { + "epoch": 0.33, + "eval_loss": 1.4019304513931274, + "eval_runtime": 49.9492, + "eval_samples_per_second": 60.061, + "eval_steps_per_second": 1.882, + "step": 6150 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019769727921295068, + "loss": 1.3608, + "step": 6160 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019768653891077227, + "loss": 1.3549, + "step": 6180 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019767579860859385, + "loss": 1.38, + "step": 6200 + }, + { + "epoch": 0.33, + "eval_loss": 1.402077317237854, + "eval_runtime": 49.9509, + "eval_samples_per_second": 60.059, + "eval_steps_per_second": 1.882, + "step": 6200 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019766505830641547, + "loss": 1.4072, + "step": 6220 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019765431800423705, + "loss": 1.3653, + "step": 6240 + }, + { + "epoch": 0.33, + "eval_loss": 1.402234673500061, + "eval_runtime": 49.9544, + "eval_samples_per_second": 60.055, + "eval_steps_per_second": 1.882, + "step": 6250 + }, + { + "epoch": 0.33, + "learning_rate": 0.00019764357770205867, + "loss": 1.3435, + "step": 6260 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019763283739988025, + "loss": 1.368, + "step": 6280 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019762209709770184, + "loss": 1.3639, + "step": 6300 + }, + { + "epoch": 0.34, + "eval_loss": 1.401593804359436, + "eval_runtime": 49.9096, + "eval_samples_per_second": 60.109, + "eval_steps_per_second": 1.883, + "step": 6300 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019761135679552345, + "loss": 1.3387, + "step": 6320 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019760061649334504, + "loss": 1.3709, + "step": 6340 + }, + { + "epoch": 0.34, + "eval_loss": 1.4018782377243042, + "eval_runtime": 49.9438, + "eval_samples_per_second": 60.068, + "eval_steps_per_second": 1.882, + "step": 6350 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019758987619116665, + "loss": 1.328, + "step": 6360 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019757913588898824, + "loss": 1.3254, + "step": 6380 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019756839558680985, + "loss": 1.3593, + "step": 6400 + }, + { + "epoch": 0.34, + "eval_loss": 1.4019062519073486, + "eval_runtime": 49.9103, + "eval_samples_per_second": 60.108, + "eval_steps_per_second": 1.883, + "step": 6400 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019755765528463144, + "loss": 1.39, + "step": 6420 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019754691498245303, + "loss": 1.366, + "step": 6440 + }, + { + "epoch": 0.34, + "eval_loss": 1.4018573760986328, + "eval_runtime": 52.8919, + "eval_samples_per_second": 56.719, + "eval_steps_per_second": 1.777, + "step": 6450 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019753617468027464, + "loss": 1.3986, + "step": 6460 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019752543437809623, + "loss": 1.3728, + "step": 6480 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019751469407591784, + "loss": 1.3584, + "step": 6500 + }, + { + "epoch": 0.35, + "eval_loss": 1.4021949768066406, + "eval_runtime": 51.6993, + "eval_samples_per_second": 58.028, + "eval_steps_per_second": 1.818, + "step": 6500 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019750395377373943, + "loss": 1.3515, + "step": 6520 + }, + { + "epoch": 0.35, + "learning_rate": 0.000197493213471561, + "loss": 1.3587, + "step": 6540 + }, + { + "epoch": 0.35, + "eval_loss": 1.4026806354522705, + "eval_runtime": 51.9928, + "eval_samples_per_second": 57.7, + "eval_steps_per_second": 1.808, + "step": 6550 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019748247316938263, + "loss": 1.3223, + "step": 6560 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001974717328672042, + "loss": 1.3452, + "step": 6580 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019746099256502583, + "loss": 1.3525, + "step": 6600 + }, + { + "epoch": 0.35, + "eval_loss": 1.402047872543335, + "eval_runtime": 52.9322, + "eval_samples_per_second": 56.676, + "eval_steps_per_second": 1.776, + "step": 6600 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019745025226284744, + "loss": 1.3546, + "step": 6620 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019743951196066903, + "loss": 1.3808, + "step": 6640 + }, + { + "epoch": 0.36, + "eval_loss": 1.4020296335220337, + "eval_runtime": 52.9894, + "eval_samples_per_second": 56.615, + "eval_steps_per_second": 1.774, + "step": 6650 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019742877165849064, + "loss": 1.3466, + "step": 6660 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019741803135631223, + "loss": 1.3595, + "step": 6680 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001974072910541338, + "loss": 1.3788, + "step": 6700 + }, + { + "epoch": 0.36, + "eval_loss": 1.4023902416229248, + "eval_runtime": 53.0092, + "eval_samples_per_second": 56.594, + "eval_steps_per_second": 1.773, + "step": 6700 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019739655075195543, + "loss": 1.3787, + "step": 6720 + }, + { + "epoch": 0.36, + "learning_rate": 0.000197385810449777, + "loss": 1.3484, + "step": 6740 + }, + { + "epoch": 0.36, + "eval_loss": 1.401914358139038, + "eval_runtime": 52.9334, + "eval_samples_per_second": 56.675, + "eval_steps_per_second": 1.776, + "step": 6750 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019737507014759863, + "loss": 1.3412, + "step": 6760 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001973643298454202, + "loss": 1.3274, + "step": 6780 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001973535895432418, + "loss": 1.4129, + "step": 6800 + }, + { + "epoch": 0.36, + "eval_loss": 1.4013642072677612, + "eval_runtime": 52.933, + "eval_samples_per_second": 56.675, + "eval_steps_per_second": 1.776, + "step": 6800 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001973428492410634, + "loss": 1.3889, + "step": 6820 + }, + { + "epoch": 0.37, + "learning_rate": 0.000197332108938885, + "loss": 1.3519, + "step": 6840 + }, + { + "epoch": 0.37, + "eval_loss": 1.4011621475219727, + "eval_runtime": 52.9903, + "eval_samples_per_second": 56.614, + "eval_steps_per_second": 1.774, + "step": 6850 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001973213686367066, + "loss": 1.3953, + "step": 6860 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001973106283345282, + "loss": 1.3546, + "step": 6880 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001972998880323498, + "loss": 1.3495, + "step": 6900 + }, + { + "epoch": 0.37, + "eval_loss": 1.4015971422195435, + "eval_runtime": 52.8958, + "eval_samples_per_second": 56.715, + "eval_steps_per_second": 1.777, + "step": 6900 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001972891477301714, + "loss": 1.3357, + "step": 6920 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019727840742799299, + "loss": 1.3114, + "step": 6940 + }, + { + "epoch": 0.37, + "eval_loss": 1.4012014865875244, + "eval_runtime": 52.9438, + "eval_samples_per_second": 56.664, + "eval_steps_per_second": 1.775, + "step": 6950 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001972676671258146, + "loss": 1.3446, + "step": 6960 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019725692682363619, + "loss": 1.3983, + "step": 6980 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001972461865214578, + "loss": 1.3564, + "step": 7000 + }, + { + "epoch": 0.37, + "eval_loss": 1.400884747505188, + "eval_runtime": 52.8746, + "eval_samples_per_second": 56.738, + "eval_steps_per_second": 1.778, + "step": 7000 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019723544621927939, + "loss": 1.326, + "step": 7020 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019722470591710097, + "loss": 1.3127, + "step": 7040 + }, + { + "epoch": 0.38, + "eval_loss": 1.4015973806381226, + "eval_runtime": 52.9938, + "eval_samples_per_second": 56.61, + "eval_steps_per_second": 1.774, + "step": 7050 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019721396561492259, + "loss": 1.3858, + "step": 7060 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019720322531274417, + "loss": 1.3599, + "step": 7080 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019719248501056579, + "loss": 1.3681, + "step": 7100 + }, + { + "epoch": 0.38, + "eval_loss": 1.4011207818984985, + "eval_runtime": 52.9195, + "eval_samples_per_second": 56.69, + "eval_steps_per_second": 1.776, + "step": 7100 + }, + { + "epoch": 0.38, + "learning_rate": 0.00019718228172349632, + "loss": 1.371, + "step": 7120 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001971715414213179, + "loss": 1.3459, + "step": 7140 + }, + { + "epoch": 0.38, + "eval_loss": 1.4016060829162598, + "eval_runtime": 52.9177, + "eval_samples_per_second": 56.692, + "eval_steps_per_second": 1.776, + "step": 7150 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001971608011191395, + "loss": 1.3567, + "step": 7160 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001971500608169611, + "loss": 1.3795, + "step": 7180 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001971393205147827, + "loss": 1.3499, + "step": 7200 + }, + { + "epoch": 0.38, + "eval_loss": 1.4015876054763794, + "eval_runtime": 52.8119, + "eval_samples_per_second": 56.805, + "eval_steps_per_second": 1.78, + "step": 7200 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001971285802126043, + "loss": 1.3308, + "step": 7220 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001971178399104259, + "loss": 1.3811, + "step": 7240 + }, + { + "epoch": 0.39, + "eval_loss": 1.4017484188079834, + "eval_runtime": 52.962, + "eval_samples_per_second": 56.644, + "eval_steps_per_second": 1.775, + "step": 7250 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019710709960824748, + "loss": 1.3567, + "step": 7260 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001970963593060691, + "loss": 1.3609, + "step": 7280 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019708561900389068, + "loss": 1.3585, + "step": 7300 + }, + { + "epoch": 0.39, + "eval_loss": 1.4017462730407715, + "eval_runtime": 52.879, + "eval_samples_per_second": 56.733, + "eval_steps_per_second": 1.778, + "step": 7300 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001970748787017123, + "loss": 1.37, + "step": 7320 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019706413839953388, + "loss": 1.3404, + "step": 7340 + }, + { + "epoch": 0.39, + "eval_loss": 1.4020615816116333, + "eval_runtime": 52.9825, + "eval_samples_per_second": 56.623, + "eval_steps_per_second": 1.774, + "step": 7350 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019705339809735547, + "loss": 1.3307, + "step": 7360 + }, + { + "epoch": 0.39, + "learning_rate": 0.00019704265779517708, + "loss": 1.3871, + "step": 7380 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019703191749299867, + "loss": 1.3283, + "step": 7400 + }, + { + "epoch": 0.4, + "eval_loss": 1.4005995988845825, + "eval_runtime": 52.8907, + "eval_samples_per_second": 56.721, + "eval_steps_per_second": 1.777, + "step": 7400 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019702117719082028, + "loss": 1.3592, + "step": 7420 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019701043688864187, + "loss": 1.373, + "step": 7440 + }, + { + "epoch": 0.4, + "eval_loss": 1.4015244245529175, + "eval_runtime": 52.8915, + "eval_samples_per_second": 56.72, + "eval_steps_per_second": 1.777, + "step": 7450 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019699969658646345, + "loss": 1.3815, + "step": 7460 + }, + { + "epoch": 0.4, + "learning_rate": 0.000196989493299394, + "loss": 1.3838, + "step": 7480 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019697875299721558, + "loss": 1.3316, + "step": 7500 + }, + { + "epoch": 0.4, + "eval_loss": 1.4018954038619995, + "eval_runtime": 52.9361, + "eval_samples_per_second": 56.672, + "eval_steps_per_second": 1.776, + "step": 7500 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019696801269503716, + "loss": 1.3692, + "step": 7520 + }, + { + "epoch": 0.4, + "learning_rate": 0.00019695727239285878, + "loss": 1.3496, + "step": 7540 + }, + { + "epoch": 0.4, + "eval_loss": 1.401475429534912, + "eval_runtime": 52.9712, + "eval_samples_per_second": 56.635, + "eval_steps_per_second": 1.775, + "step": 7550 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001969465320906804, + "loss": 1.3507, + "step": 7560 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019693579178850198, + "loss": 1.3649, + "step": 7580 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001969250514863236, + "loss": 1.3968, + "step": 7600 + }, + { + "epoch": 0.41, + "eval_loss": 1.4009819030761719, + "eval_runtime": 52.8433, + "eval_samples_per_second": 56.772, + "eval_steps_per_second": 1.779, + "step": 7600 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019691431118414518, + "loss": 1.4017, + "step": 7620 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001969035708819668, + "loss": 1.3512, + "step": 7640 + }, + { + "epoch": 0.41, + "eval_loss": 1.400898814201355, + "eval_runtime": 52.9951, + "eval_samples_per_second": 56.609, + "eval_steps_per_second": 1.774, + "step": 7650 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019689283057978838, + "loss": 1.336, + "step": 7660 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019688209027761, + "loss": 1.3587, + "step": 7680 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019687134997543158, + "loss": 1.3624, + "step": 7700 + }, + { + "epoch": 0.41, + "eval_loss": 1.4006584882736206, + "eval_runtime": 52.9022, + "eval_samples_per_second": 56.708, + "eval_steps_per_second": 1.777, + "step": 7700 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019686060967325316, + "loss": 1.3288, + "step": 7720 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019684986937107478, + "loss": 1.406, + "step": 7740 + }, + { + "epoch": 0.41, + "eval_loss": 1.4009158611297607, + "eval_runtime": 52.9599, + "eval_samples_per_second": 56.647, + "eval_steps_per_second": 1.775, + "step": 7750 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019683912906889636, + "loss": 1.3257, + "step": 7760 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019682838876671798, + "loss": 1.3483, + "step": 7780 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019681764846453956, + "loss": 1.3366, + "step": 7800 + }, + { + "epoch": 0.42, + "eval_loss": 1.4006359577178955, + "eval_runtime": 52.9157, + "eval_samples_per_second": 56.694, + "eval_steps_per_second": 1.776, + "step": 7800 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019680690816236115, + "loss": 1.3871, + "step": 7820 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019679616786018276, + "loss": 1.361, + "step": 7840 + }, + { + "epoch": 0.42, + "eval_loss": 1.3996506929397583, + "eval_runtime": 52.9799, + "eval_samples_per_second": 56.625, + "eval_steps_per_second": 1.774, + "step": 7850 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019678542755800435, + "loss": 1.3471, + "step": 7860 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019677468725582596, + "loss": 1.3318, + "step": 7880 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019676394695364755, + "loss": 1.3516, + "step": 7900 + }, + { + "epoch": 0.42, + "eval_loss": 1.4007470607757568, + "eval_runtime": 52.9141, + "eval_samples_per_second": 56.696, + "eval_steps_per_second": 1.776, + "step": 7900 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019675320665146914, + "loss": 1.3489, + "step": 7920 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019674246634929075, + "loss": 1.3629, + "step": 7940 + }, + { + "epoch": 0.42, + "eval_loss": 1.4007574319839478, + "eval_runtime": 52.8992, + "eval_samples_per_second": 56.712, + "eval_steps_per_second": 1.777, + "step": 7950 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019673172604711234, + "loss": 1.3947, + "step": 7960 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019672098574493395, + "loss": 1.3369, + "step": 7980 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019671024544275554, + "loss": 1.3895, + "step": 8000 + }, + { + "epoch": 0.43, + "eval_loss": 1.4001998901367188, + "eval_runtime": 52.9172, + "eval_samples_per_second": 56.692, + "eval_steps_per_second": 1.776, + "step": 8000 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019669950514057712, + "loss": 1.373, + "step": 8020 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019668876483839874, + "loss": 1.3558, + "step": 8040 + }, + { + "epoch": 0.43, + "eval_loss": 1.4004127979278564, + "eval_runtime": 52.8315, + "eval_samples_per_second": 56.784, + "eval_steps_per_second": 1.779, + "step": 8050 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019667802453622032, + "loss": 1.4025, + "step": 8060 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019666728423404194, + "loss": 1.3267, + "step": 8080 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019665654393186352, + "loss": 1.335, + "step": 8100 + }, + { + "epoch": 0.43, + "eval_loss": 1.400840163230896, + "eval_runtime": 52.8995, + "eval_samples_per_second": 56.711, + "eval_steps_per_second": 1.777, + "step": 8100 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001966458036296851, + "loss": 1.3523, + "step": 8120 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019663506332750672, + "loss": 1.3295, + "step": 8140 + }, + { + "epoch": 0.44, + "eval_loss": 1.3999550342559814, + "eval_runtime": 52.9664, + "eval_samples_per_second": 56.64, + "eval_steps_per_second": 1.775, + "step": 8150 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001966243230253283, + "loss": 1.3866, + "step": 8160 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019661358272314995, + "loss": 1.3632, + "step": 8180 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019660284242097154, + "loss": 1.353, + "step": 8200 + }, + { + "epoch": 0.44, + "eval_loss": 1.4003568887710571, + "eval_runtime": 52.8325, + "eval_samples_per_second": 56.783, + "eval_steps_per_second": 1.779, + "step": 8200 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019659210211879312, + "loss": 1.3927, + "step": 8220 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019658136181661474, + "loss": 1.384, + "step": 8240 + }, + { + "epoch": 0.44, + "eval_loss": 1.4005845785140991, + "eval_runtime": 52.917, + "eval_samples_per_second": 56.693, + "eval_steps_per_second": 1.776, + "step": 8250 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019657062151443632, + "loss": 1.4014, + "step": 8260 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019655988121225794, + "loss": 1.3651, + "step": 8280 + }, + { + "epoch": 0.44, + "learning_rate": 0.00019654914091007952, + "loss": 1.3308, + "step": 8300 + }, + { + "epoch": 0.44, + "eval_loss": 1.4005910158157349, + "eval_runtime": 53.0074, + "eval_samples_per_second": 56.596, + "eval_steps_per_second": 1.773, + "step": 8300 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001965384006079011, + "loss": 1.312, + "step": 8320 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019652766030572272, + "loss": 1.3904, + "step": 8340 + }, + { + "epoch": 0.45, + "eval_loss": 1.4002220630645752, + "eval_runtime": 52.9847, + "eval_samples_per_second": 56.62, + "eval_steps_per_second": 1.774, + "step": 8350 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001965169200035443, + "loss": 1.3463, + "step": 8360 + }, + { + "epoch": 0.45, + "learning_rate": 0.00019650617970136592, + "loss": 1.3611, + "step": 8380 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001964954393991875, + "loss": 1.3388, + "step": 8400 + }, + { + "epoch": 0.45, + "eval_loss": 1.400258183479309, + "eval_runtime": 52.8963, + "eval_samples_per_second": 56.715, + "eval_steps_per_second": 1.777, + "step": 8400 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001964846990970091, + "loss": 1.3349, + "step": 8420 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001964739587948307, + "loss": 1.3354, + "step": 8440 + }, + { + "epoch": 0.45, + "eval_loss": 1.4004709720611572, + "eval_runtime": 52.9783, + "eval_samples_per_second": 56.627, + "eval_steps_per_second": 1.774, + "step": 8450 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001964632184926523, + "loss": 1.3476, + "step": 8460 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001964524781904739, + "loss": 1.3994, + "step": 8480 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001964417378882955, + "loss": 1.3211, + "step": 8500 + }, + { + "epoch": 0.45, + "eval_loss": 1.3999745845794678, + "eval_runtime": 52.9063, + "eval_samples_per_second": 56.704, + "eval_steps_per_second": 1.777, + "step": 8500 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019643099758611708, + "loss": 1.3556, + "step": 8520 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001964202572839387, + "loss": 1.3796, + "step": 8540 + }, + { + "epoch": 0.46, + "eval_loss": 1.4002516269683838, + "eval_runtime": 52.962, + "eval_samples_per_second": 56.644, + "eval_steps_per_second": 1.775, + "step": 8550 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019640951698176028, + "loss": 1.322, + "step": 8560 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001963987766795819, + "loss": 1.3595, + "step": 8580 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019638803637740348, + "loss": 1.3518, + "step": 8600 + }, + { + "epoch": 0.46, + "eval_loss": 1.4005961418151855, + "eval_runtime": 52.9027, + "eval_samples_per_second": 56.708, + "eval_steps_per_second": 1.777, + "step": 8600 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001963772960752251, + "loss": 1.3272, + "step": 8620 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019636655577304668, + "loss": 1.3803, + "step": 8640 + }, + { + "epoch": 0.46, + "eval_loss": 1.4010200500488281, + "eval_runtime": 52.9126, + "eval_samples_per_second": 56.697, + "eval_steps_per_second": 1.777, + "step": 8650 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019635581547086827, + "loss": 1.3242, + "step": 8660 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019634507516868988, + "loss": 1.3993, + "step": 8680 + }, + { + "epoch": 0.46, + "learning_rate": 0.00019633433486651147, + "loss": 1.3858, + "step": 8700 + }, + { + "epoch": 0.46, + "eval_loss": 1.4003942012786865, + "eval_runtime": 52.8889, + "eval_samples_per_second": 56.723, + "eval_steps_per_second": 1.777, + "step": 8700 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019632359456433308, + "loss": 1.365, + "step": 8720 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019631285426215467, + "loss": 1.3537, + "step": 8740 + }, + { + "epoch": 0.47, + "eval_loss": 1.3994464874267578, + "eval_runtime": 52.9395, + "eval_samples_per_second": 56.668, + "eval_steps_per_second": 1.776, + "step": 8750 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019630211395997625, + "loss": 1.3707, + "step": 8760 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019629137365779787, + "loss": 1.3361, + "step": 8780 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019628063335561945, + "loss": 1.3452, + "step": 8800 + }, + { + "epoch": 0.47, + "eval_loss": 1.3997398614883423, + "eval_runtime": 52.9165, + "eval_samples_per_second": 56.693, + "eval_steps_per_second": 1.776, + "step": 8800 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019626989305344107, + "loss": 1.331, + "step": 8820 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019625915275126268, + "loss": 1.3507, + "step": 8840 + }, + { + "epoch": 0.47, + "eval_loss": 1.4000722169876099, + "eval_runtime": 52.9443, + "eval_samples_per_second": 56.663, + "eval_steps_per_second": 1.775, + "step": 8850 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019624841244908427, + "loss": 1.335, + "step": 8860 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019623767214690588, + "loss": 1.3474, + "step": 8880 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019622693184472747, + "loss": 1.3362, + "step": 8900 + }, + { + "epoch": 0.48, + "eval_loss": 1.4004658460617065, + "eval_runtime": 52.942, + "eval_samples_per_second": 56.666, + "eval_steps_per_second": 1.776, + "step": 8900 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019621619154254905, + "loss": 1.3395, + "step": 8920 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019620545124037067, + "loss": 1.3722, + "step": 8940 + }, + { + "epoch": 0.48, + "eval_loss": 1.4000860452651978, + "eval_runtime": 52.927, + "eval_samples_per_second": 56.682, + "eval_steps_per_second": 1.776, + "step": 8950 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019619471093819225, + "loss": 1.3352, + "step": 8960 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019618397063601387, + "loss": 1.3897, + "step": 8980 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019617323033383545, + "loss": 1.3678, + "step": 9000 + }, + { + "epoch": 0.48, + "eval_loss": 1.3992573022842407, + "eval_runtime": 52.8848, + "eval_samples_per_second": 56.727, + "eval_steps_per_second": 1.777, + "step": 9000 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019616249003165704, + "loss": 1.326, + "step": 9020 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019615174972947865, + "loss": 1.3879, + "step": 9040 + }, + { + "epoch": 0.48, + "eval_loss": 1.4001773595809937, + "eval_runtime": 52.9471, + "eval_samples_per_second": 56.66, + "eval_steps_per_second": 1.775, + "step": 9050 + }, + { + "epoch": 0.48, + "learning_rate": 0.00019614100942730024, + "loss": 1.3758, + "step": 9060 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019613026912512185, + "loss": 1.3969, + "step": 9080 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019611952882294344, + "loss": 1.3307, + "step": 9100 + }, + { + "epoch": 0.49, + "eval_loss": 1.4005745649337769, + "eval_runtime": 52.8911, + "eval_samples_per_second": 56.72, + "eval_steps_per_second": 1.777, + "step": 9100 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019610878852076505, + "loss": 1.3549, + "step": 9120 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019609804821858664, + "loss": 1.3189, + "step": 9140 + }, + { + "epoch": 0.49, + "eval_loss": 1.4000535011291504, + "eval_runtime": 52.9048, + "eval_samples_per_second": 56.706, + "eval_steps_per_second": 1.777, + "step": 9150 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019608730791640823, + "loss": 1.4094, + "step": 9160 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019607656761422984, + "loss": 1.3461, + "step": 9180 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019606582731205143, + "loss": 1.3535, + "step": 9200 + }, + { + "epoch": 0.49, + "eval_loss": 1.399420142173767, + "eval_runtime": 52.9196, + "eval_samples_per_second": 56.69, + "eval_steps_per_second": 1.776, + "step": 9200 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019605508700987304, + "loss": 1.3322, + "step": 9220 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019604434670769463, + "loss": 1.3483, + "step": 9240 + }, + { + "epoch": 0.49, + "eval_loss": 1.3995360136032104, + "eval_runtime": 53.024, + "eval_samples_per_second": 56.578, + "eval_steps_per_second": 1.773, + "step": 9250 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001960336064055162, + "loss": 1.3472, + "step": 9260 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019602286610333783, + "loss": 1.3597, + "step": 9280 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001960121258011594, + "loss": 1.3716, + "step": 9300 + }, + { + "epoch": 0.5, + "eval_loss": 1.3998219966888428, + "eval_runtime": 52.9143, + "eval_samples_per_second": 56.695, + "eval_steps_per_second": 1.776, + "step": 9300 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019600138549898103, + "loss": 1.3394, + "step": 9320 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001959906451968026, + "loss": 1.3498, + "step": 9340 + }, + { + "epoch": 0.5, + "eval_loss": 1.3993160724639893, + "eval_runtime": 53.0146, + "eval_samples_per_second": 56.588, + "eval_steps_per_second": 1.773, + "step": 9350 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001959799048946242, + "loss": 1.3344, + "step": 9360 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001959691645924458, + "loss": 1.352, + "step": 9380 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001959584242902674, + "loss": 1.3464, + "step": 9400 + }, + { + "epoch": 0.5, + "eval_loss": 1.400106430053711, + "eval_runtime": 52.8683, + "eval_samples_per_second": 56.745, + "eval_steps_per_second": 1.778, + "step": 9400 + }, + { + "epoch": 0.5, + "learning_rate": 0.000195947683988089, + "loss": 1.3915, + "step": 9420 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019593694368591063, + "loss": 1.3662, + "step": 9440 + }, + { + "epoch": 0.5, + "eval_loss": 1.3995168209075928, + "eval_runtime": 53.0041, + "eval_samples_per_second": 56.599, + "eval_steps_per_second": 1.773, + "step": 9450 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001959262033837322, + "loss": 1.3307, + "step": 9460 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019591546308155383, + "loss": 1.379, + "step": 9480 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001959047227793754, + "loss": 1.3326, + "step": 9500 + }, + { + "epoch": 0.51, + "eval_loss": 1.3995767831802368, + "eval_runtime": 52.9586, + "eval_samples_per_second": 56.648, + "eval_steps_per_second": 1.775, + "step": 9500 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019589451949230592, + "loss": 1.3196, + "step": 9520 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019588377919012754, + "loss": 1.4136, + "step": 9540 + }, + { + "epoch": 0.51, + "eval_loss": 1.399158000946045, + "eval_runtime": 52.9987, + "eval_samples_per_second": 56.605, + "eval_steps_per_second": 1.774, + "step": 9550 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019587303888794912, + "loss": 1.3556, + "step": 9560 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001958622985857707, + "loss": 1.3996, + "step": 9580 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019585155828359232, + "loss": 1.3511, + "step": 9600 + }, + { + "epoch": 0.51, + "eval_loss": 1.398504376411438, + "eval_runtime": 52.9812, + "eval_samples_per_second": 56.624, + "eval_steps_per_second": 1.774, + "step": 9600 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001958408179814139, + "loss": 1.3574, + "step": 9620 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019583007767923552, + "loss": 1.3567, + "step": 9640 + }, + { + "epoch": 0.52, + "eval_loss": 1.3994630575180054, + "eval_runtime": 52.8913, + "eval_samples_per_second": 56.72, + "eval_steps_per_second": 1.777, + "step": 9650 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001958193373770571, + "loss": 1.412, + "step": 9660 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001958085970748787, + "loss": 1.3513, + "step": 9680 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001957978567727003, + "loss": 1.3857, + "step": 9700 + }, + { + "epoch": 0.52, + "eval_loss": 1.3997857570648193, + "eval_runtime": 52.869, + "eval_samples_per_second": 56.744, + "eval_steps_per_second": 1.778, + "step": 9700 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001957871164705219, + "loss": 1.3393, + "step": 9720 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001957763761683435, + "loss": 1.3286, + "step": 9740 + }, + { + "epoch": 0.52, + "eval_loss": 1.3988032341003418, + "eval_runtime": 52.9633, + "eval_samples_per_second": 56.643, + "eval_steps_per_second": 1.775, + "step": 9750 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001957656358661651, + "loss": 1.3608, + "step": 9760 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001957548955639867, + "loss": 1.3472, + "step": 9780 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001957441552618083, + "loss": 1.3982, + "step": 9800 + }, + { + "epoch": 0.52, + "eval_loss": 1.3989051580429077, + "eval_runtime": 52.9858, + "eval_samples_per_second": 56.619, + "eval_steps_per_second": 1.774, + "step": 9800 + }, + { + "epoch": 0.52, + "learning_rate": 0.00019573341495962988, + "loss": 1.3708, + "step": 9820 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001957226746574515, + "loss": 1.3517, + "step": 9840 + }, + { + "epoch": 0.53, + "eval_loss": 1.4000502824783325, + "eval_runtime": 52.9101, + "eval_samples_per_second": 56.7, + "eval_steps_per_second": 1.777, + "step": 9850 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019571193435527308, + "loss": 1.3621, + "step": 9860 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001957011940530947, + "loss": 1.3233, + "step": 9880 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019569045375091628, + "loss": 1.3952, + "step": 9900 + }, + { + "epoch": 0.53, + "eval_loss": 1.3988715410232544, + "eval_runtime": 52.8297, + "eval_samples_per_second": 56.786, + "eval_steps_per_second": 1.779, + "step": 9900 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001956797134487379, + "loss": 1.3501, + "step": 9920 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001956689731465595, + "loss": 1.3183, + "step": 9940 + }, + { + "epoch": 0.53, + "eval_loss": 1.397971510887146, + "eval_runtime": 52.9608, + "eval_samples_per_second": 56.646, + "eval_steps_per_second": 1.775, + "step": 9950 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001956582328443811, + "loss": 1.3749, + "step": 9960 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019564749254220268, + "loss": 1.3466, + "step": 9980 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001956367522400243, + "loss": 1.3783, + "step": 10000 + }, + { + "epoch": 0.53, + "eval_loss": 1.3991659879684448, + "eval_runtime": 52.9105, + "eval_samples_per_second": 56.7, + "eval_steps_per_second": 1.777, + "step": 10000 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019562601193784588, + "loss": 1.367, + "step": 10020 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001956152716356675, + "loss": 1.2971, + "step": 10040 + }, + { + "epoch": 0.54, + "eval_loss": 1.3988336324691772, + "eval_runtime": 52.9518, + "eval_samples_per_second": 56.655, + "eval_steps_per_second": 1.775, + "step": 10050 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019560453133348908, + "loss": 1.356, + "step": 10060 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019559379103131067, + "loss": 1.3696, + "step": 10080 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019558305072913228, + "loss": 1.3909, + "step": 10100 + }, + { + "epoch": 0.54, + "eval_loss": 1.3984498977661133, + "eval_runtime": 52.8606, + "eval_samples_per_second": 56.753, + "eval_steps_per_second": 1.778, + "step": 10100 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019557231042695387, + "loss": 1.3355, + "step": 10120 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019556157012477548, + "loss": 1.3134, + "step": 10140 + }, + { + "epoch": 0.54, + "eval_loss": 1.3991281986236572, + "eval_runtime": 52.9144, + "eval_samples_per_second": 56.695, + "eval_steps_per_second": 1.776, + "step": 10150 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019555082982259707, + "loss": 1.3037, + "step": 10160 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019554008952041868, + "loss": 1.3041, + "step": 10180 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019552934921824027, + "loss": 1.3363, + "step": 10200 + }, + { + "epoch": 0.55, + "eval_loss": 1.3987741470336914, + "eval_runtime": 52.9639, + "eval_samples_per_second": 56.642, + "eval_steps_per_second": 1.775, + "step": 10200 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019551860891606185, + "loss": 1.3846, + "step": 10220 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019550786861388347, + "loss": 1.3545, + "step": 10240 + }, + { + "epoch": 0.55, + "eval_loss": 1.3983027935028076, + "eval_runtime": 53.0334, + "eval_samples_per_second": 56.568, + "eval_steps_per_second": 1.772, + "step": 10250 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019549712831170505, + "loss": 1.3237, + "step": 10260 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019548638800952667, + "loss": 1.3866, + "step": 10280 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019547564770734825, + "loss": 1.3839, + "step": 10300 + }, + { + "epoch": 0.55, + "eval_loss": 1.3988779783248901, + "eval_runtime": 52.9716, + "eval_samples_per_second": 56.634, + "eval_steps_per_second": 1.775, + "step": 10300 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019546490740516984, + "loss": 1.3103, + "step": 10320 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019545416710299145, + "loss": 1.3675, + "step": 10340 + }, + { + "epoch": 0.55, + "eval_loss": 1.3987020254135132, + "eval_runtime": 52.9442, + "eval_samples_per_second": 56.663, + "eval_steps_per_second": 1.775, + "step": 10350 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019544342680081304, + "loss": 1.3107, + "step": 10360 + }, + { + "epoch": 0.55, + "learning_rate": 0.00019543268649863465, + "loss": 1.3848, + "step": 10380 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019542194619645624, + "loss": 1.3692, + "step": 10400 + }, + { + "epoch": 0.56, + "eval_loss": 1.3982036113739014, + "eval_runtime": 52.9114, + "eval_samples_per_second": 56.699, + "eval_steps_per_second": 1.777, + "step": 10400 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019541120589427783, + "loss": 1.3652, + "step": 10420 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019540046559209944, + "loss": 1.3322, + "step": 10440 + }, + { + "epoch": 0.56, + "eval_loss": 1.398821234703064, + "eval_runtime": 52.9263, + "eval_samples_per_second": 56.683, + "eval_steps_per_second": 1.776, + "step": 10450 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019538972528992103, + "loss": 1.377, + "step": 10460 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019537898498774264, + "loss": 1.3895, + "step": 10480 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019536824468556423, + "loss": 1.3505, + "step": 10500 + }, + { + "epoch": 0.56, + "eval_loss": 1.398823857307434, + "eval_runtime": 52.9032, + "eval_samples_per_second": 56.707, + "eval_steps_per_second": 1.777, + "step": 10500 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001953575043833858, + "loss": 1.3608, + "step": 10520 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019534676408120745, + "loss": 1.391, + "step": 10540 + }, + { + "epoch": 0.56, + "eval_loss": 1.3982605934143066, + "eval_runtime": 52.9352, + "eval_samples_per_second": 56.673, + "eval_steps_per_second": 1.776, + "step": 10550 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019533602377902904, + "loss": 1.3712, + "step": 10560 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019532528347685063, + "loss": 1.3494, + "step": 10580 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019531454317467224, + "loss": 1.3404, + "step": 10600 + }, + { + "epoch": 0.57, + "eval_loss": 1.3984962701797485, + "eval_runtime": 53.1915, + "eval_samples_per_second": 56.4, + "eval_steps_per_second": 1.767, + "step": 10600 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019530380287249383, + "loss": 1.3554, + "step": 10620 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019529306257031544, + "loss": 1.3764, + "step": 10640 + }, + { + "epoch": 0.57, + "eval_loss": 1.398600697517395, + "eval_runtime": 53.4487, + "eval_samples_per_second": 56.129, + "eval_steps_per_second": 1.759, + "step": 10650 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019528232226813703, + "loss": 1.3314, + "step": 10660 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019527158196595864, + "loss": 1.3736, + "step": 10680 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019526084166378023, + "loss": 1.3859, + "step": 10700 + }, + { + "epoch": 0.57, + "eval_loss": 1.3986971378326416, + "eval_runtime": 53.387, + "eval_samples_per_second": 56.193, + "eval_steps_per_second": 1.761, + "step": 10700 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001952501013616018, + "loss": 1.3524, + "step": 10720 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019523936105942343, + "loss": 1.3373, + "step": 10740 + }, + { + "epoch": 0.57, + "eval_loss": 1.3985203504562378, + "eval_runtime": 53.4258, + "eval_samples_per_second": 56.153, + "eval_steps_per_second": 1.759, + "step": 10750 + }, + { + "epoch": 0.57, + "learning_rate": 0.000195228620757245, + "loss": 1.3828, + "step": 10760 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019521788045506663, + "loss": 1.2932, + "step": 10780 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001952071401528882, + "loss": 1.3847, + "step": 10800 + }, + { + "epoch": 0.58, + "eval_loss": 1.399185061454773, + "eval_runtime": 53.4654, + "eval_samples_per_second": 56.111, + "eval_steps_per_second": 1.758, + "step": 10800 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001951963998507098, + "loss": 1.3739, + "step": 10820 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001951856595485314, + "loss": 1.3553, + "step": 10840 + }, + { + "epoch": 0.58, + "eval_loss": 1.3980084657669067, + "eval_runtime": 53.45, + "eval_samples_per_second": 56.127, + "eval_steps_per_second": 1.759, + "step": 10850 + }, + { + "epoch": 0.58, + "learning_rate": 0.000195174919246353, + "loss": 1.4051, + "step": 10860 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001951641789441746, + "loss": 1.3456, + "step": 10880 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001951534386419962, + "loss": 1.3756, + "step": 10900 + }, + { + "epoch": 0.58, + "eval_loss": 1.3982112407684326, + "eval_runtime": 49.9421, + "eval_samples_per_second": 60.07, + "eval_steps_per_second": 1.882, + "step": 10900 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019514269833981779, + "loss": 1.3521, + "step": 10920 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001951319580376394, + "loss": 1.3745, + "step": 10940 + }, + { + "epoch": 0.59, + "eval_loss": 1.3991111516952515, + "eval_runtime": 49.9838, + "eval_samples_per_second": 60.019, + "eval_steps_per_second": 1.881, + "step": 10950 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019512121773546099, + "loss": 1.3661, + "step": 10960 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001951104774332826, + "loss": 1.3637, + "step": 10980 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019509973713110419, + "loss": 1.3263, + "step": 11000 + }, + { + "epoch": 0.59, + "eval_loss": 1.398601770401001, + "eval_runtime": 49.954, + "eval_samples_per_second": 60.055, + "eval_steps_per_second": 1.882, + "step": 11000 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019508899682892577, + "loss": 1.3062, + "step": 11020 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019507825652674739, + "loss": 1.3634, + "step": 11040 + }, + { + "epoch": 0.59, + "eval_loss": 1.3987818956375122, + "eval_runtime": 49.977, + "eval_samples_per_second": 60.028, + "eval_steps_per_second": 1.881, + "step": 11050 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019506751622456897, + "loss": 1.3272, + "step": 11060 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019505677592239059, + "loss": 1.369, + "step": 11080 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019504603562021217, + "loss": 1.3177, + "step": 11100 + }, + { + "epoch": 0.59, + "eval_loss": 1.3984121084213257, + "eval_runtime": 49.9403, + "eval_samples_per_second": 60.072, + "eval_steps_per_second": 1.882, + "step": 11100 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019503529531803379, + "loss": 1.3869, + "step": 11120 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019502455501585537, + "loss": 1.3766, + "step": 11140 + }, + { + "epoch": 0.6, + "eval_loss": 1.39817476272583, + "eval_runtime": 49.9679, + "eval_samples_per_second": 60.038, + "eval_steps_per_second": 1.881, + "step": 11150 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019501381471367696, + "loss": 1.3801, + "step": 11160 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001950030744114986, + "loss": 1.3549, + "step": 11180 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019499233410932019, + "loss": 1.37, + "step": 11200 + }, + { + "epoch": 0.6, + "eval_loss": 1.3982213735580444, + "eval_runtime": 49.9229, + "eval_samples_per_second": 60.093, + "eval_steps_per_second": 1.883, + "step": 11200 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019498159380714177, + "loss": 1.3597, + "step": 11220 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019497085350496339, + "loss": 1.3215, + "step": 11240 + }, + { + "epoch": 0.6, + "eval_loss": 1.3987040519714355, + "eval_runtime": 49.9503, + "eval_samples_per_second": 60.06, + "eval_steps_per_second": 1.882, + "step": 11250 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019496011320278497, + "loss": 1.3655, + "step": 11260 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019494937290060659, + "loss": 1.3059, + "step": 11280 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019493863259842817, + "loss": 1.3689, + "step": 11300 + }, + { + "epoch": 0.6, + "eval_loss": 1.397697925567627, + "eval_runtime": 49.916, + "eval_samples_per_second": 60.101, + "eval_steps_per_second": 1.883, + "step": 11300 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019492789229624976, + "loss": 1.3428, + "step": 11320 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019491715199407137, + "loss": 1.3463, + "step": 11340 + }, + { + "epoch": 0.61, + "eval_loss": 1.3985458612442017, + "eval_runtime": 49.9432, + "eval_samples_per_second": 60.068, + "eval_steps_per_second": 1.882, + "step": 11350 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019490641169189296, + "loss": 1.389, + "step": 11360 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019489567138971457, + "loss": 1.3329, + "step": 11380 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019488493108753616, + "loss": 1.3253, + "step": 11400 + }, + { + "epoch": 0.61, + "eval_loss": 1.3987817764282227, + "eval_runtime": 49.8579, + "eval_samples_per_second": 60.171, + "eval_steps_per_second": 1.885, + "step": 11400 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019487419078535775, + "loss": 1.347, + "step": 11420 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019486345048317936, + "loss": 1.3579, + "step": 11440 + }, + { + "epoch": 0.61, + "eval_loss": 1.3982493877410889, + "eval_runtime": 49.9401, + "eval_samples_per_second": 60.072, + "eval_steps_per_second": 1.882, + "step": 11450 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019485271018100095, + "loss": 1.3539, + "step": 11460 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019484196987882256, + "loss": 1.3812, + "step": 11480 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019483122957664415, + "loss": 1.3314, + "step": 11500 + }, + { + "epoch": 0.61, + "eval_loss": 1.3974528312683105, + "eval_runtime": 49.8944, + "eval_samples_per_second": 60.127, + "eval_steps_per_second": 1.884, + "step": 11500 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019482048927446573, + "loss": 1.3545, + "step": 11520 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019480974897228735, + "loss": 1.3466, + "step": 11540 + }, + { + "epoch": 0.62, + "eval_loss": 1.3979511260986328, + "eval_runtime": 49.9421, + "eval_samples_per_second": 60.07, + "eval_steps_per_second": 1.882, + "step": 11550 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019479900867010893, + "loss": 1.3226, + "step": 11560 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019478826836793055, + "loss": 1.3634, + "step": 11580 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019477752806575213, + "loss": 1.3406, + "step": 11600 + }, + { + "epoch": 0.62, + "eval_loss": 1.3981132507324219, + "eval_runtime": 49.9056, + "eval_samples_per_second": 60.113, + "eval_steps_per_second": 1.884, + "step": 11600 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019476678776357375, + "loss": 1.3684, + "step": 11620 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019475604746139533, + "loss": 1.3618, + "step": 11640 + }, + { + "epoch": 0.62, + "eval_loss": 1.3980711698532104, + "eval_runtime": 49.9349, + "eval_samples_per_second": 60.078, + "eval_steps_per_second": 1.882, + "step": 11650 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019474530715921692, + "loss": 1.365, + "step": 11660 + }, + { + "epoch": 0.62, + "learning_rate": 0.00019473456685703853, + "loss": 1.3526, + "step": 11680 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019472382655486012, + "loss": 1.3697, + "step": 11700 + }, + { + "epoch": 0.63, + "eval_loss": 1.3980944156646729, + "eval_runtime": 49.9354, + "eval_samples_per_second": 60.078, + "eval_steps_per_second": 1.882, + "step": 11700 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019471308625268173, + "loss": 1.3351, + "step": 11720 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019470234595050332, + "loss": 1.3683, + "step": 11740 + }, + { + "epoch": 0.63, + "eval_loss": 1.3976329565048218, + "eval_runtime": 49.9772, + "eval_samples_per_second": 60.027, + "eval_steps_per_second": 1.881, + "step": 11750 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001946916056483249, + "loss": 1.3428, + "step": 11760 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019468086534614652, + "loss": 1.3529, + "step": 11780 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019467066205907705, + "loss": 1.339, + "step": 11800 + }, + { + "epoch": 0.63, + "eval_loss": 1.3983910083770752, + "eval_runtime": 49.9014, + "eval_samples_per_second": 60.119, + "eval_steps_per_second": 1.884, + "step": 11800 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019465992175689864, + "loss": 1.3442, + "step": 11820 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019464918145472025, + "loss": 1.3504, + "step": 11840 + }, + { + "epoch": 0.63, + "eval_loss": 1.3987594842910767, + "eval_runtime": 49.9806, + "eval_samples_per_second": 60.023, + "eval_steps_per_second": 1.881, + "step": 11850 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019463844115254184, + "loss": 1.34, + "step": 11860 + }, + { + "epoch": 0.63, + "learning_rate": 0.00019462770085036343, + "loss": 1.3778, + "step": 11880 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019461696054818504, + "loss": 1.3813, + "step": 11900 + }, + { + "epoch": 0.64, + "eval_loss": 1.3981685638427734, + "eval_runtime": 49.9063, + "eval_samples_per_second": 60.113, + "eval_steps_per_second": 1.884, + "step": 11900 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019460622024600663, + "loss": 1.3647, + "step": 11920 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019459547994382824, + "loss": 1.3227, + "step": 11940 + }, + { + "epoch": 0.64, + "eval_loss": 1.398089051246643, + "eval_runtime": 49.9227, + "eval_samples_per_second": 60.093, + "eval_steps_per_second": 1.883, + "step": 11950 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019458473964164983, + "loss": 1.3295, + "step": 11960 + }, + { + "epoch": 0.64, + "learning_rate": 0.0001945739993394714, + "loss": 1.3703, + "step": 11980 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019456325903729303, + "loss": 1.3861, + "step": 12000 + }, + { + "epoch": 0.64, + "eval_loss": 1.3978980779647827, + "eval_runtime": 49.9428, + "eval_samples_per_second": 60.069, + "eval_steps_per_second": 1.882, + "step": 12000 + }, + { + "epoch": 0.64, + "learning_rate": 0.0001945525187351146, + "loss": 1.3193, + "step": 12020 + }, + { + "epoch": 0.64, + "learning_rate": 0.00019454177843293623, + "loss": 1.3136, + "step": 12040 + }, + { + "epoch": 0.64, + "eval_loss": 1.3983912467956543, + "eval_runtime": 49.9338, + "eval_samples_per_second": 60.079, + "eval_steps_per_second": 1.882, + "step": 12050 + }, + { + "epoch": 0.64, + "learning_rate": 0.0001945310381307578, + "loss": 1.3612, + "step": 12060 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001945202978285794, + "loss": 1.3684, + "step": 12080 + }, + { + "epoch": 0.65, + "learning_rate": 0.000194509557526401, + "loss": 1.3534, + "step": 12100 + }, + { + "epoch": 0.65, + "eval_loss": 1.3977357149124146, + "eval_runtime": 49.9318, + "eval_samples_per_second": 60.082, + "eval_steps_per_second": 1.883, + "step": 12100 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001944988172242226, + "loss": 1.3366, + "step": 12120 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001944880769220442, + "loss": 1.3793, + "step": 12140 + }, + { + "epoch": 0.65, + "eval_loss": 1.3974858522415161, + "eval_runtime": 49.9338, + "eval_samples_per_second": 60.08, + "eval_steps_per_second": 1.882, + "step": 12150 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001944773366198658, + "loss": 1.3434, + "step": 12160 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019446659631768739, + "loss": 1.3663, + "step": 12180 + }, + { + "epoch": 0.65, + "learning_rate": 0.000194455856015509, + "loss": 1.3569, + "step": 12200 + }, + { + "epoch": 0.65, + "eval_loss": 1.3975430727005005, + "eval_runtime": 49.935, + "eval_samples_per_second": 60.078, + "eval_steps_per_second": 1.882, + "step": 12200 + }, + { + "epoch": 0.65, + "learning_rate": 0.00019444511571333059, + "loss": 1.3211, + "step": 12220 + }, + { + "epoch": 0.65, + "learning_rate": 0.0001944343754111522, + "loss": 1.3385, + "step": 12240 + }, + { + "epoch": 0.65, + "eval_loss": 1.397087574005127, + "eval_runtime": 49.9229, + "eval_samples_per_second": 60.093, + "eval_steps_per_second": 1.883, + "step": 12250 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019442363510897379, + "loss": 1.3647, + "step": 12260 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001944128948067954, + "loss": 1.3593, + "step": 12280 + }, + { + "epoch": 0.66, + "learning_rate": 0.000194402154504617, + "loss": 1.315, + "step": 12300 + }, + { + "epoch": 0.66, + "eval_loss": 1.3981412649154663, + "eval_runtime": 49.8967, + "eval_samples_per_second": 60.124, + "eval_steps_per_second": 1.884, + "step": 12300 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001943914142024386, + "loss": 1.3474, + "step": 12320 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001943806739002602, + "loss": 1.3415, + "step": 12340 + }, + { + "epoch": 0.66, + "eval_loss": 1.3985828161239624, + "eval_runtime": 49.9464, + "eval_samples_per_second": 60.064, + "eval_steps_per_second": 1.882, + "step": 12350 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001943699335980818, + "loss": 1.3482, + "step": 12360 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019435919329590339, + "loss": 1.3501, + "step": 12380 + }, + { + "epoch": 0.66, + "learning_rate": 0.000194348452993725, + "loss": 1.3659, + "step": 12400 + }, + { + "epoch": 0.66, + "eval_loss": 1.3977829217910767, + "eval_runtime": 49.9024, + "eval_samples_per_second": 60.117, + "eval_steps_per_second": 1.884, + "step": 12400 + }, + { + "epoch": 0.66, + "learning_rate": 0.00019433771269154659, + "loss": 1.3447, + "step": 12420 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001943269723893682, + "loss": 1.3557, + "step": 12440 + }, + { + "epoch": 0.67, + "eval_loss": 1.397894263267517, + "eval_runtime": 49.9359, + "eval_samples_per_second": 60.077, + "eval_steps_per_second": 1.882, + "step": 12450 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019431623208718979, + "loss": 1.3797, + "step": 12460 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019430549178501137, + "loss": 1.3659, + "step": 12480 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019429475148283299, + "loss": 1.3196, + "step": 12500 + }, + { + "epoch": 0.67, + "eval_loss": 1.3980075120925903, + "eval_runtime": 49.9211, + "eval_samples_per_second": 60.095, + "eval_steps_per_second": 1.883, + "step": 12500 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019428401118065457, + "loss": 1.3175, + "step": 12520 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019427327087847619, + "loss": 1.3826, + "step": 12540 + }, + { + "epoch": 0.67, + "eval_loss": 1.3978780508041382, + "eval_runtime": 49.9413, + "eval_samples_per_second": 60.07, + "eval_steps_per_second": 1.882, + "step": 12550 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019426253057629777, + "loss": 1.3643, + "step": 12560 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019425179027411936, + "loss": 1.35, + "step": 12580 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019424104997194097, + "loss": 1.4053, + "step": 12600 + }, + { + "epoch": 0.67, + "eval_loss": 1.398128867149353, + "eval_runtime": 49.914, + "eval_samples_per_second": 60.103, + "eval_steps_per_second": 1.883, + "step": 12600 + }, + { + "epoch": 0.67, + "learning_rate": 0.00019423030966976256, + "loss": 1.3571, + "step": 12620 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019421956936758417, + "loss": 1.3333, + "step": 12640 + }, + { + "epoch": 0.68, + "eval_loss": 1.3984968662261963, + "eval_runtime": 49.9223, + "eval_samples_per_second": 60.093, + "eval_steps_per_second": 1.883, + "step": 12650 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019420882906540576, + "loss": 1.3146, + "step": 12660 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019419808876322737, + "loss": 1.3332, + "step": 12680 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019418734846104896, + "loss": 1.3395, + "step": 12700 + }, + { + "epoch": 0.68, + "eval_loss": 1.3976988792419434, + "eval_runtime": 49.8906, + "eval_samples_per_second": 60.132, + "eval_steps_per_second": 1.884, + "step": 12700 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019417714517397947, + "loss": 1.3311, + "step": 12720 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019416640487180105, + "loss": 1.3763, + "step": 12740 + }, + { + "epoch": 0.68, + "eval_loss": 1.3981629610061646, + "eval_runtime": 49.9166, + "eval_samples_per_second": 60.1, + "eval_steps_per_second": 1.883, + "step": 12750 + }, + { + "epoch": 0.68, + "learning_rate": 0.0001941556645696227, + "loss": 1.3319, + "step": 12760 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019414492426744428, + "loss": 1.3113, + "step": 12780 + }, + { + "epoch": 0.68, + "learning_rate": 0.0001941341839652659, + "loss": 1.3508, + "step": 12800 + }, + { + "epoch": 0.68, + "eval_loss": 1.3979384899139404, + "eval_runtime": 49.8894, + "eval_samples_per_second": 60.133, + "eval_steps_per_second": 1.884, + "step": 12800 + }, + { + "epoch": 0.68, + "learning_rate": 0.00019412344366308748, + "loss": 1.3468, + "step": 12820 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019411270336090907, + "loss": 1.3509, + "step": 12840 + }, + { + "epoch": 0.69, + "eval_loss": 1.3976484537124634, + "eval_runtime": 49.9245, + "eval_samples_per_second": 60.091, + "eval_steps_per_second": 1.883, + "step": 12850 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019410196305873068, + "loss": 1.3474, + "step": 12860 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019409122275655227, + "loss": 1.3796, + "step": 12880 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019408048245437388, + "loss": 1.362, + "step": 12900 + }, + { + "epoch": 0.69, + "eval_loss": 1.3983675241470337, + "eval_runtime": 49.9089, + "eval_samples_per_second": 60.109, + "eval_steps_per_second": 1.883, + "step": 12900 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019406974215219547, + "loss": 1.3926, + "step": 12920 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019405900185001705, + "loss": 1.3812, + "step": 12940 + }, + { + "epoch": 0.69, + "eval_loss": 1.3985722064971924, + "eval_runtime": 49.9385, + "eval_samples_per_second": 60.074, + "eval_steps_per_second": 1.882, + "step": 12950 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019404826154783867, + "loss": 1.3442, + "step": 12960 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019403752124566025, + "loss": 1.3183, + "step": 12980 + }, + { + "epoch": 0.69, + "learning_rate": 0.00019402678094348187, + "loss": 1.3254, + "step": 13000 + }, + { + "epoch": 0.69, + "eval_loss": 1.3993173837661743, + "eval_runtime": 49.8843, + "eval_samples_per_second": 60.139, + "eval_steps_per_second": 1.884, + "step": 13000 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019401604064130345, + "loss": 1.3607, + "step": 13020 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019400530033912504, + "loss": 1.3324, + "step": 13040 + }, + { + "epoch": 0.7, + "eval_loss": 1.398432970046997, + "eval_runtime": 49.9241, + "eval_samples_per_second": 60.091, + "eval_steps_per_second": 1.883, + "step": 13050 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019399456003694665, + "loss": 1.3898, + "step": 13060 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019398381973476824, + "loss": 1.337, + "step": 13080 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019397307943258985, + "loss": 1.3527, + "step": 13100 + }, + { + "epoch": 0.7, + "eval_loss": 1.3988566398620605, + "eval_runtime": 49.8968, + "eval_samples_per_second": 60.124, + "eval_steps_per_second": 1.884, + "step": 13100 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019396233913041144, + "loss": 1.3584, + "step": 13120 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019395159882823303, + "loss": 1.3111, + "step": 13140 + }, + { + "epoch": 0.7, + "eval_loss": 1.3984296321868896, + "eval_runtime": 49.9247, + "eval_samples_per_second": 60.091, + "eval_steps_per_second": 1.883, + "step": 13150 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019394085852605464, + "loss": 1.3707, + "step": 13160 + }, + { + "epoch": 0.7, + "learning_rate": 0.00019393011822387623, + "loss": 1.3172, + "step": 13180 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019391937792169784, + "loss": 1.3356, + "step": 13200 + }, + { + "epoch": 0.71, + "eval_loss": 1.398664116859436, + "eval_runtime": 49.933, + "eval_samples_per_second": 60.081, + "eval_steps_per_second": 1.883, + "step": 13200 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019390863761951943, + "loss": 1.3222, + "step": 13220 + }, + { + "epoch": 0.71, + "learning_rate": 0.000193897897317341, + "loss": 1.3346, + "step": 13240 + }, + { + "epoch": 0.71, + "eval_loss": 1.3981634378433228, + "eval_runtime": 49.9997, + "eval_samples_per_second": 60.0, + "eval_steps_per_second": 1.88, + "step": 13250 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019388715701516263, + "loss": 1.3066, + "step": 13260 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001938764167129842, + "loss": 1.3333, + "step": 13280 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019386567641080583, + "loss": 1.324, + "step": 13300 + }, + { + "epoch": 0.71, + "eval_loss": 1.3983701467514038, + "eval_runtime": 49.8793, + "eval_samples_per_second": 60.145, + "eval_steps_per_second": 1.885, + "step": 13300 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001938549361086274, + "loss": 1.3482, + "step": 13320 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019384419580644903, + "loss": 1.3449, + "step": 13340 + }, + { + "epoch": 0.71, + "eval_loss": 1.3981118202209473, + "eval_runtime": 49.9209, + "eval_samples_per_second": 60.095, + "eval_steps_per_second": 1.883, + "step": 13350 + }, + { + "epoch": 0.71, + "learning_rate": 0.0001938334555042706, + "loss": 1.3021, + "step": 13360 + }, + { + "epoch": 0.71, + "learning_rate": 0.00019382271520209223, + "loss": 1.3314, + "step": 13380 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019381197489991384, + "loss": 1.3431, + "step": 13400 + }, + { + "epoch": 0.72, + "eval_loss": 1.397736668586731, + "eval_runtime": 49.9221, + "eval_samples_per_second": 60.094, + "eval_steps_per_second": 1.883, + "step": 13400 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019380123459773543, + "loss": 1.3641, + "step": 13420 + }, + { + "epoch": 0.72, + "learning_rate": 0.000193790494295557, + "loss": 1.3978, + "step": 13440 + }, + { + "epoch": 0.72, + "eval_loss": 1.3974454402923584, + "eval_runtime": 49.9612, + "eval_samples_per_second": 60.047, + "eval_steps_per_second": 1.881, + "step": 13450 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019377975399337863, + "loss": 1.3549, + "step": 13460 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001937690136912002, + "loss": 1.3348, + "step": 13480 + }, + { + "epoch": 0.72, + "learning_rate": 0.00019375827338902183, + "loss": 1.3524, + "step": 13500 + }, + { + "epoch": 0.72, + "eval_loss": 1.3968658447265625, + "eval_runtime": 49.8947, + "eval_samples_per_second": 60.127, + "eval_steps_per_second": 1.884, + "step": 13500 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001937475330868434, + "loss": 1.3652, + "step": 13520 + }, + { + "epoch": 0.72, + "learning_rate": 0.000193736792784665, + "loss": 1.313, + "step": 13540 + }, + { + "epoch": 0.72, + "eval_loss": 1.3979847431182861, + "eval_runtime": 49.9397, + "eval_samples_per_second": 60.072, + "eval_steps_per_second": 1.882, + "step": 13550 + }, + { + "epoch": 0.72, + "learning_rate": 0.0001937260524824866, + "loss": 1.3404, + "step": 13560 + }, + { + "epoch": 0.73, + "learning_rate": 0.0001937153121803082, + "loss": 1.356, + "step": 13580 + }, + { + "epoch": 0.73, + "learning_rate": 0.0001937045718781298, + "loss": 1.3845, + "step": 13600 + }, + { + "epoch": 0.73, + "eval_loss": 1.398152470588684, + "eval_runtime": 49.908, + "eval_samples_per_second": 60.111, + "eval_steps_per_second": 1.883, + "step": 13600 + }, + { + "epoch": 0.73, + "learning_rate": 0.0001936938315759514, + "loss": 1.3063, + "step": 13620 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019368309127377299, + "loss": 1.3415, + "step": 13640 + }, + { + "epoch": 0.73, + "eval_loss": 1.397993803024292, + "eval_runtime": 49.8822, + "eval_samples_per_second": 60.142, + "eval_steps_per_second": 1.884, + "step": 13650 + }, + { + "epoch": 0.73, + "learning_rate": 0.0001936723509715946, + "loss": 1.3344, + "step": 13660 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019366161066941619, + "loss": 1.3579, + "step": 13680 + }, + { + "epoch": 0.73, + "learning_rate": 0.0001936508703672378, + "loss": 1.3523, + "step": 13700 + }, + { + "epoch": 0.73, + "eval_loss": 1.3971080780029297, + "eval_runtime": 49.8677, + "eval_samples_per_second": 60.159, + "eval_steps_per_second": 1.885, + "step": 13700 + }, + { + "epoch": 0.73, + "learning_rate": 0.00019364013006505939, + "loss": 1.3381, + "step": 13720 + }, + { + "epoch": 0.73, + "learning_rate": 0.000193629389762881, + "loss": 1.3221, + "step": 13740 + }, + { + "epoch": 0.73, + "eval_loss": 1.3974502086639404, + "eval_runtime": 49.9344, + "eval_samples_per_second": 60.079, + "eval_steps_per_second": 1.882, + "step": 13750 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019361864946070259, + "loss": 1.3752, + "step": 13760 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019360790915852417, + "loss": 1.3802, + "step": 13780 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019359716885634579, + "loss": 1.3549, + "step": 13800 + }, + { + "epoch": 0.74, + "eval_loss": 1.397215485572815, + "eval_runtime": 49.8832, + "eval_samples_per_second": 60.141, + "eval_steps_per_second": 1.884, + "step": 13800 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019358642855416737, + "loss": 1.3276, + "step": 13820 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019357568825198899, + "loss": 1.3705, + "step": 13840 + }, + { + "epoch": 0.74, + "eval_loss": 1.3972536325454712, + "eval_runtime": 49.9311, + "eval_samples_per_second": 60.083, + "eval_steps_per_second": 1.883, + "step": 13850 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019356494794981057, + "loss": 1.3858, + "step": 13860 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019355420764763216, + "loss": 1.3432, + "step": 13880 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019354346734545377, + "loss": 1.3265, + "step": 13900 + }, + { + "epoch": 0.74, + "eval_loss": 1.3972595930099487, + "eval_runtime": 49.9209, + "eval_samples_per_second": 60.095, + "eval_steps_per_second": 1.883, + "step": 13900 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019353272704327536, + "loss": 1.3146, + "step": 13920 + }, + { + "epoch": 0.74, + "learning_rate": 0.00019352198674109697, + "loss": 1.3571, + "step": 13940 + }, + { + "epoch": 0.75, + "eval_loss": 1.3972257375717163, + "eval_runtime": 49.9372, + "eval_samples_per_second": 60.075, + "eval_steps_per_second": 1.882, + "step": 13950 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019351124643891856, + "loss": 1.3238, + "step": 13960 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019350050613674015, + "loss": 1.3853, + "step": 13980 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019348976583456176, + "loss": 1.3557, + "step": 14000 + }, + { + "epoch": 0.75, + "eval_loss": 1.3965345621109009, + "eval_runtime": 49.9404, + "eval_samples_per_second": 60.072, + "eval_steps_per_second": 1.882, + "step": 14000 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019347902553238337, + "loss": 1.3851, + "step": 14020 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019346828523020496, + "loss": 1.3795, + "step": 14040 + }, + { + "epoch": 0.75, + "eval_loss": 1.396411418914795, + "eval_runtime": 49.9389, + "eval_samples_per_second": 60.073, + "eval_steps_per_second": 1.882, + "step": 14050 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019345754492802657, + "loss": 1.3692, + "step": 14060 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019344680462584816, + "loss": 1.3435, + "step": 14080 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019343606432366977, + "loss": 1.3379, + "step": 14100 + }, + { + "epoch": 0.75, + "eval_loss": 1.396824836730957, + "eval_runtime": 49.913, + "eval_samples_per_second": 60.105, + "eval_steps_per_second": 1.883, + "step": 14100 + }, + { + "epoch": 0.75, + "learning_rate": 0.00019342532402149136, + "loss": 1.3492, + "step": 14120 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019341458371931295, + "loss": 1.3871, + "step": 14140 + }, + { + "epoch": 0.76, + "eval_loss": 1.397158145904541, + "eval_runtime": 49.9329, + "eval_samples_per_second": 60.081, + "eval_steps_per_second": 1.883, + "step": 14150 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019340384341713456, + "loss": 1.3377, + "step": 14160 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019339310311495615, + "loss": 1.3564, + "step": 14180 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019338236281277776, + "loss": 1.3323, + "step": 14200 + }, + { + "epoch": 0.76, + "eval_loss": 1.3961149454116821, + "eval_runtime": 49.8787, + "eval_samples_per_second": 60.146, + "eval_steps_per_second": 1.885, + "step": 14200 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019337162251059935, + "loss": 1.3267, + "step": 14220 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019336088220842096, + "loss": 1.3704, + "step": 14240 + }, + { + "epoch": 0.76, + "eval_loss": 1.3969779014587402, + "eval_runtime": 49.9253, + "eval_samples_per_second": 60.09, + "eval_steps_per_second": 1.883, + "step": 14250 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019335014190624255, + "loss": 1.3075, + "step": 14260 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019333940160406413, + "loss": 1.3306, + "step": 14280 + }, + { + "epoch": 0.76, + "learning_rate": 0.00019332866130188575, + "loss": 1.3715, + "step": 14300 + }, + { + "epoch": 0.76, + "eval_loss": 1.398051381111145, + "eval_runtime": 49.9213, + "eval_samples_per_second": 60.095, + "eval_steps_per_second": 1.883, + "step": 14300 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019331792099970733, + "loss": 1.3683, + "step": 14320 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019330718069752895, + "loss": 1.3532, + "step": 14340 + }, + { + "epoch": 0.77, + "eval_loss": 1.3968747854232788, + "eval_runtime": 49.8917, + "eval_samples_per_second": 60.13, + "eval_steps_per_second": 1.884, + "step": 14350 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019329644039535053, + "loss": 1.3221, + "step": 14360 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019328570009317212, + "loss": 1.3417, + "step": 14380 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019327495979099373, + "loss": 1.3569, + "step": 14400 + }, + { + "epoch": 0.77, + "eval_loss": 1.3969061374664307, + "eval_runtime": 49.9229, + "eval_samples_per_second": 60.093, + "eval_steps_per_second": 1.883, + "step": 14400 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019326421948881532, + "loss": 1.3145, + "step": 14420 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019325347918663693, + "loss": 1.3658, + "step": 14440 + }, + { + "epoch": 0.77, + "eval_loss": 1.3966419696807861, + "eval_runtime": 49.9352, + "eval_samples_per_second": 60.078, + "eval_steps_per_second": 1.882, + "step": 14450 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019324273888445852, + "loss": 1.3112, + "step": 14460 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001932319985822801, + "loss": 1.3663, + "step": 14480 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019322125828010172, + "loss": 1.344, + "step": 14500 + }, + { + "epoch": 0.77, + "eval_loss": 1.3972774744033813, + "eval_runtime": 49.8941, + "eval_samples_per_second": 60.127, + "eval_steps_per_second": 1.884, + "step": 14500 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001932105179779233, + "loss": 1.4069, + "step": 14520 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019319977767574492, + "loss": 1.317, + "step": 14540 + }, + { + "epoch": 0.78, + "eval_loss": 1.3966619968414307, + "eval_runtime": 49.9177, + "eval_samples_per_second": 60.099, + "eval_steps_per_second": 1.883, + "step": 14550 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001931890373735665, + "loss": 1.3299, + "step": 14560 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001931782970713881, + "loss": 1.3449, + "step": 14580 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001931675567692097, + "loss": 1.3442, + "step": 14600 + }, + { + "epoch": 0.78, + "eval_loss": 1.3963496685028076, + "eval_runtime": 49.871, + "eval_samples_per_second": 60.155, + "eval_steps_per_second": 1.885, + "step": 14600 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001931568164670313, + "loss": 1.323, + "step": 14620 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019314607616485293, + "loss": 1.2905, + "step": 14640 + }, + { + "epoch": 0.78, + "eval_loss": 1.3969212770462036, + "eval_runtime": 49.9598, + "eval_samples_per_second": 60.048, + "eval_steps_per_second": 1.882, + "step": 14650 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019313533586267452, + "loss": 1.3545, + "step": 14660 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001931245955604961, + "loss": 1.3491, + "step": 14680 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019311385525831772, + "loss": 1.3044, + "step": 14700 + }, + { + "epoch": 0.79, + "eval_loss": 1.396906852722168, + "eval_runtime": 49.8743, + "eval_samples_per_second": 60.151, + "eval_steps_per_second": 1.885, + "step": 14700 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001931031149561393, + "loss": 1.3529, + "step": 14720 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019309237465396092, + "loss": 1.357, + "step": 14740 + }, + { + "epoch": 0.79, + "eval_loss": 1.3965973854064941, + "eval_runtime": 49.9265, + "eval_samples_per_second": 60.088, + "eval_steps_per_second": 1.883, + "step": 14750 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001930816343517825, + "loss": 1.3667, + "step": 14760 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001930708940496041, + "loss": 1.387, + "step": 14780 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001930601537474257, + "loss": 1.3874, + "step": 14800 + }, + { + "epoch": 0.79, + "eval_loss": 1.3979121446609497, + "eval_runtime": 49.9282, + "eval_samples_per_second": 60.086, + "eval_steps_per_second": 1.883, + "step": 14800 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001930494134452473, + "loss": 1.3496, + "step": 14820 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001930386731430689, + "loss": 1.368, + "step": 14840 + }, + { + "epoch": 0.79, + "eval_loss": 1.3969990015029907, + "eval_runtime": 49.921, + "eval_samples_per_second": 60.095, + "eval_steps_per_second": 1.883, + "step": 14850 + }, + { + "epoch": 0.79, + "learning_rate": 0.0001930279328408905, + "loss": 1.3156, + "step": 14860 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019301719253871208, + "loss": 1.3375, + "step": 14880 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001930064522365337, + "loss": 1.3167, + "step": 14900 + }, + { + "epoch": 0.8, + "eval_loss": 1.3968135118484497, + "eval_runtime": 49.8772, + "eval_samples_per_second": 60.148, + "eval_steps_per_second": 1.885, + "step": 14900 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019299571193435528, + "loss": 1.3571, + "step": 14920 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001929849716321769, + "loss": 1.3635, + "step": 14940 + }, + { + "epoch": 0.8, + "eval_loss": 1.3962677717208862, + "eval_runtime": 49.9213, + "eval_samples_per_second": 60.095, + "eval_steps_per_second": 1.883, + "step": 14950 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019297423132999848, + "loss": 1.3213, + "step": 14960 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019296349102782006, + "loss": 1.3479, + "step": 14980 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019295275072564168, + "loss": 1.3337, + "step": 15000 + }, + { + "epoch": 0.8, + "eval_loss": 1.3972392082214355, + "eval_runtime": 49.9202, + "eval_samples_per_second": 60.096, + "eval_steps_per_second": 1.883, + "step": 15000 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019294201042346326, + "loss": 1.3587, + "step": 15020 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019293127012128488, + "loss": 1.3067, + "step": 15040 + }, + { + "epoch": 0.8, + "eval_loss": 1.3960102796554565, + "eval_runtime": 49.9235, + "eval_samples_per_second": 60.092, + "eval_steps_per_second": 1.883, + "step": 15050 + }, + { + "epoch": 0.8, + "learning_rate": 0.00019292052981910646, + "loss": 1.3598, + "step": 15060 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019290978951692805, + "loss": 1.3731, + "step": 15080 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019289904921474966, + "loss": 1.3441, + "step": 15100 + }, + { + "epoch": 0.81, + "eval_loss": 1.3960813283920288, + "eval_runtime": 49.8955, + "eval_samples_per_second": 60.126, + "eval_steps_per_second": 1.884, + "step": 15100 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019288830891257125, + "loss": 1.4031, + "step": 15120 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019287756861039286, + "loss": 1.3091, + "step": 15140 + }, + { + "epoch": 0.81, + "eval_loss": 1.3964964151382446, + "eval_runtime": 49.9392, + "eval_samples_per_second": 60.073, + "eval_steps_per_second": 1.882, + "step": 15150 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019286682830821445, + "loss": 1.3879, + "step": 15160 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019285608800603606, + "loss": 1.2988, + "step": 15180 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019284534770385765, + "loss": 1.3283, + "step": 15200 + }, + { + "epoch": 0.81, + "eval_loss": 1.3965301513671875, + "eval_runtime": 49.8964, + "eval_samples_per_second": 60.125, + "eval_steps_per_second": 1.884, + "step": 15200 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019283460740167924, + "loss": 1.296, + "step": 15220 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019282386709950085, + "loss": 1.3221, + "step": 15240 + }, + { + "epoch": 0.81, + "eval_loss": 1.396709680557251, + "eval_runtime": 49.9295, + "eval_samples_per_second": 60.085, + "eval_steps_per_second": 1.883, + "step": 15250 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019281312679732244, + "loss": 1.3727, + "step": 15260 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019280238649514405, + "loss": 1.3346, + "step": 15280 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019279164619296566, + "loss": 1.3356, + "step": 15300 + }, + { + "epoch": 0.82, + "eval_loss": 1.3975436687469482, + "eval_runtime": 49.8854, + "eval_samples_per_second": 60.138, + "eval_steps_per_second": 1.884, + "step": 15300 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019278090589078725, + "loss": 1.3798, + "step": 15320 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019277016558860886, + "loss": 1.3886, + "step": 15340 + }, + { + "epoch": 0.82, + "eval_loss": 1.3962510824203491, + "eval_runtime": 49.9207, + "eval_samples_per_second": 60.095, + "eval_steps_per_second": 1.883, + "step": 15350 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019275942528643045, + "loss": 1.3468, + "step": 15360 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019274868498425204, + "loss": 1.3112, + "step": 15380 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019273794468207365, + "loss": 1.3781, + "step": 15400 + }, + { + "epoch": 0.82, + "eval_loss": 1.396148920059204, + "eval_runtime": 49.8966, + "eval_samples_per_second": 60.124, + "eval_steps_per_second": 1.884, + "step": 15400 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019272720437989524, + "loss": 1.3551, + "step": 15420 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019271646407771685, + "loss": 1.3576, + "step": 15440 + }, + { + "epoch": 0.83, + "eval_loss": 1.3962191343307495, + "eval_runtime": 49.9264, + "eval_samples_per_second": 60.088, + "eval_steps_per_second": 1.883, + "step": 15450 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019270572377553844, + "loss": 1.3325, + "step": 15460 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019269498347336002, + "loss": 1.3269, + "step": 15480 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019268424317118164, + "loss": 1.3609, + "step": 15500 + }, + { + "epoch": 0.83, + "eval_loss": 1.396767258644104, + "eval_runtime": 49.8856, + "eval_samples_per_second": 60.138, + "eval_steps_per_second": 1.884, + "step": 15500 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019267350286900322, + "loss": 1.3357, + "step": 15520 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019266276256682484, + "loss": 1.3444, + "step": 15540 + }, + { + "epoch": 0.83, + "eval_loss": 1.3966103792190552, + "eval_runtime": 49.9428, + "eval_samples_per_second": 60.069, + "eval_steps_per_second": 1.882, + "step": 15550 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019265202226464642, + "loss": 1.3193, + "step": 15560 + }, + { + "epoch": 0.83, + "learning_rate": 0.000192641281962468, + "loss": 1.3816, + "step": 15580 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019263054166028962, + "loss": 1.3225, + "step": 15600 + }, + { + "epoch": 0.83, + "eval_loss": 1.3972910642623901, + "eval_runtime": 49.882, + "eval_samples_per_second": 60.142, + "eval_steps_per_second": 1.884, + "step": 15600 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001926198013581112, + "loss": 1.3317, + "step": 15620 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019260906105593282, + "loss": 1.3477, + "step": 15640 + }, + { + "epoch": 0.84, + "eval_loss": 1.3958677053451538, + "eval_runtime": 49.9085, + "eval_samples_per_second": 60.11, + "eval_steps_per_second": 1.883, + "step": 15650 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001925983207537544, + "loss": 1.3987, + "step": 15660 + }, + { + "epoch": 0.84, + "learning_rate": 0.00019258758045157602, + "loss": 1.3268, + "step": 15680 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001925768401493976, + "loss": 1.3587, + "step": 15700 + }, + { + "epoch": 0.84, + "eval_loss": 1.3959128856658936, + "eval_runtime": 49.9226, + "eval_samples_per_second": 60.093, + "eval_steps_per_second": 1.883, + "step": 15700 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001925660998472192, + "loss": 1.3475, + "step": 15720 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001925553595450408, + "loss": 1.3201, + "step": 15740 + }, + { + "epoch": 0.84, + "eval_loss": 1.3962161540985107, + "eval_runtime": 49.972, + "eval_samples_per_second": 60.034, + "eval_steps_per_second": 1.881, + "step": 15750 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001925446192428624, + "loss": 1.3714, + "step": 15760 + }, + { + "epoch": 0.84, + "learning_rate": 0.000192533878940684, + "loss": 1.3036, + "step": 15780 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001925231386385056, + "loss": 1.3653, + "step": 15800 + }, + { + "epoch": 0.84, + "eval_loss": 1.395885705947876, + "eval_runtime": 49.9021, + "eval_samples_per_second": 60.118, + "eval_steps_per_second": 1.884, + "step": 15800 + }, + { + "epoch": 0.85, + "learning_rate": 0.00019251239833632718, + "loss": 1.3571, + "step": 15820 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001925016580341488, + "loss": 1.3387, + "step": 15840 + }, + { + "epoch": 0.85, + "eval_loss": 1.3962838649749756, + "eval_runtime": 49.9295, + "eval_samples_per_second": 60.085, + "eval_steps_per_second": 1.883, + "step": 15850 + }, + { + "epoch": 0.85, + "learning_rate": 0.00019249091773197038, + "loss": 1.3645, + "step": 15860 + }, + { + "epoch": 0.85, + "learning_rate": 0.000192480177429792, + "loss": 1.3541, + "step": 15880 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001924694371276136, + "loss": 1.3139, + "step": 15900 + }, + { + "epoch": 0.85, + "eval_loss": 1.3968653678894043, + "eval_runtime": 49.9077, + "eval_samples_per_second": 60.111, + "eval_steps_per_second": 1.883, + "step": 15900 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001924586968254352, + "loss": 1.4122, + "step": 15920 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001924479565232568, + "loss": 1.3177, + "step": 15940 + }, + { + "epoch": 0.85, + "eval_loss": 1.3964040279388428, + "eval_runtime": 49.9666, + "eval_samples_per_second": 60.04, + "eval_steps_per_second": 1.881, + "step": 15950 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001924372162210784, + "loss": 1.3521, + "step": 15960 + }, + { + "epoch": 0.85, + "learning_rate": 0.00019242647591889998, + "loss": 1.3372, + "step": 15980 + }, + { + "epoch": 0.85, + "learning_rate": 0.0001924157356167216, + "loss": 1.3797, + "step": 16000 + }, + { + "epoch": 0.85, + "eval_loss": 1.3961364030838013, + "eval_runtime": 49.8963, + "eval_samples_per_second": 60.125, + "eval_steps_per_second": 1.884, + "step": 16000 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019240499531454318, + "loss": 1.339, + "step": 16020 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001923942550123648, + "loss": 1.3424, + "step": 16040 + }, + { + "epoch": 0.86, + "eval_loss": 1.3959347009658813, + "eval_runtime": 49.9267, + "eval_samples_per_second": 60.088, + "eval_steps_per_second": 1.883, + "step": 16050 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019238351471018638, + "loss": 1.3939, + "step": 16060 + }, + { + "epoch": 0.86, + "learning_rate": 0.000192372774408008, + "loss": 1.2875, + "step": 16080 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019236203410582958, + "loss": 1.3484, + "step": 16100 + }, + { + "epoch": 0.86, + "eval_loss": 1.3949804306030273, + "eval_runtime": 49.8842, + "eval_samples_per_second": 60.139, + "eval_steps_per_second": 1.884, + "step": 16100 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019235129380365117, + "loss": 1.3581, + "step": 16120 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019234055350147278, + "loss": 1.3377, + "step": 16140 + }, + { + "epoch": 0.86, + "eval_loss": 1.3949840068817139, + "eval_runtime": 49.896, + "eval_samples_per_second": 60.125, + "eval_steps_per_second": 1.884, + "step": 16150 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019232981319929437, + "loss": 1.3687, + "step": 16160 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019231907289711598, + "loss": 1.3652, + "step": 16180 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019230833259493757, + "loss": 1.354, + "step": 16200 + }, + { + "epoch": 0.87, + "eval_loss": 1.394178867340088, + "eval_runtime": 49.8937, + "eval_samples_per_second": 60.128, + "eval_steps_per_second": 1.884, + "step": 16200 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019229759229275916, + "loss": 1.3611, + "step": 16220 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019228685199058077, + "loss": 1.3312, + "step": 16240 + }, + { + "epoch": 0.87, + "eval_loss": 1.395208716392517, + "eval_runtime": 49.9448, + "eval_samples_per_second": 60.066, + "eval_steps_per_second": 1.882, + "step": 16250 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019227611168840236, + "loss": 1.3424, + "step": 16260 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019226537138622397, + "loss": 1.3186, + "step": 16280 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019225463108404556, + "loss": 1.2937, + "step": 16300 + }, + { + "epoch": 0.87, + "eval_loss": 1.3952096700668335, + "eval_runtime": 49.8883, + "eval_samples_per_second": 60.134, + "eval_steps_per_second": 1.884, + "step": 16300 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019224389078186714, + "loss": 1.3298, + "step": 16320 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019223315047968876, + "loss": 1.3007, + "step": 16340 + }, + { + "epoch": 0.87, + "eval_loss": 1.3952969312667847, + "eval_runtime": 49.9449, + "eval_samples_per_second": 60.066, + "eval_steps_per_second": 1.882, + "step": 16350 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019222241017751034, + "loss": 1.3539, + "step": 16360 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019221166987533196, + "loss": 1.3331, + "step": 16380 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019220092957315354, + "loss": 1.3799, + "step": 16400 + }, + { + "epoch": 0.88, + "eval_loss": 1.3953661918640137, + "eval_runtime": 49.9059, + "eval_samples_per_second": 60.113, + "eval_steps_per_second": 1.884, + "step": 16400 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019219018927097513, + "loss": 1.3345, + "step": 16420 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019217944896879674, + "loss": 1.349, + "step": 16440 + }, + { + "epoch": 0.88, + "eval_loss": 1.3948606252670288, + "eval_runtime": 49.9298, + "eval_samples_per_second": 60.084, + "eval_steps_per_second": 1.883, + "step": 16450 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019216870866661833, + "loss": 1.32, + "step": 16460 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019215796836443994, + "loss": 1.3409, + "step": 16480 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019214722806226153, + "loss": 1.3658, + "step": 16500 + }, + { + "epoch": 0.88, + "eval_loss": 1.3949049711227417, + "eval_runtime": 49.8722, + "eval_samples_per_second": 60.154, + "eval_steps_per_second": 1.885, + "step": 16500 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019213648776008311, + "loss": 1.3116, + "step": 16520 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019212574745790476, + "loss": 1.3559, + "step": 16540 + }, + { + "epoch": 0.88, + "eval_loss": 1.394840121269226, + "eval_runtime": 49.9341, + "eval_samples_per_second": 60.079, + "eval_steps_per_second": 1.882, + "step": 16550 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019211500715572634, + "loss": 1.3223, + "step": 16560 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019210426685354796, + "loss": 1.3238, + "step": 16580 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019209352655136954, + "loss": 1.3266, + "step": 16600 + }, + { + "epoch": 0.89, + "eval_loss": 1.3949869871139526, + "eval_runtime": 49.894, + "eval_samples_per_second": 60.127, + "eval_steps_per_second": 1.884, + "step": 16600 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019208278624919113, + "loss": 1.3232, + "step": 16620 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019207204594701274, + "loss": 1.3006, + "step": 16640 + }, + { + "epoch": 0.89, + "eval_loss": 1.3941547870635986, + "eval_runtime": 49.9595, + "eval_samples_per_second": 60.049, + "eval_steps_per_second": 1.882, + "step": 16650 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019206130564483433, + "loss": 1.3501, + "step": 16660 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019205056534265594, + "loss": 1.3213, + "step": 16680 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019203982504047753, + "loss": 1.3421, + "step": 16700 + }, + { + "epoch": 0.89, + "eval_loss": 1.394352674484253, + "eval_runtime": 49.9085, + "eval_samples_per_second": 60.11, + "eval_steps_per_second": 1.883, + "step": 16700 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019202908473829911, + "loss": 1.3582, + "step": 16720 + }, + { + "epoch": 0.89, + "learning_rate": 0.00019201834443612073, + "loss": 1.3153, + "step": 16740 + }, + { + "epoch": 0.89, + "eval_loss": 1.394830584526062, + "eval_runtime": 49.8738, + "eval_samples_per_second": 60.152, + "eval_steps_per_second": 1.885, + "step": 16750 + }, + { + "epoch": 0.9, + "learning_rate": 0.00019200760413394231, + "loss": 1.2934, + "step": 16760 + }, + { + "epoch": 0.9, + "learning_rate": 0.00019199686383176393, + "loss": 1.3533, + "step": 16780 + }, + { + "epoch": 0.9, + "learning_rate": 0.00019198612352958551, + "loss": 1.3527, + "step": 16800 + }, + { + "epoch": 0.9, + "eval_loss": 1.3951090574264526, + "eval_runtime": 49.8917, + "eval_samples_per_second": 60.13, + "eval_steps_per_second": 1.884, + "step": 16800 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001919753832274071, + "loss": 1.3078, + "step": 16820 + }, + { + "epoch": 0.9, + "learning_rate": 0.00019196464292522871, + "loss": 1.3873, + "step": 16840 + }, + { + "epoch": 0.9, + "eval_loss": 1.3950159549713135, + "eval_runtime": 49.9075, + "eval_samples_per_second": 60.111, + "eval_steps_per_second": 1.883, + "step": 16850 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001919539026230503, + "loss": 1.3461, + "step": 16860 + }, + { + "epoch": 0.9, + "learning_rate": 0.00019194316232087191, + "loss": 1.34, + "step": 16880 + }, + { + "epoch": 0.9, + "learning_rate": 0.00019193295903380242, + "loss": 1.2995, + "step": 16900 + }, + { + "epoch": 0.9, + "eval_loss": 1.3943397998809814, + "eval_runtime": 49.8823, + "eval_samples_per_second": 60.142, + "eval_steps_per_second": 1.884, + "step": 16900 + }, + { + "epoch": 0.9, + "learning_rate": 0.000191922218731624, + "loss": 1.3687, + "step": 16920 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019191147842944562, + "loss": 1.3517, + "step": 16940 + }, + { + "epoch": 0.91, + "eval_loss": 1.395455241203308, + "eval_runtime": 49.9332, + "eval_samples_per_second": 60.08, + "eval_steps_per_second": 1.883, + "step": 16950 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001919007381272672, + "loss": 1.3321, + "step": 16960 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001918899978250888, + "loss": 1.2743, + "step": 16980 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019187925752291044, + "loss": 1.3643, + "step": 17000 + }, + { + "epoch": 0.91, + "eval_loss": 1.3946868181228638, + "eval_runtime": 51.5397, + "eval_samples_per_second": 58.208, + "eval_steps_per_second": 1.824, + "step": 17000 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019186851722073202, + "loss": 1.3694, + "step": 17020 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001918577769185536, + "loss": 1.3556, + "step": 17040 + }, + { + "epoch": 0.91, + "eval_loss": 1.3951468467712402, + "eval_runtime": 53.3639, + "eval_samples_per_second": 56.218, + "eval_steps_per_second": 1.761, + "step": 17050 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019184703661637522, + "loss": 1.29, + "step": 17060 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001918362963141968, + "loss": 1.3841, + "step": 17080 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019182555601201842, + "loss": 1.3496, + "step": 17100 + }, + { + "epoch": 0.91, + "eval_loss": 1.3944398164749146, + "eval_runtime": 53.236, + "eval_samples_per_second": 56.353, + "eval_steps_per_second": 1.766, + "step": 17100 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019181481570984, + "loss": 1.3039, + "step": 17120 + }, + { + "epoch": 0.92, + "learning_rate": 0.00019180407540766162, + "loss": 1.3364, + "step": 17140 + }, + { + "epoch": 0.92, + "eval_loss": 1.3947904109954834, + "eval_runtime": 53.3188, + "eval_samples_per_second": 56.265, + "eval_steps_per_second": 1.763, + "step": 17150 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001917933351054832, + "loss": 1.3622, + "step": 17160 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001917825948033048, + "loss": 1.3147, + "step": 17180 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001917718545011264, + "loss": 1.3449, + "step": 17200 + }, + { + "epoch": 0.92, + "eval_loss": 1.3942961692810059, + "eval_runtime": 53.2529, + "eval_samples_per_second": 56.335, + "eval_steps_per_second": 1.765, + "step": 17200 + }, + { + "epoch": 0.92, + "learning_rate": 0.000191761114198948, + "loss": 1.3413, + "step": 17220 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001917503738967696, + "loss": 1.357, + "step": 17240 + }, + { + "epoch": 0.92, + "eval_loss": 1.3949023485183716, + "eval_runtime": 53.3355, + "eval_samples_per_second": 56.248, + "eval_steps_per_second": 1.762, + "step": 17250 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001917396335945912, + "loss": 1.3537, + "step": 17260 + }, + { + "epoch": 0.92, + "learning_rate": 0.00019172889329241278, + "loss": 1.3437, + "step": 17280 + }, + { + "epoch": 0.92, + "learning_rate": 0.0001917181529902344, + "loss": 1.3773, + "step": 17300 + }, + { + "epoch": 0.92, + "eval_loss": 1.395226240158081, + "eval_runtime": 53.1999, + "eval_samples_per_second": 56.391, + "eval_steps_per_second": 1.767, + "step": 17300 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019170741268805598, + "loss": 1.3632, + "step": 17320 + }, + { + "epoch": 0.93, + "learning_rate": 0.0001916966723858776, + "loss": 1.3362, + "step": 17340 + }, + { + "epoch": 0.93, + "eval_loss": 1.3946948051452637, + "eval_runtime": 53.2978, + "eval_samples_per_second": 56.287, + "eval_steps_per_second": 1.764, + "step": 17350 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019168593208369918, + "loss": 1.3758, + "step": 17360 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019167519178152077, + "loss": 1.3908, + "step": 17380 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019166445147934238, + "loss": 1.328, + "step": 17400 + }, + { + "epoch": 0.93, + "eval_loss": 1.3953857421875, + "eval_runtime": 53.208, + "eval_samples_per_second": 56.382, + "eval_steps_per_second": 1.767, + "step": 17400 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019165371117716397, + "loss": 1.32, + "step": 17420 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019164297087498558, + "loss": 1.3451, + "step": 17440 + }, + { + "epoch": 0.93, + "eval_loss": 1.395330786705017, + "eval_runtime": 53.3471, + "eval_samples_per_second": 56.235, + "eval_steps_per_second": 1.762, + "step": 17450 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019163223057280717, + "loss": 1.301, + "step": 17460 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019162149027062876, + "loss": 1.3185, + "step": 17480 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019161074996845037, + "loss": 1.3473, + "step": 17500 + }, + { + "epoch": 0.94, + "eval_loss": 1.3948390483856201, + "eval_runtime": 53.2209, + "eval_samples_per_second": 56.369, + "eval_steps_per_second": 1.766, + "step": 17500 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019160000966627196, + "loss": 1.3131, + "step": 17520 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019158926936409357, + "loss": 1.3577, + "step": 17540 + }, + { + "epoch": 0.94, + "eval_loss": 1.3952394723892212, + "eval_runtime": 53.3853, + "eval_samples_per_second": 56.195, + "eval_steps_per_second": 1.761, + "step": 17550 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019157852906191516, + "loss": 1.3114, + "step": 17560 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019156778875973674, + "loss": 1.301, + "step": 17580 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019155704845755836, + "loss": 1.3607, + "step": 17600 + }, + { + "epoch": 0.94, + "eval_loss": 1.3949131965637207, + "eval_runtime": 53.3252, + "eval_samples_per_second": 56.259, + "eval_steps_per_second": 1.763, + "step": 17600 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019154630815537994, + "loss": 1.3334, + "step": 17620 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019153556785320158, + "loss": 1.3423, + "step": 17640 + }, + { + "epoch": 0.94, + "eval_loss": 1.3943798542022705, + "eval_runtime": 53.2682, + "eval_samples_per_second": 56.319, + "eval_steps_per_second": 1.765, + "step": 17650 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019152482755102317, + "loss": 1.3629, + "step": 17660 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019151408724884476, + "loss": 1.3538, + "step": 17680 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019150334694666637, + "loss": 1.333, + "step": 17700 + }, + { + "epoch": 0.95, + "eval_loss": 1.3944674730300903, + "eval_runtime": 53.2835, + "eval_samples_per_second": 56.303, + "eval_steps_per_second": 1.764, + "step": 17700 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019149260664448796, + "loss": 1.2652, + "step": 17720 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019148186634230957, + "loss": 1.3316, + "step": 17740 + }, + { + "epoch": 0.95, + "eval_loss": 1.3947995901107788, + "eval_runtime": 53.3034, + "eval_samples_per_second": 56.282, + "eval_steps_per_second": 1.763, + "step": 17750 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019147112604013116, + "loss": 1.3615, + "step": 17760 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019146038573795274, + "loss": 1.3342, + "step": 17780 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019144964543577436, + "loss": 1.3409, + "step": 17800 + }, + { + "epoch": 0.95, + "eval_loss": 1.3946611881256104, + "eval_runtime": 53.2382, + "eval_samples_per_second": 56.351, + "eval_steps_per_second": 1.766, + "step": 17800 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019143890513359594, + "loss": 1.3304, + "step": 17820 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019142816483141756, + "loss": 1.3312, + "step": 17840 + }, + { + "epoch": 0.95, + "eval_loss": 1.3946311473846436, + "eval_runtime": 53.3606, + "eval_samples_per_second": 56.221, + "eval_steps_per_second": 1.762, + "step": 17850 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019141742452923914, + "loss": 1.269, + "step": 17860 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019140668422706073, + "loss": 1.3386, + "step": 17880 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019139594392488234, + "loss": 1.3227, + "step": 17900 + }, + { + "epoch": 0.96, + "eval_loss": 1.3940836191177368, + "eval_runtime": 53.3243, + "eval_samples_per_second": 56.26, + "eval_steps_per_second": 1.763, + "step": 17900 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019138520362270393, + "loss": 1.3971, + "step": 17920 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019137446332052554, + "loss": 1.3472, + "step": 17940 + }, + { + "epoch": 0.96, + "eval_loss": 1.395328402519226, + "eval_runtime": 53.2405, + "eval_samples_per_second": 56.348, + "eval_steps_per_second": 1.766, + "step": 17950 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019136372301834713, + "loss": 1.3437, + "step": 17960 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019135298271616871, + "loss": 1.3322, + "step": 17980 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019134224241399033, + "loss": 1.3706, + "step": 18000 + }, + { + "epoch": 0.96, + "eval_loss": 1.3958821296691895, + "eval_runtime": 53.3164, + "eval_samples_per_second": 56.268, + "eval_steps_per_second": 1.763, + "step": 18000 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019133150211181191, + "loss": 1.3879, + "step": 18020 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019132076180963353, + "loss": 1.3791, + "step": 18040 + }, + { + "epoch": 0.96, + "eval_loss": 1.3955086469650269, + "eval_runtime": 53.3188, + "eval_samples_per_second": 56.265, + "eval_steps_per_second": 1.763, + "step": 18050 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019131002150745511, + "loss": 1.3205, + "step": 18060 + }, + { + "epoch": 0.97, + "learning_rate": 0.0001912992812052767, + "loss": 1.3406, + "step": 18080 + }, + { + "epoch": 0.97, + "learning_rate": 0.00019128854090309831, + "loss": 1.3155, + "step": 18100 + }, + { + "epoch": 0.97, + "eval_loss": 1.3944337368011475, + "eval_runtime": 53.2487, + "eval_samples_per_second": 56.339, + "eval_steps_per_second": 1.765, + "step": 18100 + }, + { + "epoch": 0.97, + "learning_rate": 0.0001912778006009199, + "loss": 1.3242, + "step": 18120 + }, + { + "epoch": 0.97, + "learning_rate": 0.00019126706029874151, + "loss": 1.3449, + "step": 18140 + }, + { + "epoch": 0.97, + "eval_loss": 1.3948968648910522, + "eval_runtime": 53.2872, + "eval_samples_per_second": 56.299, + "eval_steps_per_second": 1.764, + "step": 18150 + }, + { + "epoch": 0.97, + "learning_rate": 0.0001912563199965631, + "loss": 1.3425, + "step": 18160 + }, + { + "epoch": 0.97, + "learning_rate": 0.00019124557969438471, + "loss": 1.3077, + "step": 18180 + }, + { + "epoch": 0.97, + "learning_rate": 0.0001912348393922063, + "loss": 1.363, + "step": 18200 + }, + { + "epoch": 0.97, + "eval_loss": 1.3944798707962036, + "eval_runtime": 53.2867, + "eval_samples_per_second": 56.299, + "eval_steps_per_second": 1.764, + "step": 18200 + }, + { + "epoch": 0.97, + "learning_rate": 0.0001912240990900279, + "loss": 1.3584, + "step": 18220 + }, + { + "epoch": 0.97, + "learning_rate": 0.0001912133587878495, + "loss": 1.3348, + "step": 18240 + }, + { + "epoch": 0.98, + "eval_loss": 1.3941653966903687, + "eval_runtime": 53.3131, + "eval_samples_per_second": 56.271, + "eval_steps_per_second": 1.763, + "step": 18250 + }, + { + "epoch": 0.98, + "learning_rate": 0.00019120261848567111, + "loss": 1.3377, + "step": 18260 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001911918781834927, + "loss": 1.3091, + "step": 18280 + }, + { + "epoch": 0.98, + "learning_rate": 0.00019118113788131431, + "loss": 1.3352, + "step": 18300 + }, + { + "epoch": 0.98, + "eval_loss": 1.3937102556228638, + "eval_runtime": 53.2965, + "eval_samples_per_second": 56.289, + "eval_steps_per_second": 1.764, + "step": 18300 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001911703975791359, + "loss": 1.3669, + "step": 18320 + }, + { + "epoch": 0.98, + "learning_rate": 0.00019115965727695751, + "loss": 1.3617, + "step": 18340 + }, + { + "epoch": 0.98, + "eval_loss": 1.3937897682189941, + "eval_runtime": 53.3594, + "eval_samples_per_second": 56.223, + "eval_steps_per_second": 1.762, + "step": 18350 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001911489169747791, + "loss": 1.3196, + "step": 18360 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001911381766726007, + "loss": 1.3334, + "step": 18380 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001911274363704223, + "loss": 1.3616, + "step": 18400 + }, + { + "epoch": 0.98, + "eval_loss": 1.3939082622528076, + "eval_runtime": 53.248, + "eval_samples_per_second": 56.34, + "eval_steps_per_second": 1.765, + "step": 18400 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001911166960682439, + "loss": 1.3552, + "step": 18420 + }, + { + "epoch": 0.99, + "learning_rate": 0.0001911059557660655, + "loss": 1.3178, + "step": 18440 + }, + { + "epoch": 0.99, + "eval_loss": 1.3946744203567505, + "eval_runtime": 53.3048, + "eval_samples_per_second": 56.28, + "eval_steps_per_second": 1.763, + "step": 18450 + }, + { + "epoch": 0.99, + "learning_rate": 0.0001910952154638871, + "loss": 1.3254, + "step": 18460 + }, + { + "epoch": 0.99, + "learning_rate": 0.00019108447516170867, + "loss": 1.3107, + "step": 18480 + }, + { + "epoch": 0.99, + "learning_rate": 0.0001910737348595303, + "loss": 1.3284, + "step": 18500 + }, + { + "epoch": 0.99, + "eval_loss": 1.3947073221206665, + "eval_runtime": 53.2779, + "eval_samples_per_second": 56.309, + "eval_steps_per_second": 1.764, + "step": 18500 + }, + { + "epoch": 0.99, + "learning_rate": 0.00019106299455735187, + "loss": 1.3478, + "step": 18520 + }, + { + "epoch": 0.99, + "learning_rate": 0.0001910522542551735, + "loss": 1.3324, + "step": 18540 + }, + { + "epoch": 0.99, + "eval_loss": 1.3934414386749268, + "eval_runtime": 53.3115, + "eval_samples_per_second": 56.273, + "eval_steps_per_second": 1.763, + "step": 18550 + }, + { + "epoch": 0.99, + "learning_rate": 0.00019104151395299507, + "loss": 1.3273, + "step": 18560 + }, + { + "epoch": 0.99, + "learning_rate": 0.0001910307736508167, + "loss": 1.3493, + "step": 18580 + }, + { + "epoch": 0.99, + "learning_rate": 0.00019102003334863827, + "loss": 1.3364, + "step": 18600 + }, + { + "epoch": 0.99, + "eval_loss": 1.3933128118515015, + "eval_runtime": 53.3449, + "eval_samples_per_second": 56.238, + "eval_steps_per_second": 1.762, + "step": 18600 + }, + { + "epoch": 0.99, + "learning_rate": 0.00019100929304645986, + "loss": 1.3448, + "step": 18620 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019099855274428147, + "loss": 1.3668, + "step": 18640 + }, + { + "epoch": 1.0, + "eval_loss": 1.3934011459350586, + "eval_runtime": 53.3012, + "eval_samples_per_second": 56.284, + "eval_steps_per_second": 1.764, + "step": 18650 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019098781244210306, + "loss": 1.3405, + "step": 18660 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019097707213992467, + "loss": 1.3624, + "step": 18680 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019096633183774626, + "loss": 1.335, + "step": 18700 + }, + { + "epoch": 1.0, + "eval_loss": 1.3939635753631592, + "eval_runtime": 53.2501, + "eval_samples_per_second": 56.338, + "eval_steps_per_second": 1.765, + "step": 18700 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019095559153556785, + "loss": 1.3386, + "step": 18720 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019094485123338946, + "loss": 1.3055, + "step": 18740 + }, + { + "epoch": 1.0, + "eval_loss": 1.3944073915481567, + "eval_runtime": 53.2526, + "eval_samples_per_second": 56.335, + "eval_steps_per_second": 1.765, + "step": 18750 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019093411093121105, + "loss": 1.3419, + "step": 18760 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019092337062903266, + "loss": 1.3359, + "step": 18780 + }, + { + "epoch": 1.0, + "learning_rate": 0.00019091263032685425, + "loss": 1.3629, + "step": 18800 + }, + { + "epoch": 1.0, + "eval_loss": 1.3944207429885864, + "eval_runtime": 53.2849, + "eval_samples_per_second": 56.301, + "eval_steps_per_second": 1.764, + "step": 18800 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019090189002467583, + "loss": 1.3263, + "step": 18820 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019089114972249745, + "loss": 1.3364, + "step": 18840 + }, + { + "epoch": 1.01, + "eval_loss": 1.394610047340393, + "eval_runtime": 53.2573, + "eval_samples_per_second": 56.33, + "eval_steps_per_second": 1.765, + "step": 18850 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019088040942031903, + "loss": 1.3744, + "step": 18860 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019086966911814065, + "loss": 1.343, + "step": 18880 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019085892881596226, + "loss": 1.3077, + "step": 18900 + }, + { + "epoch": 1.01, + "eval_loss": 1.393605351448059, + "eval_runtime": 53.3174, + "eval_samples_per_second": 56.267, + "eval_steps_per_second": 1.763, + "step": 18900 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019084818851378385, + "loss": 1.3005, + "step": 18920 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019083744821160546, + "loss": 1.3271, + "step": 18940 + }, + { + "epoch": 1.01, + "eval_loss": 1.3948262929916382, + "eval_runtime": 53.304, + "eval_samples_per_second": 56.281, + "eval_steps_per_second": 1.763, + "step": 18950 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019082670790942705, + "loss": 1.3305, + "step": 18960 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019081596760724863, + "loss": 1.3139, + "step": 18980 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019080522730507025, + "loss": 1.3215, + "step": 19000 + }, + { + "epoch": 1.02, + "eval_loss": 1.3937315940856934, + "eval_runtime": 53.2895, + "eval_samples_per_second": 56.296, + "eval_steps_per_second": 1.764, + "step": 19000 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019079448700289183, + "loss": 1.3198, + "step": 19020 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019078374670071345, + "loss": 1.3102, + "step": 19040 + }, + { + "epoch": 1.02, + "eval_loss": 1.3946858644485474, + "eval_runtime": 53.307, + "eval_samples_per_second": 56.278, + "eval_steps_per_second": 1.763, + "step": 19050 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019077300639853503, + "loss": 1.3445, + "step": 19060 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019076226609635665, + "loss": 1.3052, + "step": 19080 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019075152579417823, + "loss": 1.3661, + "step": 19100 + }, + { + "epoch": 1.02, + "eval_loss": 1.3941792249679565, + "eval_runtime": 53.2371, + "eval_samples_per_second": 56.352, + "eval_steps_per_second": 1.766, + "step": 19100 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019074078549199982, + "loss": 1.3194, + "step": 19120 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019073004518982143, + "loss": 1.3014, + "step": 19140 + }, + { + "epoch": 1.02, + "eval_loss": 1.3942538499832153, + "eval_runtime": 53.3452, + "eval_samples_per_second": 56.237, + "eval_steps_per_second": 1.762, + "step": 19150 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019071930488764302, + "loss": 1.3853, + "step": 19160 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019070856458546463, + "loss": 1.3465, + "step": 19180 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019069782428328622, + "loss": 1.3253, + "step": 19200 + }, + { + "epoch": 1.03, + "eval_loss": 1.395198106765747, + "eval_runtime": 53.2693, + "eval_samples_per_second": 56.318, + "eval_steps_per_second": 1.765, + "step": 19200 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001906870839811078, + "loss": 1.3793, + "step": 19220 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019067688069403834, + "loss": 1.3506, + "step": 19240 + }, + { + "epoch": 1.03, + "eval_loss": 1.3944871425628662, + "eval_runtime": 53.3343, + "eval_samples_per_second": 56.249, + "eval_steps_per_second": 1.762, + "step": 19250 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019066614039185993, + "loss": 1.3374, + "step": 19260 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019065540008968151, + "loss": 1.309, + "step": 19280 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019064465978750313, + "loss": 1.3585, + "step": 19300 + }, + { + "epoch": 1.03, + "eval_loss": 1.395226240158081, + "eval_runtime": 53.2928, + "eval_samples_per_second": 56.293, + "eval_steps_per_second": 1.764, + "step": 19300 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019063391948532471, + "loss": 1.3599, + "step": 19320 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019062317918314633, + "loss": 1.3141, + "step": 19340 + }, + { + "epoch": 1.03, + "eval_loss": 1.3945953845977783, + "eval_runtime": 53.2692, + "eval_samples_per_second": 56.318, + "eval_steps_per_second": 1.765, + "step": 19350 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019061243888096794, + "loss": 1.3673, + "step": 19360 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019060169857878953, + "loss": 1.338, + "step": 19380 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019059095827661114, + "loss": 1.3623, + "step": 19400 + }, + { + "epoch": 1.04, + "eval_loss": 1.3945705890655518, + "eval_runtime": 53.2984, + "eval_samples_per_second": 56.287, + "eval_steps_per_second": 1.764, + "step": 19400 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019058021797443273, + "loss": 1.3656, + "step": 19420 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019056947767225431, + "loss": 1.3009, + "step": 19440 + }, + { + "epoch": 1.04, + "eval_loss": 1.3948311805725098, + "eval_runtime": 53.2755, + "eval_samples_per_second": 56.311, + "eval_steps_per_second": 1.764, + "step": 19450 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019055873737007593, + "loss": 1.3065, + "step": 19460 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019054799706789751, + "loss": 1.3115, + "step": 19480 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019053725676571913, + "loss": 1.3531, + "step": 19500 + }, + { + "epoch": 1.04, + "eval_loss": 1.3938305377960205, + "eval_runtime": 53.2725, + "eval_samples_per_second": 56.314, + "eval_steps_per_second": 1.765, + "step": 19500 + }, + { + "epoch": 1.04, + "learning_rate": 0.00019052651646354071, + "loss": 1.3467, + "step": 19520 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001905157761613623, + "loss": 1.3106, + "step": 19540 + }, + { + "epoch": 1.04, + "eval_loss": 1.3940069675445557, + "eval_runtime": 53.2839, + "eval_samples_per_second": 56.302, + "eval_steps_per_second": 1.764, + "step": 19550 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019050503585918391, + "loss": 1.345, + "step": 19560 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001904942955570055, + "loss": 1.3822, + "step": 19580 + }, + { + "epoch": 1.05, + "learning_rate": 0.00019048355525482711, + "loss": 1.3011, + "step": 19600 + }, + { + "epoch": 1.05, + "eval_loss": 1.3947548866271973, + "eval_runtime": 53.327, + "eval_samples_per_second": 56.257, + "eval_steps_per_second": 1.763, + "step": 19600 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001904728149526487, + "loss": 1.3236, + "step": 19620 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001904620746504703, + "loss": 1.3328, + "step": 19640 + }, + { + "epoch": 1.05, + "eval_loss": 1.3952617645263672, + "eval_runtime": 53.2948, + "eval_samples_per_second": 56.291, + "eval_steps_per_second": 1.764, + "step": 19650 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001904513343482919, + "loss": 1.3472, + "step": 19660 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001904405940461135, + "loss": 1.3257, + "step": 19680 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001904298537439351, + "loss": 1.3337, + "step": 19700 + }, + { + "epoch": 1.05, + "eval_loss": 1.3953675031661987, + "eval_runtime": 53.266, + "eval_samples_per_second": 56.321, + "eval_steps_per_second": 1.765, + "step": 19700 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001904191134417567, + "loss": 1.3463, + "step": 19720 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001904083731395783, + "loss": 1.3198, + "step": 19740 + }, + { + "epoch": 1.06, + "eval_loss": 1.3945834636688232, + "eval_runtime": 53.313, + "eval_samples_per_second": 56.271, + "eval_steps_per_second": 1.763, + "step": 19750 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001903976328373999, + "loss": 1.3042, + "step": 19760 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019038689253522147, + "loss": 1.3546, + "step": 19780 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001903761522330431, + "loss": 1.3307, + "step": 19800 + }, + { + "epoch": 1.06, + "eval_loss": 1.3953876495361328, + "eval_runtime": 53.3066, + "eval_samples_per_second": 56.278, + "eval_steps_per_second": 1.763, + "step": 19800 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019036541193086467, + "loss": 1.3741, + "step": 19820 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001903546716286863, + "loss": 1.3176, + "step": 19840 + }, + { + "epoch": 1.06, + "eval_loss": 1.3955553770065308, + "eval_runtime": 53.2937, + "eval_samples_per_second": 56.292, + "eval_steps_per_second": 1.764, + "step": 19850 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019034393132650787, + "loss": 1.3586, + "step": 19860 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019033319102432946, + "loss": 1.3176, + "step": 19880 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019032245072215107, + "loss": 1.3215, + "step": 19900 + }, + { + "epoch": 1.06, + "eval_loss": 1.3952226638793945, + "eval_runtime": 53.2682, + "eval_samples_per_second": 56.319, + "eval_steps_per_second": 1.765, + "step": 19900 + }, + { + "epoch": 1.06, + "learning_rate": 0.00019031171041997266, + "loss": 1.3255, + "step": 19920 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019030097011779427, + "loss": 1.3072, + "step": 19940 + }, + { + "epoch": 1.07, + "eval_loss": 1.3949054479599, + "eval_runtime": 53.2964, + "eval_samples_per_second": 56.289, + "eval_steps_per_second": 1.764, + "step": 19950 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019029022981561586, + "loss": 1.3479, + "step": 19960 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019027948951343745, + "loss": 1.317, + "step": 19980 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001902687492112591, + "loss": 1.3354, + "step": 20000 + }, + { + "epoch": 1.07, + "eval_loss": 1.39454185962677, + "eval_runtime": 53.2971, + "eval_samples_per_second": 56.288, + "eval_steps_per_second": 1.764, + "step": 20000 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019025800890908067, + "loss": 1.3244, + "step": 20020 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019024726860690226, + "loss": 1.3621, + "step": 20040 + }, + { + "epoch": 1.07, + "eval_loss": 1.3944255113601685, + "eval_runtime": 53.3682, + "eval_samples_per_second": 56.213, + "eval_steps_per_second": 1.761, + "step": 20050 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019023652830472387, + "loss": 1.3042, + "step": 20060 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019022578800254546, + "loss": 1.2605, + "step": 20080 + }, + { + "epoch": 1.07, + "learning_rate": 0.00019021504770036707, + "loss": 1.3017, + "step": 20100 + }, + { + "epoch": 1.07, + "eval_loss": 1.395467758178711, + "eval_runtime": 53.2868, + "eval_samples_per_second": 56.299, + "eval_steps_per_second": 1.764, + "step": 20100 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019020430739818866, + "loss": 1.3345, + "step": 20120 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019019356709601027, + "loss": 1.3217, + "step": 20140 + }, + { + "epoch": 1.08, + "eval_loss": 1.3947306871414185, + "eval_runtime": 53.3338, + "eval_samples_per_second": 56.25, + "eval_steps_per_second": 1.762, + "step": 20150 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019018282679383186, + "loss": 1.3015, + "step": 20160 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019017208649165345, + "loss": 1.3573, + "step": 20180 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019016134618947506, + "loss": 1.3239, + "step": 20200 + }, + { + "epoch": 1.08, + "eval_loss": 1.394304633140564, + "eval_runtime": 53.3159, + "eval_samples_per_second": 56.268, + "eval_steps_per_second": 1.763, + "step": 20200 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019015060588729665, + "loss": 1.3383, + "step": 20220 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019013986558511826, + "loss": 1.3567, + "step": 20240 + }, + { + "epoch": 1.08, + "eval_loss": 1.3949447870254517, + "eval_runtime": 53.286, + "eval_samples_per_second": 56.3, + "eval_steps_per_second": 1.764, + "step": 20250 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019012912528293985, + "loss": 1.3843, + "step": 20260 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019011838498076143, + "loss": 1.3357, + "step": 20280 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019010764467858305, + "loss": 1.3138, + "step": 20300 + }, + { + "epoch": 1.08, + "eval_loss": 1.3950196504592896, + "eval_runtime": 53.2602, + "eval_samples_per_second": 56.327, + "eval_steps_per_second": 1.765, + "step": 20300 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019009690437640463, + "loss": 1.2994, + "step": 20320 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019008616407422625, + "loss": 1.3119, + "step": 20340 + }, + { + "epoch": 1.09, + "eval_loss": 1.3942936658859253, + "eval_runtime": 53.3464, + "eval_samples_per_second": 56.236, + "eval_steps_per_second": 1.762, + "step": 20350 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019007542377204783, + "loss": 1.3429, + "step": 20360 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019006468346986942, + "loss": 1.3365, + "step": 20380 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019005394316769103, + "loss": 1.3155, + "step": 20400 + }, + { + "epoch": 1.09, + "eval_loss": 1.3946025371551514, + "eval_runtime": 53.3252, + "eval_samples_per_second": 56.259, + "eval_steps_per_second": 1.763, + "step": 20400 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019004320286551262, + "loss": 1.315, + "step": 20420 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019003246256333423, + "loss": 1.3131, + "step": 20440 + }, + { + "epoch": 1.09, + "eval_loss": 1.394667625427246, + "eval_runtime": 53.3505, + "eval_samples_per_second": 56.232, + "eval_steps_per_second": 1.762, + "step": 20450 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019002172226115582, + "loss": 1.3526, + "step": 20460 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001900109819589774, + "loss": 1.3267, + "step": 20480 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019000024165679902, + "loss": 1.3102, + "step": 20500 + }, + { + "epoch": 1.1, + "eval_loss": 1.394671082496643, + "eval_runtime": 53.3613, + "eval_samples_per_second": 56.22, + "eval_steps_per_second": 1.762, + "step": 20500 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001899895013546206, + "loss": 1.3495, + "step": 20520 + }, + { + "epoch": 1.1, + "learning_rate": 0.00018997876105244222, + "loss": 1.3449, + "step": 20540 + }, + { + "epoch": 1.1, + "eval_loss": 1.39447820186615, + "eval_runtime": 53.3328, + "eval_samples_per_second": 56.251, + "eval_steps_per_second": 1.763, + "step": 20550 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001899680207502638, + "loss": 1.3332, + "step": 20560 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001899572804480854, + "loss": 1.3432, + "step": 20580 + }, + { + "epoch": 1.1, + "learning_rate": 0.000189946540145907, + "loss": 1.312, + "step": 20600 + }, + { + "epoch": 1.1, + "eval_loss": 1.394857406616211, + "eval_runtime": 53.318, + "eval_samples_per_second": 56.266, + "eval_steps_per_second": 1.763, + "step": 20600 + }, + { + "epoch": 1.1, + "learning_rate": 0.00018993633685883754, + "loss": 1.3569, + "step": 20620 + }, + { + "epoch": 1.1, + "learning_rate": 0.00018992559655665913, + "loss": 1.2998, + "step": 20640 + }, + { + "epoch": 1.1, + "eval_loss": 1.3954495191574097, + "eval_runtime": 53.3831, + "eval_samples_per_second": 56.198, + "eval_steps_per_second": 1.761, + "step": 20650 + }, + { + "epoch": 1.1, + "learning_rate": 0.00018991485625448074, + "loss": 1.3425, + "step": 20660 + }, + { + "epoch": 1.1, + "learning_rate": 0.00018990411595230233, + "loss": 1.3333, + "step": 20680 + }, + { + "epoch": 1.11, + "learning_rate": 0.00018989337565012391, + "loss": 1.3849, + "step": 20700 + }, + { + "epoch": 1.11, + "eval_loss": 1.3940949440002441, + "eval_runtime": 53.2672, + "eval_samples_per_second": 56.32, + "eval_steps_per_second": 1.765, + "step": 20700 + }, + { + "epoch": 1.11, + "learning_rate": 0.00018988263534794553, + "loss": 1.2803, + "step": 20720 + }, + { + "epoch": 1.11, + "learning_rate": 0.00018987189504576711, + "loss": 1.3092, + "step": 20740 + }, + { + "epoch": 1.11, + "eval_loss": 1.3946436643600464, + "eval_runtime": 53.3576, + "eval_samples_per_second": 56.224, + "eval_steps_per_second": 1.762, + "step": 20750 + }, + { + "epoch": 1.11, + "learning_rate": 0.00018986115474358873, + "loss": 1.3563, + "step": 20760 + }, + { + "epoch": 1.11, + "learning_rate": 0.00018985041444141031, + "loss": 1.2947, + "step": 20780 + }, + { + "epoch": 1.11, + "learning_rate": 0.00018983967413923193, + "loss": 1.3516, + "step": 20800 + }, + { + "epoch": 1.11, + "eval_loss": 1.3943217992782593, + "eval_runtime": 53.3099, + "eval_samples_per_second": 56.275, + "eval_steps_per_second": 1.763, + "step": 20800 + }, + { + "epoch": 1.11, + "learning_rate": 0.00018982893383705351, + "loss": 1.3147, + "step": 20820 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001898181935348751, + "loss": 1.3344, + "step": 20840 + }, + { + "epoch": 1.11, + "eval_loss": 1.3943856954574585, + "eval_runtime": 49.9339, + "eval_samples_per_second": 60.079, + "eval_steps_per_second": 1.882, + "step": 20850 + }, + { + "epoch": 1.11, + "learning_rate": 0.00018980745323269671, + "loss": 1.3453, + "step": 20860 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001897967129305183, + "loss": 1.2903, + "step": 20880 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018978597262833991, + "loss": 1.3583, + "step": 20900 + }, + { + "epoch": 1.12, + "eval_loss": 1.3942972421646118, + "eval_runtime": 49.8709, + "eval_samples_per_second": 60.155, + "eval_steps_per_second": 1.885, + "step": 20900 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001897752323261615, + "loss": 1.312, + "step": 20920 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001897644920239831, + "loss": 1.3049, + "step": 20940 + }, + { + "epoch": 1.12, + "eval_loss": 1.393723964691162, + "eval_runtime": 49.9513, + "eval_samples_per_second": 60.058, + "eval_steps_per_second": 1.882, + "step": 20950 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001897537517218047, + "loss": 1.3463, + "step": 20960 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001897430114196263, + "loss": 1.3335, + "step": 20980 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001897322711174479, + "loss": 1.332, + "step": 21000 + }, + { + "epoch": 1.12, + "eval_loss": 1.3948601484298706, + "eval_runtime": 49.8828, + "eval_samples_per_second": 60.141, + "eval_steps_per_second": 1.884, + "step": 21000 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001897215308152695, + "loss": 1.3074, + "step": 21020 + }, + { + "epoch": 1.12, + "learning_rate": 0.00018971079051309107, + "loss": 1.3419, + "step": 21040 + }, + { + "epoch": 1.12, + "eval_loss": 1.3940149545669556, + "eval_runtime": 49.9346, + "eval_samples_per_second": 60.079, + "eval_steps_per_second": 1.882, + "step": 21050 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001897000502109127, + "loss": 1.3513, + "step": 21060 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018968930990873427, + "loss": 1.3396, + "step": 21080 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001896785696065559, + "loss": 1.3321, + "step": 21100 + }, + { + "epoch": 1.13, + "eval_loss": 1.3944331407546997, + "eval_runtime": 49.9103, + "eval_samples_per_second": 60.108, + "eval_steps_per_second": 1.883, + "step": 21100 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001896678293043775, + "loss": 1.3044, + "step": 21120 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001896570890021991, + "loss": 1.3447, + "step": 21140 + }, + { + "epoch": 1.13, + "eval_loss": 1.394074559211731, + "eval_runtime": 49.931, + "eval_samples_per_second": 60.083, + "eval_steps_per_second": 1.883, + "step": 21150 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001896463487000207, + "loss": 1.3281, + "step": 21160 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001896356083978423, + "loss": 1.3708, + "step": 21180 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001896248680956639, + "loss": 1.329, + "step": 21200 + }, + { + "epoch": 1.13, + "eval_loss": 1.3945494890213013, + "eval_runtime": 49.8915, + "eval_samples_per_second": 60.13, + "eval_steps_per_second": 1.884, + "step": 21200 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001896141277934855, + "loss": 1.3313, + "step": 21220 + }, + { + "epoch": 1.13, + "learning_rate": 0.00018960338749130707, + "loss": 1.3064, + "step": 21240 + }, + { + "epoch": 1.14, + "eval_loss": 1.3943963050842285, + "eval_runtime": 49.9422, + "eval_samples_per_second": 60.069, + "eval_steps_per_second": 1.882, + "step": 21250 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001895926471891287, + "loss": 1.3677, + "step": 21260 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018958190688695027, + "loss": 1.3212, + "step": 21280 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001895711665847719, + "loss": 1.321, + "step": 21300 + }, + { + "epoch": 1.14, + "eval_loss": 1.3941173553466797, + "eval_runtime": 49.8756, + "eval_samples_per_second": 60.15, + "eval_steps_per_second": 1.885, + "step": 21300 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018956042628259347, + "loss": 1.3148, + "step": 21320 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018954968598041506, + "loss": 1.3394, + "step": 21340 + }, + { + "epoch": 1.14, + "eval_loss": 1.3952860832214355, + "eval_runtime": 49.9435, + "eval_samples_per_second": 60.068, + "eval_steps_per_second": 1.882, + "step": 21350 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018953894567823667, + "loss": 1.3219, + "step": 21360 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018952820537605826, + "loss": 1.3146, + "step": 21380 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018951746507387987, + "loss": 1.3589, + "step": 21400 + }, + { + "epoch": 1.14, + "eval_loss": 1.394696593284607, + "eval_runtime": 49.9086, + "eval_samples_per_second": 60.11, + "eval_steps_per_second": 1.883, + "step": 21400 + }, + { + "epoch": 1.14, + "learning_rate": 0.00018950672477170146, + "loss": 1.3228, + "step": 21420 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018949598446952305, + "loss": 1.2984, + "step": 21440 + }, + { + "epoch": 1.15, + "eval_loss": 1.395227074623108, + "eval_runtime": 49.9256, + "eval_samples_per_second": 60.089, + "eval_steps_per_second": 1.883, + "step": 21450 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018948524416734466, + "loss": 1.3303, + "step": 21460 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018947450386516625, + "loss": 1.339, + "step": 21480 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018946376356298786, + "loss": 1.3297, + "step": 21500 + }, + { + "epoch": 1.15, + "eval_loss": 1.3939003944396973, + "eval_runtime": 49.8908, + "eval_samples_per_second": 60.131, + "eval_steps_per_second": 1.884, + "step": 21500 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018945302326080945, + "loss": 1.308, + "step": 21520 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018944228295863103, + "loss": 1.3539, + "step": 21540 + }, + { + "epoch": 1.15, + "eval_loss": 1.3942874670028687, + "eval_runtime": 49.9923, + "eval_samples_per_second": 60.009, + "eval_steps_per_second": 1.88, + "step": 21550 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018943154265645265, + "loss": 1.3464, + "step": 21560 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018942080235427423, + "loss": 1.3452, + "step": 21580 + }, + { + "epoch": 1.15, + "learning_rate": 0.00018941006205209585, + "loss": 1.3701, + "step": 21600 + }, + { + "epoch": 1.15, + "eval_loss": 1.3938068151474, + "eval_runtime": 49.9117, + "eval_samples_per_second": 60.106, + "eval_steps_per_second": 1.883, + "step": 21600 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018939932174991743, + "loss": 1.3529, + "step": 21620 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018938858144773902, + "loss": 1.3452, + "step": 21640 + }, + { + "epoch": 1.16, + "eval_loss": 1.3937488794326782, + "eval_runtime": 49.9344, + "eval_samples_per_second": 60.079, + "eval_steps_per_second": 1.882, + "step": 21650 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018937784114556063, + "loss": 1.3243, + "step": 21660 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018936710084338222, + "loss": 1.3161, + "step": 21680 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018935636054120383, + "loss": 1.3178, + "step": 21700 + }, + { + "epoch": 1.16, + "eval_loss": 1.3939458131790161, + "eval_runtime": 49.8961, + "eval_samples_per_second": 60.125, + "eval_steps_per_second": 1.884, + "step": 21700 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018934562023902542, + "loss": 1.3084, + "step": 21720 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018933487993684703, + "loss": 1.3273, + "step": 21740 + }, + { + "epoch": 1.16, + "eval_loss": 1.394255518913269, + "eval_runtime": 49.8969, + "eval_samples_per_second": 60.124, + "eval_steps_per_second": 1.884, + "step": 21750 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018932413963466865, + "loss": 1.299, + "step": 21760 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018931339933249023, + "loss": 1.3206, + "step": 21780 + }, + { + "epoch": 1.16, + "learning_rate": 0.00018930265903031185, + "loss": 1.3403, + "step": 21800 + }, + { + "epoch": 1.16, + "eval_loss": 1.393869161605835, + "eval_runtime": 49.8802, + "eval_samples_per_second": 60.144, + "eval_steps_per_second": 1.885, + "step": 21800 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018929191872813343, + "loss": 1.3328, + "step": 21820 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018928117842595502, + "loss": 1.2996, + "step": 21840 + }, + { + "epoch": 1.17, + "eval_loss": 1.3937753438949585, + "eval_runtime": 49.9775, + "eval_samples_per_second": 60.027, + "eval_steps_per_second": 1.881, + "step": 21850 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018927043812377663, + "loss": 1.3158, + "step": 21860 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018925969782159822, + "loss": 1.34, + "step": 21880 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018924895751941983, + "loss": 1.315, + "step": 21900 + }, + { + "epoch": 1.17, + "eval_loss": 1.393856406211853, + "eval_runtime": 49.8879, + "eval_samples_per_second": 60.135, + "eval_steps_per_second": 1.884, + "step": 21900 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018923821721724142, + "loss": 1.3036, + "step": 21920 + }, + { + "epoch": 1.17, + "learning_rate": 0.000189227476915063, + "loss": 1.324, + "step": 21940 + }, + { + "epoch": 1.17, + "eval_loss": 1.3935225009918213, + "eval_runtime": 49.9272, + "eval_samples_per_second": 60.088, + "eval_steps_per_second": 1.883, + "step": 21950 + }, + { + "epoch": 1.17, + "learning_rate": 0.00018921673661288462, + "loss": 1.2792, + "step": 21960 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001892059963107062, + "loss": 1.285, + "step": 21980 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018919525600852782, + "loss": 1.3465, + "step": 22000 + }, + { + "epoch": 1.18, + "eval_loss": 1.3932766914367676, + "eval_runtime": 49.8918, + "eval_samples_per_second": 60.13, + "eval_steps_per_second": 1.884, + "step": 22000 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001891845157063494, + "loss": 1.3557, + "step": 22020 + }, + { + "epoch": 1.18, + "learning_rate": 0.000189173775404171, + "loss": 1.3295, + "step": 22040 + }, + { + "epoch": 1.18, + "eval_loss": 1.3936799764633179, + "eval_runtime": 49.9318, + "eval_samples_per_second": 60.082, + "eval_steps_per_second": 1.883, + "step": 22050 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001891630351019926, + "loss": 1.273, + "step": 22060 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001891522947998142, + "loss": 1.3366, + "step": 22080 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001891415544976358, + "loss": 1.3403, + "step": 22100 + }, + { + "epoch": 1.18, + "eval_loss": 1.3937078714370728, + "eval_runtime": 49.8853, + "eval_samples_per_second": 60.138, + "eval_steps_per_second": 1.884, + "step": 22100 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001891308141954574, + "loss": 1.3315, + "step": 22120 + }, + { + "epoch": 1.18, + "learning_rate": 0.00018912007389327898, + "loss": 1.2958, + "step": 22140 + }, + { + "epoch": 1.18, + "eval_loss": 1.3935654163360596, + "eval_runtime": 49.9374, + "eval_samples_per_second": 60.075, + "eval_steps_per_second": 1.882, + "step": 22150 + }, + { + "epoch": 1.18, + "learning_rate": 0.0001891093335911006, + "loss": 1.3404, + "step": 22160 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018909859328892218, + "loss": 1.3616, + "step": 22180 + }, + { + "epoch": 1.19, + "learning_rate": 0.0001890878529867438, + "loss": 1.346, + "step": 22200 + }, + { + "epoch": 1.19, + "eval_loss": 1.393470048904419, + "eval_runtime": 49.816, + "eval_samples_per_second": 60.222, + "eval_steps_per_second": 1.887, + "step": 22200 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018907711268456538, + "loss": 1.3741, + "step": 22220 + }, + { + "epoch": 1.19, + "learning_rate": 0.000189066372382387, + "loss": 1.3211, + "step": 22240 + }, + { + "epoch": 1.19, + "eval_loss": 1.393375277519226, + "eval_runtime": 49.9237, + "eval_samples_per_second": 60.092, + "eval_steps_per_second": 1.883, + "step": 22250 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018905563208020858, + "loss": 1.3711, + "step": 22260 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018904489177803017, + "loss": 1.3552, + "step": 22280 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018903415147585178, + "loss": 1.3305, + "step": 22300 + }, + { + "epoch": 1.19, + "eval_loss": 1.393662452697754, + "eval_runtime": 49.8797, + "eval_samples_per_second": 60.145, + "eval_steps_per_second": 1.885, + "step": 22300 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018902341117367337, + "loss": 1.3194, + "step": 22320 + }, + { + "epoch": 1.19, + "learning_rate": 0.00018901267087149498, + "loss": 1.3355, + "step": 22340 + }, + { + "epoch": 1.19, + "eval_loss": 1.3936777114868164, + "eval_runtime": 49.9571, + "eval_samples_per_second": 60.052, + "eval_steps_per_second": 1.882, + "step": 22350 + }, + { + "epoch": 1.19, + "learning_rate": 0.0001890019305693166, + "loss": 1.3547, + "step": 22360 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018899119026713818, + "loss": 1.3148, + "step": 22380 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001889804499649598, + "loss": 1.3456, + "step": 22400 + }, + { + "epoch": 1.2, + "eval_loss": 1.3941611051559448, + "eval_runtime": 49.9059, + "eval_samples_per_second": 60.113, + "eval_steps_per_second": 1.884, + "step": 22400 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018896970966278138, + "loss": 1.3419, + "step": 22420 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018895896936060297, + "loss": 1.3451, + "step": 22440 + }, + { + "epoch": 1.2, + "eval_loss": 1.394294261932373, + "eval_runtime": 49.9338, + "eval_samples_per_second": 60.079, + "eval_steps_per_second": 1.882, + "step": 22450 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018894822905842458, + "loss": 1.346, + "step": 22460 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018893748875624617, + "loss": 1.3597, + "step": 22480 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018892674845406778, + "loss": 1.3069, + "step": 22500 + }, + { + "epoch": 1.2, + "eval_loss": 1.3939677476882935, + "eval_runtime": 49.9063, + "eval_samples_per_second": 60.113, + "eval_steps_per_second": 1.884, + "step": 22500 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018891600815188937, + "loss": 1.3063, + "step": 22520 + }, + { + "epoch": 1.2, + "learning_rate": 0.00018890526784971095, + "loss": 1.3387, + "step": 22540 + }, + { + "epoch": 1.2, + "eval_loss": 1.3937537670135498, + "eval_runtime": 49.9009, + "eval_samples_per_second": 60.119, + "eval_steps_per_second": 1.884, + "step": 22550 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018889452754753257, + "loss": 1.3412, + "step": 22560 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018888378724535415, + "loss": 1.323, + "step": 22580 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018887304694317577, + "loss": 1.2893, + "step": 22600 + }, + { + "epoch": 1.21, + "eval_loss": 1.3940309286117554, + "eval_runtime": 49.8846, + "eval_samples_per_second": 60.139, + "eval_steps_per_second": 1.884, + "step": 22600 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018886230664099735, + "loss": 1.3087, + "step": 22620 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018885156633881897, + "loss": 1.3158, + "step": 22640 + }, + { + "epoch": 1.21, + "eval_loss": 1.393900752067566, + "eval_runtime": 49.9827, + "eval_samples_per_second": 60.021, + "eval_steps_per_second": 1.881, + "step": 22650 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018884082603664055, + "loss": 1.3096, + "step": 22660 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018883008573446214, + "loss": 1.2929, + "step": 22680 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018881934543228375, + "loss": 1.3141, + "step": 22700 + }, + { + "epoch": 1.21, + "eval_loss": 1.3940834999084473, + "eval_runtime": 49.8973, + "eval_samples_per_second": 60.123, + "eval_steps_per_second": 1.884, + "step": 22700 + }, + { + "epoch": 1.21, + "learning_rate": 0.00018880860513010534, + "loss": 1.3392, + "step": 22720 + }, + { + "epoch": 1.22, + "learning_rate": 0.00018879786482792695, + "loss": 1.2776, + "step": 22740 + }, + { + "epoch": 1.22, + "eval_loss": 1.3944599628448486, + "eval_runtime": 49.9134, + "eval_samples_per_second": 60.104, + "eval_steps_per_second": 1.883, + "step": 22750 + }, + { + "epoch": 1.22, + "learning_rate": 0.00018878712452574854, + "loss": 1.3268, + "step": 22760 + }, + { + "epoch": 1.22, + "learning_rate": 0.00018877638422357013, + "loss": 1.3086, + "step": 22780 + }, + { + "epoch": 1.22, + "learning_rate": 0.00018876564392139174, + "loss": 1.2932, + "step": 22800 + }, + { + "epoch": 1.22, + "eval_loss": 1.39402437210083, + "eval_runtime": 49.8649, + "eval_samples_per_second": 60.163, + "eval_steps_per_second": 1.885, + "step": 22800 + }, + { + "epoch": 1.22, + "learning_rate": 0.00018875490361921333, + "loss": 1.3709, + "step": 22820 + }, + { + "epoch": 1.22, + "learning_rate": 0.00018874416331703494, + "loss": 1.381, + "step": 22840 + }, + { + "epoch": 1.22, + "eval_loss": 1.3939707279205322, + "eval_runtime": 49.9399, + "eval_samples_per_second": 60.072, + "eval_steps_per_second": 1.882, + "step": 22850 + }, + { + "epoch": 1.22, + "learning_rate": 0.00018873342301485653, + "loss": 1.318, + "step": 22860 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001887226827126781, + "loss": 1.3515, + "step": 22880 + }, + { + "epoch": 1.22, + "learning_rate": 0.00018871194241049973, + "loss": 1.332, + "step": 22900 + }, + { + "epoch": 1.22, + "eval_loss": 1.3941855430603027, + "eval_runtime": 49.8977, + "eval_samples_per_second": 60.123, + "eval_steps_per_second": 1.884, + "step": 22900 + }, + { + "epoch": 1.22, + "learning_rate": 0.0001887012021083213, + "loss": 1.3107, + "step": 22920 + }, + { + "epoch": 1.23, + "learning_rate": 0.00018869046180614293, + "loss": 1.2985, + "step": 22940 + }, + { + "epoch": 1.23, + "eval_loss": 1.393490195274353, + "eval_runtime": 49.9127, + "eval_samples_per_second": 60.105, + "eval_steps_per_second": 1.883, + "step": 22950 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001886797215039645, + "loss": 1.3311, + "step": 22960 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001886689812017861, + "loss": 1.3189, + "step": 22980 + }, + { + "epoch": 1.23, + "learning_rate": 0.00018865824089960774, + "loss": 1.3045, + "step": 23000 + }, + { + "epoch": 1.23, + "eval_loss": 1.3936126232147217, + "eval_runtime": 49.8816, + "eval_samples_per_second": 60.142, + "eval_steps_per_second": 1.884, + "step": 23000 + }, + { + "epoch": 1.23, + "learning_rate": 0.00018864750059742933, + "loss": 1.3236, + "step": 23020 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001886367602952509, + "loss": 1.3055, + "step": 23040 + }, + { + "epoch": 1.23, + "eval_loss": 1.3945151567459106, + "eval_runtime": 49.9284, + "eval_samples_per_second": 60.086, + "eval_steps_per_second": 1.883, + "step": 23050 + }, + { + "epoch": 1.23, + "learning_rate": 0.00018862601999307253, + "loss": 1.3144, + "step": 23060 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001886152796908941, + "loss": 1.3563, + "step": 23080 + }, + { + "epoch": 1.23, + "learning_rate": 0.00018860453938871573, + "loss": 1.3262, + "step": 23100 + }, + { + "epoch": 1.23, + "eval_loss": 1.3944690227508545, + "eval_runtime": 49.917, + "eval_samples_per_second": 60.1, + "eval_steps_per_second": 1.883, + "step": 23100 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001885937990865373, + "loss": 1.2865, + "step": 23120 + }, + { + "epoch": 1.24, + "learning_rate": 0.00018858305878435893, + "loss": 1.358, + "step": 23140 + }, + { + "epoch": 1.24, + "eval_loss": 1.3941535949707031, + "eval_runtime": 49.9063, + "eval_samples_per_second": 60.113, + "eval_steps_per_second": 1.884, + "step": 23150 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001885723184821805, + "loss": 1.327, + "step": 23160 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001885615781800021, + "loss": 1.3815, + "step": 23180 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001885508378778237, + "loss": 1.3377, + "step": 23200 + }, + { + "epoch": 1.24, + "eval_loss": 1.3935812711715698, + "eval_runtime": 49.8915, + "eval_samples_per_second": 60.13, + "eval_steps_per_second": 1.884, + "step": 23200 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001885400975756453, + "loss": 1.3281, + "step": 23220 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001885293572734669, + "loss": 1.3178, + "step": 23240 + }, + { + "epoch": 1.24, + "eval_loss": 1.3938349485397339, + "eval_runtime": 49.9329, + "eval_samples_per_second": 60.081, + "eval_steps_per_second": 1.883, + "step": 23250 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001885186169712885, + "loss": 1.3363, + "step": 23260 + }, + { + "epoch": 1.24, + "learning_rate": 0.00018850787666911008, + "loss": 1.3189, + "step": 23280 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001884971363669317, + "loss": 1.3143, + "step": 23300 + }, + { + "epoch": 1.24, + "eval_loss": 1.3938268423080444, + "eval_runtime": 49.8546, + "eval_samples_per_second": 60.175, + "eval_steps_per_second": 1.885, + "step": 23300 + }, + { + "epoch": 1.25, + "learning_rate": 0.00018848639606475328, + "loss": 1.3378, + "step": 23320 + }, + { + "epoch": 1.25, + "learning_rate": 0.0001884756557625749, + "loss": 1.3521, + "step": 23340 + }, + { + "epoch": 1.25, + "eval_loss": 1.3935599327087402, + "eval_runtime": 49.8847, + "eval_samples_per_second": 60.139, + "eval_steps_per_second": 1.884, + "step": 23350 + }, + { + "epoch": 1.25, + "learning_rate": 0.00018846491546039648, + "loss": 1.331, + "step": 23360 + }, + { + "epoch": 1.25, + "learning_rate": 0.00018845417515821807, + "loss": 1.3347, + "step": 23380 + }, + { + "epoch": 1.25, + "learning_rate": 0.00018844343485603968, + "loss": 1.2857, + "step": 23400 + }, + { + "epoch": 1.25, + "eval_loss": 1.3932883739471436, + "eval_runtime": 49.905, + "eval_samples_per_second": 60.114, + "eval_steps_per_second": 1.884, + "step": 23400 + }, + { + "epoch": 1.25, + "learning_rate": 0.00018843269455386127, + "loss": 1.3413, + "step": 23420 + }, + { + "epoch": 1.25, + "learning_rate": 0.00018842195425168288, + "loss": 1.3461, + "step": 23440 + }, + { + "epoch": 1.25, + "eval_loss": 1.3934977054595947, + "eval_runtime": 49.912, + "eval_samples_per_second": 60.106, + "eval_steps_per_second": 1.883, + "step": 23450 + }, + { + "epoch": 1.25, + "learning_rate": 0.00018841121394950447, + "loss": 1.3177, + "step": 23460 + }, + { + "epoch": 1.25, + "learning_rate": 0.00018840047364732606, + "loss": 1.3664, + "step": 23480 + }, + { + "epoch": 1.26, + "learning_rate": 0.00018838973334514767, + "loss": 1.371, + "step": 23500 + }, + { + "epoch": 1.26, + "eval_loss": 1.3936991691589355, + "eval_runtime": 49.8492, + "eval_samples_per_second": 60.182, + "eval_steps_per_second": 1.886, + "step": 23500 + }, + { + "epoch": 1.26, + "learning_rate": 0.00018837899304296926, + "loss": 1.3165, + "step": 23520 + }, + { + "epoch": 1.26, + "learning_rate": 0.00018836825274079087, + "loss": 1.338, + "step": 23540 + }, + { + "epoch": 1.26, + "eval_loss": 1.3927234411239624, + "eval_runtime": 49.9193, + "eval_samples_per_second": 60.097, + "eval_steps_per_second": 1.883, + "step": 23550 + }, + { + "epoch": 1.26, + "learning_rate": 0.00018835751243861246, + "loss": 1.2839, + "step": 23560 + }, + { + "epoch": 1.26, + "learning_rate": 0.00018834677213643407, + "loss": 1.3623, + "step": 23580 + }, + { + "epoch": 1.26, + "learning_rate": 0.00018833603183425566, + "loss": 1.3299, + "step": 23600 + }, + { + "epoch": 1.26, + "eval_loss": 1.392892599105835, + "eval_runtime": 49.8594, + "eval_samples_per_second": 60.169, + "eval_steps_per_second": 1.885, + "step": 23600 + }, + { + "epoch": 1.26, + "learning_rate": 0.00018832529153207727, + "loss": 1.326, + "step": 23620 + }, + { + "epoch": 1.26, + "learning_rate": 0.00018831455122989888, + "loss": 1.3473, + "step": 23640 + }, + { + "epoch": 1.26, + "eval_loss": 1.392406940460205, + "eval_runtime": 49.9562, + "eval_samples_per_second": 60.053, + "eval_steps_per_second": 1.882, + "step": 23650 + }, + { + "epoch": 1.26, + "learning_rate": 0.00018830381092772047, + "loss": 1.341, + "step": 23660 + }, + { + "epoch": 1.27, + "learning_rate": 0.00018829307062554206, + "loss": 1.2912, + "step": 23680 + }, + { + "epoch": 1.27, + "learning_rate": 0.00018828233032336367, + "loss": 1.3121, + "step": 23700 + }, + { + "epoch": 1.27, + "eval_loss": 1.3924084901809692, + "eval_runtime": 49.8918, + "eval_samples_per_second": 60.13, + "eval_steps_per_second": 1.884, + "step": 23700 + }, + { + "epoch": 1.27, + "learning_rate": 0.00018827159002118526, + "loss": 1.3299, + "step": 23720 + }, + { + "epoch": 1.27, + "learning_rate": 0.00018826084971900687, + "loss": 1.3266, + "step": 23740 + }, + { + "epoch": 1.27, + "eval_loss": 1.3923590183258057, + "eval_runtime": 49.9073, + "eval_samples_per_second": 60.111, + "eval_steps_per_second": 1.883, + "step": 23750 + }, + { + "epoch": 1.27, + "learning_rate": 0.00018825010941682846, + "loss": 1.3247, + "step": 23760 + }, + { + "epoch": 1.27, + "learning_rate": 0.00018823936911465004, + "loss": 1.3357, + "step": 23780 + }, + { + "epoch": 1.27, + "learning_rate": 0.00018822862881247166, + "loss": 1.3269, + "step": 23800 + }, + { + "epoch": 1.27, + "eval_loss": 1.3921170234680176, + "eval_runtime": 49.8632, + "eval_samples_per_second": 60.165, + "eval_steps_per_second": 1.885, + "step": 23800 + }, + { + "epoch": 1.27, + "learning_rate": 0.00018821788851029324, + "loss": 1.2837, + "step": 23820 + }, + { + "epoch": 1.27, + "learning_rate": 0.00018820714820811486, + "loss": 1.3065, + "step": 23840 + }, + { + "epoch": 1.27, + "eval_loss": 1.3927005529403687, + "eval_runtime": 49.9104, + "eval_samples_per_second": 60.108, + "eval_steps_per_second": 1.883, + "step": 23850 + }, + { + "epoch": 1.27, + "learning_rate": 0.00018819640790593644, + "loss": 1.3199, + "step": 23860 + }, + { + "epoch": 1.28, + "learning_rate": 0.00018818566760375803, + "loss": 1.3366, + "step": 23880 + }, + { + "epoch": 1.28, + "learning_rate": 0.00018817492730157964, + "loss": 1.3241, + "step": 23900 + }, + { + "epoch": 1.28, + "eval_loss": 1.3931777477264404, + "eval_runtime": 49.8758, + "eval_samples_per_second": 60.149, + "eval_steps_per_second": 1.885, + "step": 23900 + }, + { + "epoch": 1.28, + "learning_rate": 0.00018816418699940123, + "loss": 1.2909, + "step": 23920 + }, + { + "epoch": 1.28, + "learning_rate": 0.00018815344669722284, + "loss": 1.3167, + "step": 23940 + }, + { + "epoch": 1.28, + "eval_loss": 1.3930566310882568, + "eval_runtime": 49.9278, + "eval_samples_per_second": 60.087, + "eval_steps_per_second": 1.883, + "step": 23950 + }, + { + "epoch": 1.28, + "learning_rate": 0.00018814270639504443, + "loss": 1.3626, + "step": 23960 + }, + { + "epoch": 1.28, + "learning_rate": 0.00018813196609286602, + "loss": 1.3775, + "step": 23980 + }, + { + "epoch": 1.28, + "learning_rate": 0.00018812122579068763, + "loss": 1.289, + "step": 24000 + }, + { + "epoch": 1.28, + "eval_loss": 1.3926464319229126, + "eval_runtime": 49.8874, + "eval_samples_per_second": 60.135, + "eval_steps_per_second": 1.884, + "step": 24000 + }, + { + "epoch": 1.28, + "learning_rate": 0.00018811048548850922, + "loss": 1.3247, + "step": 24020 + }, + { + "epoch": 1.28, + "learning_rate": 0.00018809974518633083, + "loss": 1.387, + "step": 24040 + }, + { + "epoch": 1.29, + "eval_loss": 1.3932690620422363, + "eval_runtime": 49.9239, + "eval_samples_per_second": 60.091, + "eval_steps_per_second": 1.883, + "step": 24050 + }, + { + "epoch": 1.29, + "learning_rate": 0.00018808900488415242, + "loss": 1.3109, + "step": 24060 + }, + { + "epoch": 1.29, + "learning_rate": 0.00018807826458197403, + "loss": 1.3242, + "step": 24080 + }, + { + "epoch": 1.29, + "learning_rate": 0.00018806752427979562, + "loss": 1.3412, + "step": 24100 + }, + { + "epoch": 1.29, + "eval_loss": 1.3931074142456055, + "eval_runtime": 49.8986, + "eval_samples_per_second": 60.122, + "eval_steps_per_second": 1.884, + "step": 24100 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001880567839776172, + "loss": 1.3798, + "step": 24120 + }, + { + "epoch": 1.29, + "learning_rate": 0.00018804604367543882, + "loss": 1.3796, + "step": 24140 + }, + { + "epoch": 1.29, + "eval_loss": 1.3929234743118286, + "eval_runtime": 49.9091, + "eval_samples_per_second": 60.109, + "eval_steps_per_second": 1.883, + "step": 24150 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001880353033732604, + "loss": 1.3133, + "step": 24160 + }, + { + "epoch": 1.29, + "learning_rate": 0.00018802456307108202, + "loss": 1.3307, + "step": 24180 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001880138227689036, + "loss": 1.3138, + "step": 24200 + }, + { + "epoch": 1.29, + "eval_loss": 1.3924319744110107, + "eval_runtime": 49.897, + "eval_samples_per_second": 60.124, + "eval_steps_per_second": 1.884, + "step": 24200 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001880030824667252, + "loss": 1.3307, + "step": 24220 + }, + { + "epoch": 1.3, + "learning_rate": 0.0001879923421645468, + "loss": 1.3521, + "step": 24240 + }, + { + "epoch": 1.3, + "eval_loss": 1.3926700353622437, + "eval_runtime": 49.9085, + "eval_samples_per_second": 60.11, + "eval_steps_per_second": 1.883, + "step": 24250 + }, + { + "epoch": 1.3, + "learning_rate": 0.00018798160186236842, + "loss": 1.3207, + "step": 24260 + }, + { + "epoch": 1.3, + "learning_rate": 0.00018797086156019, + "loss": 1.2797, + "step": 24280 + }, + { + "epoch": 1.3, + "learning_rate": 0.00018796012125801162, + "loss": 1.3234, + "step": 24300 + }, + { + "epoch": 1.3, + "eval_loss": 1.392791509628296, + "eval_runtime": 49.8988, + "eval_samples_per_second": 60.122, + "eval_steps_per_second": 1.884, + "step": 24300 + }, + { + "epoch": 1.3, + "learning_rate": 0.0001879493809558332, + "loss": 1.3436, + "step": 24320 + }, + { + "epoch": 1.3, + "learning_rate": 0.00018793864065365482, + "loss": 1.3685, + "step": 24340 + }, + { + "epoch": 1.3, + "eval_loss": 1.3929072618484497, + "eval_runtime": 49.9294, + "eval_samples_per_second": 60.085, + "eval_steps_per_second": 1.883, + "step": 24350 + }, + { + "epoch": 1.3, + "learning_rate": 0.0001879279003514764, + "loss": 1.3149, + "step": 24360 + }, + { + "epoch": 1.3, + "learning_rate": 0.000187917160049298, + "loss": 1.341, + "step": 24380 + }, + { + "epoch": 1.3, + "learning_rate": 0.0001879064197471196, + "loss": 1.3232, + "step": 24400 + }, + { + "epoch": 1.3, + "eval_loss": 1.3928802013397217, + "eval_runtime": 49.9403, + "eval_samples_per_second": 60.072, + "eval_steps_per_second": 1.882, + "step": 24400 + }, + { + "epoch": 1.3, + "learning_rate": 0.0001878956794449412, + "loss": 1.3352, + "step": 24420 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001878849391427628, + "loss": 1.3423, + "step": 24440 + }, + { + "epoch": 1.31, + "eval_loss": 1.3924449682235718, + "eval_runtime": 49.9373, + "eval_samples_per_second": 60.075, + "eval_steps_per_second": 1.882, + "step": 24450 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001878741988405844, + "loss": 1.3055, + "step": 24460 + }, + { + "epoch": 1.31, + "learning_rate": 0.000187863458538406, + "loss": 1.3349, + "step": 24480 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001878527182362276, + "loss": 1.3377, + "step": 24500 + }, + { + "epoch": 1.31, + "eval_loss": 1.3933476209640503, + "eval_runtime": 49.8969, + "eval_samples_per_second": 60.124, + "eval_steps_per_second": 1.884, + "step": 24500 + }, + { + "epoch": 1.31, + "learning_rate": 0.00018784197793404918, + "loss": 1.336, + "step": 24520 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001878312376318708, + "loss": 1.3474, + "step": 24540 + }, + { + "epoch": 1.31, + "eval_loss": 1.3931814432144165, + "eval_runtime": 49.9181, + "eval_samples_per_second": 60.098, + "eval_steps_per_second": 1.883, + "step": 24550 + }, + { + "epoch": 1.31, + "learning_rate": 0.00018782049732969238, + "loss": 1.3556, + "step": 24560 + }, + { + "epoch": 1.31, + "learning_rate": 0.000187809757027514, + "loss": 1.3238, + "step": 24580 + }, + { + "epoch": 1.31, + "learning_rate": 0.00018779901672533558, + "loss": 1.3364, + "step": 24600 + }, + { + "epoch": 1.31, + "eval_loss": 1.3931456804275513, + "eval_runtime": 49.9019, + "eval_samples_per_second": 60.118, + "eval_steps_per_second": 1.884, + "step": 24600 + }, + { + "epoch": 1.32, + "learning_rate": 0.00018778827642315716, + "loss": 1.2976, + "step": 24620 + }, + { + "epoch": 1.32, + "learning_rate": 0.00018777753612097878, + "loss": 1.3303, + "step": 24640 + }, + { + "epoch": 1.32, + "eval_loss": 1.3942646980285645, + "eval_runtime": 49.9029, + "eval_samples_per_second": 60.117, + "eval_steps_per_second": 1.884, + "step": 24650 + }, + { + "epoch": 1.32, + "learning_rate": 0.00018776679581880036, + "loss": 1.3133, + "step": 24660 + }, + { + "epoch": 1.32, + "learning_rate": 0.00018775605551662198, + "loss": 1.2781, + "step": 24680 + }, + { + "epoch": 1.32, + "learning_rate": 0.00018774531521444356, + "loss": 1.3457, + "step": 24700 + }, + { + "epoch": 1.32, + "eval_loss": 1.393941044807434, + "eval_runtime": 49.8674, + "eval_samples_per_second": 60.16, + "eval_steps_per_second": 1.885, + "step": 24700 + }, + { + "epoch": 1.32, + "learning_rate": 0.00018773457491226515, + "loss": 1.3208, + "step": 24720 + }, + { + "epoch": 1.32, + "learning_rate": 0.00018772383461008676, + "loss": 1.3013, + "step": 24740 + }, + { + "epoch": 1.32, + "eval_loss": 1.393032431602478, + "eval_runtime": 49.9451, + "eval_samples_per_second": 60.066, + "eval_steps_per_second": 1.882, + "step": 24750 + }, + { + "epoch": 1.32, + "learning_rate": 0.00018771309430790835, + "loss": 1.3752, + "step": 24760 + }, + { + "epoch": 1.32, + "learning_rate": 0.00018770235400572996, + "loss": 1.3651, + "step": 24780 + }, + { + "epoch": 1.33, + "learning_rate": 0.00018769161370355155, + "loss": 1.2917, + "step": 24800 + }, + { + "epoch": 1.33, + "eval_loss": 1.3928227424621582, + "eval_runtime": 49.9307, + "eval_samples_per_second": 60.083, + "eval_steps_per_second": 1.883, + "step": 24800 + }, + { + "epoch": 1.33, + "learning_rate": 0.00018768087340137314, + "loss": 1.3655, + "step": 24820 + }, + { + "epoch": 1.33, + "learning_rate": 0.00018767013309919475, + "loss": 1.3621, + "step": 24840 + }, + { + "epoch": 1.33, + "eval_loss": 1.3928583860397339, + "eval_runtime": 49.9119, + "eval_samples_per_second": 60.106, + "eval_steps_per_second": 1.883, + "step": 24850 + }, + { + "epoch": 1.33, + "learning_rate": 0.00018765939279701634, + "loss": 1.3057, + "step": 24860 + }, + { + "epoch": 1.33, + "learning_rate": 0.00018764865249483795, + "loss": 1.3112, + "step": 24880 + }, + { + "epoch": 1.33, + "learning_rate": 0.00018763791219265956, + "loss": 1.3373, + "step": 24900 + }, + { + "epoch": 1.33, + "eval_loss": 1.39380943775177, + "eval_runtime": 49.9554, + "eval_samples_per_second": 60.054, + "eval_steps_per_second": 1.882, + "step": 24900 + }, + { + "epoch": 1.33, + "learning_rate": 0.00018762717189048115, + "loss": 1.3207, + "step": 24920 + }, + { + "epoch": 1.33, + "learning_rate": 0.00018761643158830276, + "loss": 1.3272, + "step": 24940 + }, + { + "epoch": 1.33, + "eval_loss": 1.3933907747268677, + "eval_runtime": 49.9328, + "eval_samples_per_second": 60.081, + "eval_steps_per_second": 1.883, + "step": 24950 + }, + { + "epoch": 1.33, + "learning_rate": 0.00018760622830123327, + "loss": 1.3419, + "step": 24960 + }, + { + "epoch": 1.33, + "learning_rate": 0.00018759548799905486, + "loss": 1.3051, + "step": 24980 + }, + { + "epoch": 1.34, + "learning_rate": 0.00018758474769687647, + "loss": 1.2969, + "step": 25000 + }, + { + "epoch": 1.34, + "eval_loss": 1.3930931091308594, + "eval_runtime": 49.9142, + "eval_samples_per_second": 60.103, + "eval_steps_per_second": 1.883, + "step": 25000 + }, + { + "epoch": 1.34, + "learning_rate": 0.00018757400739469806, + "loss": 1.332, + "step": 25020 + }, + { + "epoch": 1.34, + "learning_rate": 0.00018756326709251964, + "loss": 1.3231, + "step": 25040 + }, + { + "epoch": 1.34, + "eval_loss": 1.3935271501541138, + "eval_runtime": 49.9445, + "eval_samples_per_second": 60.067, + "eval_steps_per_second": 1.882, + "step": 25050 + }, + { + "epoch": 1.34, + "learning_rate": 0.00018755252679034126, + "loss": 1.3348, + "step": 25060 + }, + { + "epoch": 1.34, + "learning_rate": 0.00018754178648816284, + "loss": 1.3147, + "step": 25080 + }, + { + "epoch": 1.34, + "learning_rate": 0.00018753104618598446, + "loss": 1.3067, + "step": 25100 + }, + { + "epoch": 1.34, + "eval_loss": 1.3923522233963013, + "eval_runtime": 49.8579, + "eval_samples_per_second": 60.171, + "eval_steps_per_second": 1.885, + "step": 25100 + }, + { + "epoch": 1.34, + "learning_rate": 0.00018752030588380604, + "loss": 1.3578, + "step": 25120 + }, + { + "epoch": 1.34, + "learning_rate": 0.00018750956558162766, + "loss": 1.3334, + "step": 25140 + }, + { + "epoch": 1.34, + "eval_loss": 1.3927327394485474, + "eval_runtime": 49.9361, + "eval_samples_per_second": 60.077, + "eval_steps_per_second": 1.882, + "step": 25150 + }, + { + "epoch": 1.34, + "learning_rate": 0.00018749882527944924, + "loss": 1.3283, + "step": 25160 + }, + { + "epoch": 1.35, + "learning_rate": 0.00018748808497727083, + "loss": 1.3298, + "step": 25180 + }, + { + "epoch": 1.35, + "learning_rate": 0.00018747734467509244, + "loss": 1.3133, + "step": 25200 + }, + { + "epoch": 1.35, + "eval_loss": 1.3926618099212646, + "eval_runtime": 49.8902, + "eval_samples_per_second": 60.132, + "eval_steps_per_second": 1.884, + "step": 25200 + }, + { + "epoch": 1.35, + "learning_rate": 0.00018746660437291403, + "loss": 1.3015, + "step": 25220 + }, + { + "epoch": 1.35, + "learning_rate": 0.00018745586407073564, + "loss": 1.3212, + "step": 25240 + }, + { + "epoch": 1.35, + "eval_loss": 1.3931440114974976, + "eval_runtime": 49.9256, + "eval_samples_per_second": 60.089, + "eval_steps_per_second": 1.883, + "step": 25250 + }, + { + "epoch": 1.35, + "learning_rate": 0.00018744512376855723, + "loss": 1.3574, + "step": 25260 + }, + { + "epoch": 1.35, + "learning_rate": 0.00018743438346637882, + "loss": 1.3192, + "step": 25280 + }, + { + "epoch": 1.35, + "learning_rate": 0.00018742364316420043, + "loss": 1.3267, + "step": 25300 + }, + { + "epoch": 1.35, + "eval_loss": 1.3930535316467285, + "eval_runtime": 49.9314, + "eval_samples_per_second": 60.082, + "eval_steps_per_second": 1.883, + "step": 25300 + }, + { + "epoch": 1.35, + "learning_rate": 0.00018741290286202202, + "loss": 1.3137, + "step": 25320 + }, + { + "epoch": 1.35, + "learning_rate": 0.00018740216255984363, + "loss": 1.2821, + "step": 25340 + }, + { + "epoch": 1.35, + "eval_loss": 1.3924797773361206, + "eval_runtime": 49.9197, + "eval_samples_per_second": 60.097, + "eval_steps_per_second": 1.883, + "step": 25350 + }, + { + "epoch": 1.36, + "learning_rate": 0.00018739142225766524, + "loss": 1.3467, + "step": 25360 + }, + { + "epoch": 1.36, + "learning_rate": 0.00018738068195548683, + "loss": 1.3472, + "step": 25380 + }, + { + "epoch": 1.36, + "learning_rate": 0.00018736994165330844, + "loss": 1.3496, + "step": 25400 + }, + { + "epoch": 1.36, + "eval_loss": 1.3928779363632202, + "eval_runtime": 49.9035, + "eval_samples_per_second": 60.116, + "eval_steps_per_second": 1.884, + "step": 25400 + }, + { + "epoch": 1.36, + "learning_rate": 0.00018735920135113003, + "loss": 1.3438, + "step": 25420 + }, + { + "epoch": 1.36, + "learning_rate": 0.00018734846104895162, + "loss": 1.2587, + "step": 25440 + }, + { + "epoch": 1.36, + "eval_loss": 1.3934276103973389, + "eval_runtime": 49.9277, + "eval_samples_per_second": 60.087, + "eval_steps_per_second": 1.883, + "step": 25450 + }, + { + "epoch": 1.36, + "learning_rate": 0.00018733772074677323, + "loss": 1.3466, + "step": 25460 + }, + { + "epoch": 1.36, + "learning_rate": 0.00018732698044459482, + "loss": 1.3482, + "step": 25480 + }, + { + "epoch": 1.36, + "learning_rate": 0.00018731624014241643, + "loss": 1.3167, + "step": 25500 + }, + { + "epoch": 1.36, + "eval_loss": 1.3931827545166016, + "eval_runtime": 49.8943, + "eval_samples_per_second": 60.127, + "eval_steps_per_second": 1.884, + "step": 25500 + }, + { + "epoch": 1.36, + "learning_rate": 0.00018730549984023802, + "loss": 1.3419, + "step": 25520 + }, + { + "epoch": 1.36, + "learning_rate": 0.00018729529655316853, + "loss": 1.3171, + "step": 25540 + }, + { + "epoch": 1.37, + "eval_loss": 1.393099308013916, + "eval_runtime": 49.9382, + "eval_samples_per_second": 60.074, + "eval_steps_per_second": 1.882, + "step": 25550 + }, + { + "epoch": 1.37, + "learning_rate": 0.00018728455625099014, + "loss": 1.3, + "step": 25560 + }, + { + "epoch": 1.37, + "learning_rate": 0.00018727381594881173, + "loss": 1.3199, + "step": 25580 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001872630756466333, + "loss": 1.3207, + "step": 25600 + }, + { + "epoch": 1.37, + "eval_loss": 1.3924485445022583, + "eval_runtime": 49.915, + "eval_samples_per_second": 60.102, + "eval_steps_per_second": 1.883, + "step": 25600 + }, + { + "epoch": 1.37, + "learning_rate": 0.00018725233534445493, + "loss": 1.2997, + "step": 25620 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001872415950422765, + "loss": 1.3398, + "step": 25640 + }, + { + "epoch": 1.37, + "eval_loss": 1.3940945863723755, + "eval_runtime": 49.9277, + "eval_samples_per_second": 60.087, + "eval_steps_per_second": 1.883, + "step": 25650 + }, + { + "epoch": 1.37, + "learning_rate": 0.00018723085474009813, + "loss": 1.3315, + "step": 25660 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001872201144379197, + "loss": 1.3389, + "step": 25680 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001872093741357413, + "loss": 1.3481, + "step": 25700 + }, + { + "epoch": 1.37, + "eval_loss": 1.3938196897506714, + "eval_runtime": 49.8983, + "eval_samples_per_second": 60.122, + "eval_steps_per_second": 1.884, + "step": 25700 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001871986338335629, + "loss": 1.3122, + "step": 25720 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001871878935313845, + "loss": 1.3473, + "step": 25740 + }, + { + "epoch": 1.38, + "eval_loss": 1.3930983543395996, + "eval_runtime": 49.9502, + "eval_samples_per_second": 60.06, + "eval_steps_per_second": 1.882, + "step": 25750 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001871771532292061, + "loss": 1.3158, + "step": 25760 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001871664129270277, + "loss": 1.2729, + "step": 25780 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001871556726248493, + "loss": 1.3306, + "step": 25800 + }, + { + "epoch": 1.38, + "eval_loss": 1.3933429718017578, + "eval_runtime": 49.8803, + "eval_samples_per_second": 60.144, + "eval_steps_per_second": 1.885, + "step": 25800 + }, + { + "epoch": 1.38, + "learning_rate": 0.00018714493232267093, + "loss": 1.3314, + "step": 25820 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001871341920204925, + "loss": 1.2822, + "step": 25840 + }, + { + "epoch": 1.38, + "eval_loss": 1.3929928541183472, + "eval_runtime": 49.9522, + "eval_samples_per_second": 60.057, + "eval_steps_per_second": 1.882, + "step": 25850 + }, + { + "epoch": 1.38, + "learning_rate": 0.00018712345171831413, + "loss": 1.3469, + "step": 25860 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001871127114161357, + "loss": 1.3227, + "step": 25880 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001871019711139573, + "loss": 1.3566, + "step": 25900 + }, + { + "epoch": 1.38, + "eval_loss": 1.393803596496582, + "eval_runtime": 49.9071, + "eval_samples_per_second": 60.112, + "eval_steps_per_second": 1.883, + "step": 25900 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001870912308117789, + "loss": 1.3172, + "step": 25920 + }, + { + "epoch": 1.39, + "learning_rate": 0.0001870804905096005, + "loss": 1.3102, + "step": 25940 + }, + { + "epoch": 1.39, + "eval_loss": 1.3934656381607056, + "eval_runtime": 49.9694, + "eval_samples_per_second": 60.037, + "eval_steps_per_second": 1.881, + "step": 25950 + }, + { + "epoch": 1.39, + "learning_rate": 0.0001870697502074221, + "loss": 1.3202, + "step": 25960 + }, + { + "epoch": 1.39, + "learning_rate": 0.0001870590099052437, + "loss": 1.3437, + "step": 25980 + }, + { + "epoch": 1.39, + "learning_rate": 0.00018704826960306528, + "loss": 1.2984, + "step": 26000 + }, + { + "epoch": 1.39, + "eval_loss": 1.3933109045028687, + "eval_runtime": 49.9288, + "eval_samples_per_second": 60.086, + "eval_steps_per_second": 1.883, + "step": 26000 + }, + { + "epoch": 1.39, + "learning_rate": 0.0001870375293008869, + "loss": 1.3126, + "step": 26020 + }, + { + "epoch": 1.39, + "learning_rate": 0.00018702678899870848, + "loss": 1.3076, + "step": 26040 + }, + { + "epoch": 1.39, + "eval_loss": 1.393373727798462, + "eval_runtime": 49.9405, + "eval_samples_per_second": 60.071, + "eval_steps_per_second": 1.882, + "step": 26050 + }, + { + "epoch": 1.39, + "learning_rate": 0.0001870160486965301, + "loss": 1.3448, + "step": 26060 + }, + { + "epoch": 1.39, + "learning_rate": 0.00018700530839435168, + "loss": 1.31, + "step": 26080 + }, + { + "epoch": 1.39, + "learning_rate": 0.00018699456809217327, + "loss": 1.2794, + "step": 26100 + }, + { + "epoch": 1.39, + "eval_loss": 1.3935492038726807, + "eval_runtime": 49.9018, + "eval_samples_per_second": 60.118, + "eval_steps_per_second": 1.884, + "step": 26100 + }, + { + "epoch": 1.4, + "learning_rate": 0.00018698382778999488, + "loss": 1.3262, + "step": 26120 + }, + { + "epoch": 1.4, + "learning_rate": 0.00018697308748781647, + "loss": 1.3433, + "step": 26140 + }, + { + "epoch": 1.4, + "eval_loss": 1.393388032913208, + "eval_runtime": 49.9565, + "eval_samples_per_second": 60.052, + "eval_steps_per_second": 1.882, + "step": 26150 + }, + { + "epoch": 1.4, + "learning_rate": 0.00018696234718563808, + "loss": 1.3313, + "step": 26160 + }, + { + "epoch": 1.4, + "learning_rate": 0.00018695160688345967, + "loss": 1.3358, + "step": 26180 + }, + { + "epoch": 1.4, + "learning_rate": 0.00018694086658128128, + "loss": 1.3343, + "step": 26200 + }, + { + "epoch": 1.4, + "eval_loss": 1.3940303325653076, + "eval_runtime": 49.921, + "eval_samples_per_second": 60.095, + "eval_steps_per_second": 1.883, + "step": 26200 + }, + { + "epoch": 1.4, + "learning_rate": 0.00018693012627910287, + "loss": 1.3278, + "step": 26220 + }, + { + "epoch": 1.4, + "learning_rate": 0.00018691938597692446, + "loss": 1.3649, + "step": 26240 + }, + { + "epoch": 1.4, + "eval_loss": 1.3933554887771606, + "eval_runtime": 49.9527, + "eval_samples_per_second": 60.057, + "eval_steps_per_second": 1.882, + "step": 26250 + }, + { + "epoch": 1.4, + "learning_rate": 0.00018690864567474607, + "loss": 1.3367, + "step": 26260 + }, + { + "epoch": 1.4, + "learning_rate": 0.00018689790537256766, + "loss": 1.3136, + "step": 26280 + }, + { + "epoch": 1.41, + "learning_rate": 0.00018688716507038927, + "loss": 1.322, + "step": 26300 + }, + { + "epoch": 1.41, + "eval_loss": 1.393583059310913, + "eval_runtime": 49.897, + "eval_samples_per_second": 60.124, + "eval_steps_per_second": 1.884, + "step": 26300 + }, + { + "epoch": 1.41, + "learning_rate": 0.00018687642476821086, + "loss": 1.3264, + "step": 26320 + }, + { + "epoch": 1.41, + "learning_rate": 0.00018686568446603244, + "loss": 1.3185, + "step": 26340 + }, + { + "epoch": 1.41, + "eval_loss": 1.3927003145217896, + "eval_runtime": 49.8674, + "eval_samples_per_second": 60.16, + "eval_steps_per_second": 1.885, + "step": 26350 + }, + { + "epoch": 1.41, + "learning_rate": 0.00018685494416385406, + "loss": 1.3418, + "step": 26360 + }, + { + "epoch": 1.41, + "learning_rate": 0.00018684420386167564, + "loss": 1.3065, + "step": 26380 + }, + { + "epoch": 1.41, + "learning_rate": 0.00018683346355949726, + "loss": 1.3206, + "step": 26400 + }, + { + "epoch": 1.41, + "eval_loss": 1.3931444883346558, + "eval_runtime": 49.9132, + "eval_samples_per_second": 60.104, + "eval_steps_per_second": 1.883, + "step": 26400 + }, + { + "epoch": 1.41, + "learning_rate": 0.00018682272325731884, + "loss": 1.306, + "step": 26420 + }, + { + "epoch": 1.41, + "learning_rate": 0.00018681198295514043, + "loss": 1.3244, + "step": 26440 + }, + { + "epoch": 1.41, + "eval_loss": 1.3925822973251343, + "eval_runtime": 49.9477, + "eval_samples_per_second": 60.063, + "eval_steps_per_second": 1.882, + "step": 26450 + }, + { + "epoch": 1.41, + "learning_rate": 0.00018680124265296207, + "loss": 1.3014, + "step": 26460 + }, + { + "epoch": 1.41, + "learning_rate": 0.00018679050235078366, + "loss": 1.3444, + "step": 26480 + }, + { + "epoch": 1.42, + "learning_rate": 0.00018677976204860524, + "loss": 1.2831, + "step": 26500 + }, + { + "epoch": 1.42, + "eval_loss": 1.3929316997528076, + "eval_runtime": 49.8972, + "eval_samples_per_second": 60.124, + "eval_steps_per_second": 1.884, + "step": 26500 + }, + { + "epoch": 1.42, + "learning_rate": 0.00018676902174642686, + "loss": 1.3343, + "step": 26520 + }, + { + "epoch": 1.42, + "learning_rate": 0.00018675828144424844, + "loss": 1.3284, + "step": 26540 + }, + { + "epoch": 1.42, + "eval_loss": 1.392746090888977, + "eval_runtime": 49.9365, + "eval_samples_per_second": 60.076, + "eval_steps_per_second": 1.882, + "step": 26550 + }, + { + "epoch": 1.42, + "learning_rate": 0.00018674754114207006, + "loss": 1.2949, + "step": 26560 + }, + { + "epoch": 1.42, + "learning_rate": 0.00018673680083989164, + "loss": 1.2962, + "step": 26580 + }, + { + "epoch": 1.42, + "learning_rate": 0.00018672606053771323, + "loss": 1.3729, + "step": 26600 + }, + { + "epoch": 1.42, + "eval_loss": 1.393019676208496, + "eval_runtime": 49.8964, + "eval_samples_per_second": 60.125, + "eval_steps_per_second": 1.884, + "step": 26600 + }, + { + "epoch": 1.42, + "learning_rate": 0.00018671532023553484, + "loss": 1.334, + "step": 26620 + }, + { + "epoch": 1.42, + "learning_rate": 0.00018670457993335643, + "loss": 1.2929, + "step": 26640 + }, + { + "epoch": 1.42, + "eval_loss": 1.3938348293304443, + "eval_runtime": 49.9577, + "eval_samples_per_second": 60.051, + "eval_steps_per_second": 1.882, + "step": 26650 + }, + { + "epoch": 1.42, + "learning_rate": 0.00018669383963117804, + "loss": 1.3346, + "step": 26660 + }, + { + "epoch": 1.43, + "learning_rate": 0.00018668309932899963, + "loss": 1.3547, + "step": 26680 + }, + { + "epoch": 1.43, + "learning_rate": 0.00018667235902682124, + "loss": 1.3088, + "step": 26700 + }, + { + "epoch": 1.43, + "eval_loss": 1.3936799764633179, + "eval_runtime": 49.8956, + "eval_samples_per_second": 60.126, + "eval_steps_per_second": 1.884, + "step": 26700 + }, + { + "epoch": 1.43, + "learning_rate": 0.00018666161872464283, + "loss": 1.3428, + "step": 26720 + }, + { + "epoch": 1.43, + "learning_rate": 0.00018665087842246442, + "loss": 1.3654, + "step": 26740 + }, + { + "epoch": 1.43, + "eval_loss": 1.3933039903640747, + "eval_runtime": 49.9078, + "eval_samples_per_second": 60.111, + "eval_steps_per_second": 1.883, + "step": 26750 + }, + { + "epoch": 1.43, + "learning_rate": 0.00018664013812028603, + "loss": 1.3661, + "step": 26760 + }, + { + "epoch": 1.43, + "learning_rate": 0.00018662939781810762, + "loss": 1.3255, + "step": 26780 + }, + { + "epoch": 1.43, + "learning_rate": 0.00018661865751592923, + "loss": 1.3279, + "step": 26800 + }, + { + "epoch": 1.43, + "eval_loss": 1.3933026790618896, + "eval_runtime": 49.8749, + "eval_samples_per_second": 60.15, + "eval_steps_per_second": 1.885, + "step": 26800 + }, + { + "epoch": 1.43, + "learning_rate": 0.00018660791721375082, + "loss": 1.3453, + "step": 26820 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001865971769115724, + "loss": 1.3479, + "step": 26840 + }, + { + "epoch": 1.43, + "eval_loss": 1.3933632373809814, + "eval_runtime": 49.9185, + "eval_samples_per_second": 60.098, + "eval_steps_per_second": 1.883, + "step": 26850 + }, + { + "epoch": 1.44, + "learning_rate": 0.00018658643660939402, + "loss": 1.348, + "step": 26860 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001865756963072156, + "loss": 1.2823, + "step": 26880 + }, + { + "epoch": 1.44, + "learning_rate": 0.00018656495600503722, + "loss": 1.3635, + "step": 26900 + }, + { + "epoch": 1.44, + "eval_loss": 1.39280366897583, + "eval_runtime": 49.9035, + "eval_samples_per_second": 60.116, + "eval_steps_per_second": 1.884, + "step": 26900 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001865542157028588, + "loss": 1.3419, + "step": 26920 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001865434754006804, + "loss": 1.3187, + "step": 26940 + }, + { + "epoch": 1.44, + "eval_loss": 1.3931427001953125, + "eval_runtime": 49.9404, + "eval_samples_per_second": 60.072, + "eval_steps_per_second": 1.882, + "step": 26950 + }, + { + "epoch": 1.44, + "learning_rate": 0.000186532735098502, + "loss": 1.3267, + "step": 26960 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001865219947963236, + "loss": 1.2934, + "step": 26980 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001865112544941452, + "loss": 1.3212, + "step": 27000 + }, + { + "epoch": 1.44, + "eval_loss": 1.3930130004882812, + "eval_runtime": 49.8854, + "eval_samples_per_second": 60.138, + "eval_steps_per_second": 1.884, + "step": 27000 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001865005141919668, + "loss": 1.3249, + "step": 27020 + }, + { + "epoch": 1.44, + "learning_rate": 0.00018648977388978838, + "loss": 1.3197, + "step": 27040 + }, + { + "epoch": 1.45, + "eval_loss": 1.392832636833191, + "eval_runtime": 49.9646, + "eval_samples_per_second": 60.043, + "eval_steps_per_second": 1.881, + "step": 27050 + }, + { + "epoch": 1.45, + "learning_rate": 0.00018647903358761, + "loss": 1.3284, + "step": 27060 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001864682932854316, + "loss": 1.2691, + "step": 27080 + }, + { + "epoch": 1.45, + "learning_rate": 0.00018645755298325322, + "loss": 1.3149, + "step": 27100 + }, + { + "epoch": 1.45, + "eval_loss": 1.3940879106521606, + "eval_runtime": 49.9065, + "eval_samples_per_second": 60.112, + "eval_steps_per_second": 1.884, + "step": 27100 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001864468126810748, + "loss": 1.3257, + "step": 27120 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001864360723788964, + "loss": 1.3285, + "step": 27140 + }, + { + "epoch": 1.45, + "eval_loss": 1.3934521675109863, + "eval_runtime": 49.9347, + "eval_samples_per_second": 60.078, + "eval_steps_per_second": 1.882, + "step": 27150 + }, + { + "epoch": 1.45, + "learning_rate": 0.000186425332076718, + "loss": 1.2978, + "step": 27160 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001864145917745396, + "loss": 1.3298, + "step": 27180 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001864038514723612, + "loss": 1.3334, + "step": 27200 + }, + { + "epoch": 1.45, + "eval_loss": 1.3929413557052612, + "eval_runtime": 49.9409, + "eval_samples_per_second": 60.071, + "eval_steps_per_second": 1.882, + "step": 27200 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001863931111701828, + "loss": 1.3099, + "step": 27220 + }, + { + "epoch": 1.46, + "learning_rate": 0.00018638237086800438, + "loss": 1.3228, + "step": 27240 + }, + { + "epoch": 1.46, + "eval_loss": 1.393315076828003, + "eval_runtime": 49.8386, + "eval_samples_per_second": 60.194, + "eval_steps_per_second": 1.886, + "step": 27250 + }, + { + "epoch": 1.46, + "learning_rate": 0.000186371630565826, + "loss": 1.3152, + "step": 27260 + }, + { + "epoch": 1.46, + "learning_rate": 0.00018636089026364758, + "loss": 1.3366, + "step": 27280 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001863501499614692, + "loss": 1.3362, + "step": 27300 + }, + { + "epoch": 1.46, + "eval_loss": 1.3930094242095947, + "eval_runtime": 49.8861, + "eval_samples_per_second": 60.137, + "eval_steps_per_second": 1.884, + "step": 27300 + }, + { + "epoch": 1.46, + "learning_rate": 0.00018633940965929078, + "loss": 1.2964, + "step": 27320 + }, + { + "epoch": 1.46, + "learning_rate": 0.00018632866935711236, + "loss": 1.3114, + "step": 27340 + }, + { + "epoch": 1.46, + "eval_loss": 1.3929059505462646, + "eval_runtime": 49.9407, + "eval_samples_per_second": 60.071, + "eval_steps_per_second": 1.882, + "step": 27350 + }, + { + "epoch": 1.46, + "learning_rate": 0.00018631792905493398, + "loss": 1.2916, + "step": 27360 + }, + { + "epoch": 1.46, + "learning_rate": 0.00018630718875275556, + "loss": 1.3073, + "step": 27380 + }, + { + "epoch": 1.46, + "learning_rate": 0.00018629644845057718, + "loss": 1.3526, + "step": 27400 + }, + { + "epoch": 1.46, + "eval_loss": 1.3936972618103027, + "eval_runtime": 49.9004, + "eval_samples_per_second": 60.12, + "eval_steps_per_second": 1.884, + "step": 27400 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018628570814839876, + "loss": 1.3262, + "step": 27420 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018627496784622035, + "loss": 1.3273, + "step": 27440 + }, + { + "epoch": 1.47, + "eval_loss": 1.3936001062393188, + "eval_runtime": 49.94, + "eval_samples_per_second": 60.072, + "eval_steps_per_second": 1.882, + "step": 27450 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018626422754404196, + "loss": 1.3579, + "step": 27460 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018625348724186355, + "loss": 1.3286, + "step": 27480 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018624274693968516, + "loss": 1.2916, + "step": 27500 + }, + { + "epoch": 1.47, + "eval_loss": 1.3938214778900146, + "eval_runtime": 49.8847, + "eval_samples_per_second": 60.139, + "eval_steps_per_second": 1.884, + "step": 27500 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018623200663750675, + "loss": 1.3314, + "step": 27520 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018622126633532834, + "loss": 1.341, + "step": 27540 + }, + { + "epoch": 1.47, + "eval_loss": 1.3936923742294312, + "eval_runtime": 49.9482, + "eval_samples_per_second": 60.062, + "eval_steps_per_second": 1.882, + "step": 27550 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018621052603314995, + "loss": 1.3155, + "step": 27560 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018619978573097154, + "loss": 1.345, + "step": 27580 + }, + { + "epoch": 1.47, + "learning_rate": 0.00018618904542879315, + "loss": 1.3249, + "step": 27600 + }, + { + "epoch": 1.47, + "eval_loss": 1.3934848308563232, + "eval_runtime": 49.8694, + "eval_samples_per_second": 60.157, + "eval_steps_per_second": 1.885, + "step": 27600 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018617830512661474, + "loss": 1.3474, + "step": 27620 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018616756482443635, + "loss": 1.3055, + "step": 27640 + }, + { + "epoch": 1.48, + "eval_loss": 1.3937422037124634, + "eval_runtime": 49.9232, + "eval_samples_per_second": 60.092, + "eval_steps_per_second": 1.883, + "step": 27650 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018615682452225794, + "loss": 1.3026, + "step": 27660 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018614608422007952, + "loss": 1.3693, + "step": 27680 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018613534391790114, + "loss": 1.3237, + "step": 27700 + }, + { + "epoch": 1.48, + "eval_loss": 1.3935511112213135, + "eval_runtime": 49.9057, + "eval_samples_per_second": 60.113, + "eval_steps_per_second": 1.884, + "step": 27700 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018612460361572275, + "loss": 1.3571, + "step": 27720 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018611386331354434, + "loss": 1.363, + "step": 27740 + }, + { + "epoch": 1.48, + "eval_loss": 1.3939462900161743, + "eval_runtime": 49.9303, + "eval_samples_per_second": 60.084, + "eval_steps_per_second": 1.883, + "step": 27750 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018610312301136595, + "loss": 1.3343, + "step": 27760 + }, + { + "epoch": 1.48, + "learning_rate": 0.00018609238270918754, + "loss": 1.3462, + "step": 27780 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018608164240700915, + "loss": 1.2996, + "step": 27800 + }, + { + "epoch": 1.49, + "eval_loss": 1.393791675567627, + "eval_runtime": 49.8552, + "eval_samples_per_second": 60.174, + "eval_steps_per_second": 1.885, + "step": 27800 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018607090210483074, + "loss": 1.3455, + "step": 27820 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018606016180265232, + "loss": 1.3544, + "step": 27840 + }, + { + "epoch": 1.49, + "eval_loss": 1.3946839570999146, + "eval_runtime": 49.9147, + "eval_samples_per_second": 60.103, + "eval_steps_per_second": 1.883, + "step": 27850 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018604942150047394, + "loss": 1.3101, + "step": 27860 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018603868119829552, + "loss": 1.3247, + "step": 27880 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018602794089611714, + "loss": 1.3394, + "step": 27900 + }, + { + "epoch": 1.49, + "eval_loss": 1.3933064937591553, + "eval_runtime": 49.8942, + "eval_samples_per_second": 60.127, + "eval_steps_per_second": 1.884, + "step": 27900 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018601720059393872, + "loss": 1.3662, + "step": 27920 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001860064602917603, + "loss": 1.2861, + "step": 27940 + }, + { + "epoch": 1.49, + "eval_loss": 1.3942046165466309, + "eval_runtime": 49.9129, + "eval_samples_per_second": 60.105, + "eval_steps_per_second": 1.883, + "step": 27950 + }, + { + "epoch": 1.49, + "learning_rate": 0.00018599571998958192, + "loss": 1.3252, + "step": 27960 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001859849796874035, + "loss": 1.2789, + "step": 27980 + }, + { + "epoch": 1.5, + "learning_rate": 0.00018597423938522512, + "loss": 1.31, + "step": 28000 + }, + { + "epoch": 1.5, + "eval_loss": 1.3933945894241333, + "eval_runtime": 49.8928, + "eval_samples_per_second": 60.129, + "eval_steps_per_second": 1.884, + "step": 28000 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001859634990830467, + "loss": 1.3311, + "step": 28020 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001859527587808683, + "loss": 1.348, + "step": 28040 + }, + { + "epoch": 1.5, + "eval_loss": 1.3926706314086914, + "eval_runtime": 49.9283, + "eval_samples_per_second": 60.086, + "eval_steps_per_second": 1.883, + "step": 28050 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001859420184786899, + "loss": 1.3212, + "step": 28060 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001859312781765115, + "loss": 1.3279, + "step": 28080 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001859205378743331, + "loss": 1.3175, + "step": 28100 + }, + { + "epoch": 1.5, + "eval_loss": 1.3925825357437134, + "eval_runtime": 49.9118, + "eval_samples_per_second": 60.106, + "eval_steps_per_second": 1.883, + "step": 28100 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001859097975721547, + "loss": 1.3251, + "step": 28120 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001858990572699763, + "loss": 1.3301, + "step": 28140 + }, + { + "epoch": 1.5, + "eval_loss": 1.3925762176513672, + "eval_runtime": 49.9246, + "eval_samples_per_second": 60.091, + "eval_steps_per_second": 1.883, + "step": 28150 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001858883169677979, + "loss": 1.313, + "step": 28160 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018587757666561948, + "loss": 1.3034, + "step": 28180 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001858668363634411, + "loss": 1.339, + "step": 28200 + }, + { + "epoch": 1.51, + "eval_loss": 1.392657995223999, + "eval_runtime": 49.8943, + "eval_samples_per_second": 60.127, + "eval_steps_per_second": 1.884, + "step": 28200 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018585609606126268, + "loss": 1.3148, + "step": 28220 + }, + { + "epoch": 1.51, + "learning_rate": 0.0001858453557590843, + "loss": 1.3727, + "step": 28240 + }, + { + "epoch": 1.51, + "eval_loss": 1.3924033641815186, + "eval_runtime": 49.919, + "eval_samples_per_second": 60.097, + "eval_steps_per_second": 1.883, + "step": 28250 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018583461545690588, + "loss": 1.3346, + "step": 28260 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018582387515472747, + "loss": 1.3133, + "step": 28280 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018581313485254908, + "loss": 1.3107, + "step": 28300 + }, + { + "epoch": 1.51, + "eval_loss": 1.3919031620025635, + "eval_runtime": 49.9062, + "eval_samples_per_second": 60.113, + "eval_steps_per_second": 1.884, + "step": 28300 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018580239455037067, + "loss": 1.2881, + "step": 28320 + }, + { + "epoch": 1.51, + "learning_rate": 0.00018579165424819228, + "loss": 1.3474, + "step": 28340 + }, + { + "epoch": 1.51, + "eval_loss": 1.392065405845642, + "eval_runtime": 49.9338, + "eval_samples_per_second": 60.08, + "eval_steps_per_second": 1.882, + "step": 28350 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001857809139460139, + "loss": 1.3724, + "step": 28360 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018577017364383548, + "loss": 1.3168, + "step": 28380 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001857594333416571, + "loss": 1.3254, + "step": 28400 + }, + { + "epoch": 1.52, + "eval_loss": 1.392147421836853, + "eval_runtime": 49.8898, + "eval_samples_per_second": 60.132, + "eval_steps_per_second": 1.884, + "step": 28400 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018574869303947868, + "loss": 1.3542, + "step": 28420 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018573795273730027, + "loss": 1.3569, + "step": 28440 + }, + { + "epoch": 1.52, + "eval_loss": 1.3924622535705566, + "eval_runtime": 49.9258, + "eval_samples_per_second": 60.089, + "eval_steps_per_second": 1.883, + "step": 28450 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018572721243512188, + "loss": 1.3139, + "step": 28460 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018571647213294347, + "loss": 1.3265, + "step": 28480 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018570573183076508, + "loss": 1.3044, + "step": 28500 + }, + { + "epoch": 1.52, + "eval_loss": 1.3926573991775513, + "eval_runtime": 49.9088, + "eval_samples_per_second": 60.11, + "eval_steps_per_second": 1.883, + "step": 28500 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018569499152858667, + "loss": 1.3366, + "step": 28520 + }, + { + "epoch": 1.52, + "learning_rate": 0.00018568425122640828, + "loss": 1.3042, + "step": 28540 + }, + { + "epoch": 1.53, + "eval_loss": 1.3924212455749512, + "eval_runtime": 49.9477, + "eval_samples_per_second": 60.063, + "eval_steps_per_second": 1.882, + "step": 28550 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018567351092422987, + "loss": 1.3323, + "step": 28560 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018566277062205145, + "loss": 1.3481, + "step": 28580 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018565203031987307, + "loss": 1.3516, + "step": 28600 + }, + { + "epoch": 1.53, + "eval_loss": 1.392707347869873, + "eval_runtime": 49.881, + "eval_samples_per_second": 60.143, + "eval_steps_per_second": 1.884, + "step": 28600 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018564129001769465, + "loss": 1.3056, + "step": 28620 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018563054971551627, + "loss": 1.3222, + "step": 28640 + }, + { + "epoch": 1.53, + "eval_loss": 1.3928587436676025, + "eval_runtime": 49.9368, + "eval_samples_per_second": 60.076, + "eval_steps_per_second": 1.882, + "step": 28650 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018561980941333785, + "loss": 1.3017, + "step": 28660 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018560906911115944, + "loss": 1.3136, + "step": 28680 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018559832880898105, + "loss": 1.3092, + "step": 28700 + }, + { + "epoch": 1.53, + "eval_loss": 1.39238440990448, + "eval_runtime": 49.9003, + "eval_samples_per_second": 60.12, + "eval_steps_per_second": 1.884, + "step": 28700 + }, + { + "epoch": 1.53, + "learning_rate": 0.00018558758850680264, + "loss": 1.2594, + "step": 28720 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018557684820462425, + "loss": 1.3427, + "step": 28740 + }, + { + "epoch": 1.54, + "eval_loss": 1.3922100067138672, + "eval_runtime": 49.9586, + "eval_samples_per_second": 60.05, + "eval_steps_per_second": 1.882, + "step": 28750 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018556610790244584, + "loss": 1.3591, + "step": 28760 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018555536760026743, + "loss": 1.3285, + "step": 28780 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018554462729808904, + "loss": 1.2918, + "step": 28800 + }, + { + "epoch": 1.54, + "eval_loss": 1.3918845653533936, + "eval_runtime": 49.8909, + "eval_samples_per_second": 60.131, + "eval_steps_per_second": 1.884, + "step": 28800 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018553388699591063, + "loss": 1.2951, + "step": 28820 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018552314669373224, + "loss": 1.3428, + "step": 28840 + }, + { + "epoch": 1.54, + "eval_loss": 1.3924897909164429, + "eval_runtime": 49.9365, + "eval_samples_per_second": 60.076, + "eval_steps_per_second": 1.882, + "step": 28850 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018551240639155383, + "loss": 1.3393, + "step": 28860 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001855016660893754, + "loss": 1.3361, + "step": 28880 + }, + { + "epoch": 1.54, + "learning_rate": 0.00018549092578719703, + "loss": 1.3527, + "step": 28900 + }, + { + "epoch": 1.54, + "eval_loss": 1.3922792673110962, + "eval_runtime": 49.9247, + "eval_samples_per_second": 60.091, + "eval_steps_per_second": 1.883, + "step": 28900 + }, + { + "epoch": 1.55, + "learning_rate": 0.0001854801854850186, + "loss": 1.3291, + "step": 28920 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018546944518284023, + "loss": 1.3284, + "step": 28940 + }, + { + "epoch": 1.55, + "eval_loss": 1.392605185508728, + "eval_runtime": 49.9456, + "eval_samples_per_second": 60.065, + "eval_steps_per_second": 1.882, + "step": 28950 + }, + { + "epoch": 1.55, + "learning_rate": 0.0001854587048806618, + "loss": 1.3219, + "step": 28960 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018544796457848343, + "loss": 1.31, + "step": 28980 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018543722427630504, + "loss": 1.3238, + "step": 29000 + }, + { + "epoch": 1.55, + "eval_loss": 1.3933876752853394, + "eval_runtime": 49.9144, + "eval_samples_per_second": 60.103, + "eval_steps_per_second": 1.883, + "step": 29000 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018542648397412663, + "loss": 1.3252, + "step": 29020 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018541574367194824, + "loss": 1.2978, + "step": 29040 + }, + { + "epoch": 1.55, + "eval_loss": 1.393264651298523, + "eval_runtime": 49.9314, + "eval_samples_per_second": 60.082, + "eval_steps_per_second": 1.883, + "step": 29050 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018540500336976983, + "loss": 1.3229, + "step": 29060 + }, + { + "epoch": 1.55, + "learning_rate": 0.0001853942630675914, + "loss": 1.3471, + "step": 29080 + }, + { + "epoch": 1.55, + "learning_rate": 0.00018538352276541303, + "loss": 1.2733, + "step": 29100 + }, + { + "epoch": 1.55, + "eval_loss": 1.3934080600738525, + "eval_runtime": 49.9242, + "eval_samples_per_second": 60.091, + "eval_steps_per_second": 1.883, + "step": 29100 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001853727824632346, + "loss": 1.3456, + "step": 29120 + }, + { + "epoch": 1.56, + "learning_rate": 0.00018536204216105623, + "loss": 1.3596, + "step": 29140 + }, + { + "epoch": 1.56, + "eval_loss": 1.3926467895507812, + "eval_runtime": 49.9137, + "eval_samples_per_second": 60.104, + "eval_steps_per_second": 1.883, + "step": 29150 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001853513018588778, + "loss": 1.305, + "step": 29160 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001853405615566994, + "loss": 1.3292, + "step": 29180 + }, + { + "epoch": 1.56, + "learning_rate": 0.000185329821254521, + "loss": 1.2943, + "step": 29200 + }, + { + "epoch": 1.56, + "eval_loss": 1.3923407793045044, + "eval_runtime": 49.9073, + "eval_samples_per_second": 60.111, + "eval_steps_per_second": 1.883, + "step": 29200 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001853190809523426, + "loss": 1.3187, + "step": 29220 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001853083406501642, + "loss": 1.299, + "step": 29240 + }, + { + "epoch": 1.56, + "eval_loss": 1.3922613859176636, + "eval_runtime": 49.9241, + "eval_samples_per_second": 60.091, + "eval_steps_per_second": 1.883, + "step": 29250 + }, + { + "epoch": 1.56, + "learning_rate": 0.0001852976003479858, + "loss": 1.2977, + "step": 29260 + }, + { + "epoch": 1.56, + "learning_rate": 0.00018528686004580739, + "loss": 1.3288, + "step": 29280 + }, + { + "epoch": 1.57, + "learning_rate": 0.000185276119743629, + "loss": 1.3166, + "step": 29300 + }, + { + "epoch": 1.57, + "eval_loss": 1.3935879468917847, + "eval_runtime": 49.9027, + "eval_samples_per_second": 60.117, + "eval_steps_per_second": 1.884, + "step": 29300 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018526537944145059, + "loss": 1.3237, + "step": 29320 + }, + { + "epoch": 1.57, + "learning_rate": 0.0001852546391392722, + "loss": 1.3144, + "step": 29340 + }, + { + "epoch": 1.57, + "eval_loss": 1.3930631875991821, + "eval_runtime": 49.9398, + "eval_samples_per_second": 60.072, + "eval_steps_per_second": 1.882, + "step": 29350 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018524389883709379, + "loss": 1.3102, + "step": 29360 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018523315853491537, + "loss": 1.335, + "step": 29380 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018522241823273699, + "loss": 1.3373, + "step": 29400 + }, + { + "epoch": 1.57, + "eval_loss": 1.3933336734771729, + "eval_runtime": 49.9162, + "eval_samples_per_second": 60.101, + "eval_steps_per_second": 1.883, + "step": 29400 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018521167793055857, + "loss": 1.332, + "step": 29420 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018520093762838019, + "loss": 1.3363, + "step": 29440 + }, + { + "epoch": 1.57, + "eval_loss": 1.3931363821029663, + "eval_runtime": 49.9165, + "eval_samples_per_second": 60.1, + "eval_steps_per_second": 1.883, + "step": 29450 + }, + { + "epoch": 1.57, + "learning_rate": 0.00018519019732620177, + "loss": 1.3197, + "step": 29460 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018517945702402339, + "loss": 1.2887, + "step": 29480 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018516871672184497, + "loss": 1.3117, + "step": 29500 + }, + { + "epoch": 1.58, + "eval_loss": 1.3927063941955566, + "eval_runtime": 49.9535, + "eval_samples_per_second": 60.056, + "eval_steps_per_second": 1.882, + "step": 29500 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018515797641966656, + "loss": 1.3437, + "step": 29520 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018514723611748817, + "loss": 1.3171, + "step": 29540 + }, + { + "epoch": 1.58, + "eval_loss": 1.392856478691101, + "eval_runtime": 49.9414, + "eval_samples_per_second": 60.07, + "eval_steps_per_second": 1.882, + "step": 29550 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018513649581530976, + "loss": 1.2841, + "step": 29560 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018512575551313137, + "loss": 1.3138, + "step": 29580 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018511501521095299, + "loss": 1.3238, + "step": 29600 + }, + { + "epoch": 1.58, + "eval_loss": 1.3924871683120728, + "eval_runtime": 49.8958, + "eval_samples_per_second": 60.125, + "eval_steps_per_second": 1.884, + "step": 29600 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018510427490877457, + "loss": 1.3225, + "step": 29620 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018509353460659619, + "loss": 1.3532, + "step": 29640 + }, + { + "epoch": 1.58, + "eval_loss": 1.3923962116241455, + "eval_runtime": 49.9475, + "eval_samples_per_second": 60.063, + "eval_steps_per_second": 1.882, + "step": 29650 + }, + { + "epoch": 1.58, + "learning_rate": 0.00018508279430441777, + "loss": 1.311, + "step": 29660 + }, + { + "epoch": 1.59, + "learning_rate": 0.00018507205400223936, + "loss": 1.3255, + "step": 29680 + }, + { + "epoch": 1.59, + "learning_rate": 0.00018506131370006097, + "loss": 1.2946, + "step": 29700 + }, + { + "epoch": 1.59, + "eval_loss": 1.3928083181381226, + "eval_runtime": 49.938, + "eval_samples_per_second": 60.075, + "eval_steps_per_second": 1.882, + "step": 29700 + }, + { + "epoch": 1.59, + "learning_rate": 0.00018505111041299148, + "loss": 1.3049, + "step": 29720 + }, + { + "epoch": 1.59, + "learning_rate": 0.00018504037011081307, + "loss": 1.3325, + "step": 29740 + }, + { + "epoch": 1.59, + "eval_loss": 1.3928927183151245, + "eval_runtime": 49.9328, + "eval_samples_per_second": 60.081, + "eval_steps_per_second": 1.883, + "step": 29750 + }, + { + "epoch": 1.59, + "learning_rate": 0.00018502962980863468, + "loss": 1.2984, + "step": 29760 + }, + { + "epoch": 1.59, + "learning_rate": 0.00018501888950645627, + "loss": 1.3365, + "step": 29780 + }, + { + "epoch": 1.59, + "learning_rate": 0.00018500814920427788, + "loss": 1.3203, + "step": 29800 + }, + { + "epoch": 1.59, + "eval_loss": 1.3930357694625854, + "eval_runtime": 49.8837, + "eval_samples_per_second": 60.14, + "eval_steps_per_second": 1.884, + "step": 29800 + }, + { + "epoch": 1.59, + "learning_rate": 0.00018499740890209947, + "loss": 1.3326, + "step": 29820 + }, + { + "epoch": 1.59, + "learning_rate": 0.00018498666859992105, + "loss": 1.3452, + "step": 29840 + }, + { + "epoch": 1.59, + "eval_loss": 1.3932480812072754, + "eval_runtime": 49.9401, + "eval_samples_per_second": 60.072, + "eval_steps_per_second": 1.882, + "step": 29850 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018497592829774267, + "loss": 1.3253, + "step": 29860 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018496518799556425, + "loss": 1.3311, + "step": 29880 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018495444769338587, + "loss": 1.3038, + "step": 29900 + }, + { + "epoch": 1.6, + "eval_loss": 1.393734097480774, + "eval_runtime": 49.8662, + "eval_samples_per_second": 60.161, + "eval_steps_per_second": 1.885, + "step": 29900 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018494370739120745, + "loss": 1.2999, + "step": 29920 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018493296708902904, + "loss": 1.3208, + "step": 29940 + }, + { + "epoch": 1.6, + "eval_loss": 1.3928980827331543, + "eval_runtime": 49.94, + "eval_samples_per_second": 60.072, + "eval_steps_per_second": 1.882, + "step": 29950 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018492222678685065, + "loss": 1.3317, + "step": 29960 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018491148648467224, + "loss": 1.3216, + "step": 29980 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018490074618249385, + "loss": 1.3013, + "step": 30000 + }, + { + "epoch": 1.6, + "eval_loss": 1.3931738138198853, + "eval_runtime": 49.8997, + "eval_samples_per_second": 60.121, + "eval_steps_per_second": 1.884, + "step": 30000 + }, + { + "epoch": 1.6, + "learning_rate": 0.00018489000588031544, + "loss": 1.2852, + "step": 30020 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018487926557813703, + "loss": 1.2774, + "step": 30040 + }, + { + "epoch": 1.61, + "eval_loss": 1.3928226232528687, + "eval_runtime": 49.9408, + "eval_samples_per_second": 60.071, + "eval_steps_per_second": 1.882, + "step": 30050 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018486852527595864, + "loss": 1.2834, + "step": 30060 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018485778497378025, + "loss": 1.3385, + "step": 30080 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018484704467160187, + "loss": 1.3576, + "step": 30100 + }, + { + "epoch": 1.61, + "eval_loss": 1.3927522897720337, + "eval_runtime": 49.9392, + "eval_samples_per_second": 60.073, + "eval_steps_per_second": 1.882, + "step": 30100 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018483630436942345, + "loss": 1.3092, + "step": 30120 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018482556406724504, + "loss": 1.3149, + "step": 30140 + }, + { + "epoch": 1.61, + "eval_loss": 1.3941123485565186, + "eval_runtime": 49.9459, + "eval_samples_per_second": 60.065, + "eval_steps_per_second": 1.882, + "step": 30150 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018481482376506665, + "loss": 1.3414, + "step": 30160 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018480408346288824, + "loss": 1.3644, + "step": 30180 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018479334316070985, + "loss": 1.3482, + "step": 30200 + }, + { + "epoch": 1.61, + "eval_loss": 1.3934115171432495, + "eval_runtime": 49.9088, + "eval_samples_per_second": 60.11, + "eval_steps_per_second": 1.883, + "step": 30200 + }, + { + "epoch": 1.61, + "learning_rate": 0.00018478260285853144, + "loss": 1.3446, + "step": 30220 + }, + { + "epoch": 1.62, + "learning_rate": 0.00018477186255635303, + "loss": 1.3208, + "step": 30240 + }, + { + "epoch": 1.62, + "eval_loss": 1.3933861255645752, + "eval_runtime": 49.9422, + "eval_samples_per_second": 60.069, + "eval_steps_per_second": 1.882, + "step": 30250 + }, + { + "epoch": 1.62, + "learning_rate": 0.00018476112225417464, + "loss": 1.2915, + "step": 30260 + }, + { + "epoch": 1.62, + "learning_rate": 0.00018475038195199623, + "loss": 1.2714, + "step": 30280 + }, + { + "epoch": 1.62, + "learning_rate": 0.00018473964164981784, + "loss": 1.2847, + "step": 30300 + }, + { + "epoch": 1.62, + "eval_loss": 1.3934121131896973, + "eval_runtime": 49.906, + "eval_samples_per_second": 60.113, + "eval_steps_per_second": 1.884, + "step": 30300 + }, + { + "epoch": 1.62, + "learning_rate": 0.00018472890134763943, + "loss": 1.3169, + "step": 30320 + }, + { + "epoch": 1.62, + "learning_rate": 0.000184718161045461, + "loss": 1.3239, + "step": 30340 + }, + { + "epoch": 1.62, + "eval_loss": 1.3946083784103394, + "eval_runtime": 49.9402, + "eval_samples_per_second": 60.072, + "eval_steps_per_second": 1.882, + "step": 30350 + }, + { + "epoch": 1.62, + "learning_rate": 0.00018470742074328263, + "loss": 1.3185, + "step": 30360 + }, + { + "epoch": 1.62, + "learning_rate": 0.0001846966804411042, + "loss": 1.3229, + "step": 30380 + }, + { + "epoch": 1.62, + "learning_rate": 0.00018468594013892583, + "loss": 1.2981, + "step": 30400 + }, + { + "epoch": 1.62, + "eval_loss": 1.3945001363754272, + "eval_runtime": 49.8812, + "eval_samples_per_second": 60.143, + "eval_steps_per_second": 1.884, + "step": 30400 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001846751998367474, + "loss": 1.3203, + "step": 30420 + }, + { + "epoch": 1.63, + "learning_rate": 0.000184664459534569, + "loss": 1.2967, + "step": 30440 + }, + { + "epoch": 1.63, + "eval_loss": 1.393681287765503, + "eval_runtime": 49.94, + "eval_samples_per_second": 60.072, + "eval_steps_per_second": 1.882, + "step": 30450 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001846537192323906, + "loss": 1.2815, + "step": 30460 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001846429789302122, + "loss": 1.3216, + "step": 30480 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001846322386280338, + "loss": 1.3504, + "step": 30500 + }, + { + "epoch": 1.63, + "eval_loss": 1.3937861919403076, + "eval_runtime": 49.8911, + "eval_samples_per_second": 60.131, + "eval_steps_per_second": 1.884, + "step": 30500 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001846214983258554, + "loss": 1.2858, + "step": 30520 + }, + { + "epoch": 1.63, + "learning_rate": 0.00018461075802367699, + "loss": 1.322, + "step": 30540 + }, + { + "epoch": 1.63, + "eval_loss": 1.3939573764801025, + "eval_runtime": 49.8953, + "eval_samples_per_second": 60.126, + "eval_steps_per_second": 1.884, + "step": 30550 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001846000177214986, + "loss": 1.2909, + "step": 30560 + }, + { + "epoch": 1.63, + "learning_rate": 0.00018458927741932019, + "loss": 1.2957, + "step": 30580 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001845785371171418, + "loss": 1.3201, + "step": 30600 + }, + { + "epoch": 1.64, + "eval_loss": 1.3934041261672974, + "eval_runtime": 49.8947, + "eval_samples_per_second": 60.127, + "eval_steps_per_second": 1.884, + "step": 30600 + }, + { + "epoch": 1.64, + "learning_rate": 0.00018456779681496339, + "loss": 1.2852, + "step": 30620 + }, + { + "epoch": 1.64, + "learning_rate": 0.000184557056512785, + "loss": 1.3183, + "step": 30640 + }, + { + "epoch": 1.64, + "eval_loss": 1.3938255310058594, + "eval_runtime": 49.9299, + "eval_samples_per_second": 60.084, + "eval_steps_per_second": 1.883, + "step": 30650 + }, + { + "epoch": 1.64, + "learning_rate": 0.00018454631621060659, + "loss": 1.3321, + "step": 30660 + }, + { + "epoch": 1.64, + "learning_rate": 0.00018453557590842817, + "loss": 1.321, + "step": 30680 + }, + { + "epoch": 1.64, + "learning_rate": 0.00018452483560624979, + "loss": 1.304, + "step": 30700 + }, + { + "epoch": 1.64, + "eval_loss": 1.3930375576019287, + "eval_runtime": 49.9022, + "eval_samples_per_second": 60.118, + "eval_steps_per_second": 1.884, + "step": 30700 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001845140953040714, + "loss": 1.2811, + "step": 30720 + }, + { + "epoch": 1.64, + "learning_rate": 0.00018450335500189299, + "loss": 1.3484, + "step": 30740 + }, + { + "epoch": 1.64, + "eval_loss": 1.3932937383651733, + "eval_runtime": 49.8794, + "eval_samples_per_second": 60.145, + "eval_steps_per_second": 1.885, + "step": 30750 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001844926146997146, + "loss": 1.3324, + "step": 30760 + }, + { + "epoch": 1.64, + "learning_rate": 0.00018448187439753619, + "loss": 1.3314, + "step": 30780 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001844711340953578, + "loss": 1.3306, + "step": 30800 + }, + { + "epoch": 1.65, + "eval_loss": 1.3924168348312378, + "eval_runtime": 49.8668, + "eval_samples_per_second": 60.16, + "eval_steps_per_second": 1.885, + "step": 30800 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018446039379317939, + "loss": 1.3495, + "step": 30820 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018444965349100097, + "loss": 1.2725, + "step": 30840 + }, + { + "epoch": 1.65, + "eval_loss": 1.3924477100372314, + "eval_runtime": 49.9112, + "eval_samples_per_second": 60.107, + "eval_steps_per_second": 1.883, + "step": 30850 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018443891318882259, + "loss": 1.3156, + "step": 30860 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018442817288664417, + "loss": 1.2995, + "step": 30880 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018441743258446579, + "loss": 1.3582, + "step": 30900 + }, + { + "epoch": 1.65, + "eval_loss": 1.3932455778121948, + "eval_runtime": 49.9019, + "eval_samples_per_second": 60.118, + "eval_steps_per_second": 1.884, + "step": 30900 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018440669228228737, + "loss": 1.3256, + "step": 30920 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018439595198010896, + "loss": 1.2901, + "step": 30940 + }, + { + "epoch": 1.65, + "eval_loss": 1.3927903175354004, + "eval_runtime": 49.9444, + "eval_samples_per_second": 60.067, + "eval_steps_per_second": 1.882, + "step": 30950 + }, + { + "epoch": 1.65, + "learning_rate": 0.00018438521167793057, + "loss": 1.3528, + "step": 30960 + }, + { + "epoch": 1.66, + "learning_rate": 0.00018437447137575216, + "loss": 1.2902, + "step": 30980 + }, + { + "epoch": 1.66, + "learning_rate": 0.00018436373107357377, + "loss": 1.3044, + "step": 31000 + }, + { + "epoch": 1.66, + "eval_loss": 1.3926588296890259, + "eval_runtime": 49.8979, + "eval_samples_per_second": 60.123, + "eval_steps_per_second": 1.884, + "step": 31000 + }, + { + "epoch": 1.66, + "learning_rate": 0.00018435299077139536, + "loss": 1.33, + "step": 31020 + }, + { + "epoch": 1.66, + "learning_rate": 0.00018434225046921697, + "loss": 1.3354, + "step": 31040 + }, + { + "epoch": 1.66, + "eval_loss": 1.3930628299713135, + "eval_runtime": 49.9406, + "eval_samples_per_second": 60.071, + "eval_steps_per_second": 1.882, + "step": 31050 + }, + { + "epoch": 1.66, + "learning_rate": 0.00018433151016703856, + "loss": 1.3041, + "step": 31060 + }, + { + "epoch": 1.66, + "learning_rate": 0.00018432076986486015, + "loss": 1.3329, + "step": 31080 + }, + { + "epoch": 1.66, + "learning_rate": 0.00018431002956268176, + "loss": 1.2903, + "step": 31100 + }, + { + "epoch": 1.66, + "eval_loss": 1.393136739730835, + "eval_runtime": 49.9088, + "eval_samples_per_second": 60.11, + "eval_steps_per_second": 1.883, + "step": 31100 + }, + { + "epoch": 1.66, + "learning_rate": 0.00018429928926050335, + "loss": 1.3196, + "step": 31120 + }, + { + "epoch": 1.66, + "learning_rate": 0.00018428854895832496, + "loss": 1.3596, + "step": 31140 + }, + { + "epoch": 1.66, + "eval_loss": 1.3931183815002441, + "eval_runtime": 49.9578, + "eval_samples_per_second": 60.051, + "eval_steps_per_second": 1.882, + "step": 31150 + }, + { + "epoch": 1.66, + "learning_rate": 0.00018427780865614655, + "loss": 1.2731, + "step": 31160 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018426706835396813, + "loss": 1.3034, + "step": 31180 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018425632805178975, + "loss": 1.3271, + "step": 31200 + }, + { + "epoch": 1.67, + "eval_loss": 1.3934111595153809, + "eval_runtime": 49.8674, + "eval_samples_per_second": 60.16, + "eval_steps_per_second": 1.885, + "step": 31200 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018424558774961133, + "loss": 1.2989, + "step": 31220 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018423484744743295, + "loss": 1.3109, + "step": 31240 + }, + { + "epoch": 1.67, + "eval_loss": 1.3935177326202393, + "eval_runtime": 49.9074, + "eval_samples_per_second": 60.111, + "eval_steps_per_second": 1.883, + "step": 31250 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018422410714525453, + "loss": 1.3119, + "step": 31260 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018421336684307612, + "loss": 1.3452, + "step": 31280 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018420262654089773, + "loss": 1.3161, + "step": 31300 + }, + { + "epoch": 1.67, + "eval_loss": 1.3936994075775146, + "eval_runtime": 49.8829, + "eval_samples_per_second": 60.141, + "eval_steps_per_second": 1.884, + "step": 31300 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018419188623871932, + "loss": 1.3162, + "step": 31320 + }, + { + "epoch": 1.67, + "learning_rate": 0.00018418114593654093, + "loss": 1.3291, + "step": 31340 + }, + { + "epoch": 1.68, + "eval_loss": 1.3944883346557617, + "eval_runtime": 49.9276, + "eval_samples_per_second": 60.087, + "eval_steps_per_second": 1.883, + "step": 31350 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018417040563436255, + "loss": 1.3424, + "step": 31360 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018415966533218413, + "loss": 1.3338, + "step": 31380 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018414892503000575, + "loss": 1.337, + "step": 31400 + }, + { + "epoch": 1.68, + "eval_loss": 1.39397394657135, + "eval_runtime": 49.9112, + "eval_samples_per_second": 60.107, + "eval_steps_per_second": 1.883, + "step": 31400 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018413818472782733, + "loss": 1.3138, + "step": 31420 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018412744442564892, + "loss": 1.3364, + "step": 31440 + }, + { + "epoch": 1.68, + "eval_loss": 1.3931034803390503, + "eval_runtime": 49.952, + "eval_samples_per_second": 60.058, + "eval_steps_per_second": 1.882, + "step": 31450 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018411670412347053, + "loss": 1.2953, + "step": 31460 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018410596382129212, + "loss": 1.3017, + "step": 31480 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018409522351911373, + "loss": 1.349, + "step": 31500 + }, + { + "epoch": 1.68, + "eval_loss": 1.3928076028823853, + "eval_runtime": 49.926, + "eval_samples_per_second": 60.089, + "eval_steps_per_second": 1.883, + "step": 31500 + }, + { + "epoch": 1.68, + "learning_rate": 0.00018408448321693532, + "loss": 1.3271, + "step": 31520 + }, + { + "epoch": 1.69, + "learning_rate": 0.00018407374291475693, + "loss": 1.3007, + "step": 31540 + }, + { + "epoch": 1.69, + "eval_loss": 1.3933926820755005, + "eval_runtime": 49.9339, + "eval_samples_per_second": 60.079, + "eval_steps_per_second": 1.882, + "step": 31550 + }, + { + "epoch": 1.69, + "learning_rate": 0.00018406300261257852, + "loss": 1.2924, + "step": 31560 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001840522623104001, + "loss": 1.3428, + "step": 31580 + }, + { + "epoch": 1.69, + "learning_rate": 0.00018404152200822172, + "loss": 1.283, + "step": 31600 + }, + { + "epoch": 1.69, + "eval_loss": 1.3929836750030518, + "eval_runtime": 49.882, + "eval_samples_per_second": 60.142, + "eval_steps_per_second": 1.884, + "step": 31600 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001840307817060433, + "loss": 1.256, + "step": 31620 + }, + { + "epoch": 1.69, + "learning_rate": 0.00018402004140386492, + "loss": 1.3488, + "step": 31640 + }, + { + "epoch": 1.69, + "eval_loss": 1.3922340869903564, + "eval_runtime": 49.9187, + "eval_samples_per_second": 60.098, + "eval_steps_per_second": 1.883, + "step": 31650 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001840093011016865, + "loss": 1.3199, + "step": 31660 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001839985607995081, + "loss": 1.3469, + "step": 31680 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001839878204973297, + "loss": 1.3593, + "step": 31700 + }, + { + "epoch": 1.69, + "eval_loss": 1.3929144144058228, + "eval_runtime": 49.8826, + "eval_samples_per_second": 60.141, + "eval_steps_per_second": 1.884, + "step": 31700 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001839770801951513, + "loss": 1.3439, + "step": 31720 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001839668769080818, + "loss": 1.2887, + "step": 31740 + }, + { + "epoch": 1.7, + "eval_loss": 1.3930996656417847, + "eval_runtime": 49.9311, + "eval_samples_per_second": 60.083, + "eval_steps_per_second": 1.883, + "step": 31750 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001839561366059034, + "loss": 1.2978, + "step": 31760 + }, + { + "epoch": 1.7, + "learning_rate": 0.000183945396303725, + "loss": 1.3095, + "step": 31780 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001839346560015466, + "loss": 1.2788, + "step": 31800 + }, + { + "epoch": 1.7, + "eval_loss": 1.3926992416381836, + "eval_runtime": 49.8745, + "eval_samples_per_second": 60.151, + "eval_steps_per_second": 1.885, + "step": 31800 + }, + { + "epoch": 1.7, + "learning_rate": 0.00018392391569936823, + "loss": 1.3313, + "step": 31820 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001839131753971898, + "loss": 1.3414, + "step": 31840 + }, + { + "epoch": 1.7, + "eval_loss": 1.392523169517517, + "eval_runtime": 49.9178, + "eval_samples_per_second": 60.099, + "eval_steps_per_second": 1.883, + "step": 31850 + }, + { + "epoch": 1.7, + "learning_rate": 0.00018390243509501143, + "loss": 1.339, + "step": 31860 + }, + { + "epoch": 1.7, + "learning_rate": 0.000183891694792833, + "loss": 1.3102, + "step": 31880 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001838809544906546, + "loss": 1.3429, + "step": 31900 + }, + { + "epoch": 1.7, + "eval_loss": 1.3921146392822266, + "eval_runtime": 49.9058, + "eval_samples_per_second": 60.113, + "eval_steps_per_second": 1.884, + "step": 31900 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001838702141884762, + "loss": 1.2851, + "step": 31920 + }, + { + "epoch": 1.71, + "learning_rate": 0.00018386001090140672, + "loss": 1.3249, + "step": 31940 + }, + { + "epoch": 1.71, + "eval_loss": 1.3931465148925781, + "eval_runtime": 49.9509, + "eval_samples_per_second": 60.059, + "eval_steps_per_second": 1.882, + "step": 31950 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001838492705992283, + "loss": 1.3222, + "step": 31960 + }, + { + "epoch": 1.71, + "learning_rate": 0.00018383853029704992, + "loss": 1.3342, + "step": 31980 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001838277899948715, + "loss": 1.3299, + "step": 32000 + }, + { + "epoch": 1.71, + "eval_loss": 1.3919731378555298, + "eval_runtime": 49.8835, + "eval_samples_per_second": 60.14, + "eval_steps_per_second": 1.884, + "step": 32000 + }, + { + "epoch": 1.71, + "learning_rate": 0.00018381704969269312, + "loss": 1.3588, + "step": 32020 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001838063093905147, + "loss": 1.3282, + "step": 32040 + }, + { + "epoch": 1.71, + "eval_loss": 1.3922117948532104, + "eval_runtime": 49.9432, + "eval_samples_per_second": 60.068, + "eval_steps_per_second": 1.882, + "step": 32050 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001837955690883363, + "loss": 1.3059, + "step": 32060 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001837848287861579, + "loss": 1.3546, + "step": 32080 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001837740884839795, + "loss": 1.3122, + "step": 32100 + }, + { + "epoch": 1.72, + "eval_loss": 1.3919049501419067, + "eval_runtime": 49.8831, + "eval_samples_per_second": 60.141, + "eval_steps_per_second": 1.884, + "step": 32100 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001837633481818011, + "loss": 1.3176, + "step": 32120 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001837526078796227, + "loss": 1.3534, + "step": 32140 + }, + { + "epoch": 1.72, + "eval_loss": 1.3918403387069702, + "eval_runtime": 49.906, + "eval_samples_per_second": 60.113, + "eval_steps_per_second": 1.884, + "step": 32150 + }, + { + "epoch": 1.72, + "learning_rate": 0.00018374186757744428, + "loss": 1.3272, + "step": 32160 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001837311272752659, + "loss": 1.3243, + "step": 32180 + }, + { + "epoch": 1.72, + "learning_rate": 0.00018372038697308748, + "loss": 1.3227, + "step": 32200 + }, + { + "epoch": 1.72, + "eval_loss": 1.392651081085205, + "eval_runtime": 49.8869, + "eval_samples_per_second": 60.136, + "eval_steps_per_second": 1.884, + "step": 32200 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001837096466709091, + "loss": 1.2965, + "step": 32220 + }, + { + "epoch": 1.72, + "learning_rate": 0.00018369890636873068, + "loss": 1.2821, + "step": 32240 + }, + { + "epoch": 1.72, + "eval_loss": 1.393078088760376, + "eval_runtime": 49.917, + "eval_samples_per_second": 60.1, + "eval_steps_per_second": 1.883, + "step": 32250 + }, + { + "epoch": 1.72, + "learning_rate": 0.00018368816606655227, + "loss": 1.3525, + "step": 32260 + }, + { + "epoch": 1.72, + "learning_rate": 0.0001836774257643739, + "loss": 1.3203, + "step": 32280 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001836666854621955, + "loss": 1.2915, + "step": 32300 + }, + { + "epoch": 1.73, + "eval_loss": 1.392730951309204, + "eval_runtime": 49.9003, + "eval_samples_per_second": 60.12, + "eval_steps_per_second": 1.884, + "step": 32300 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001836559451600171, + "loss": 1.2898, + "step": 32320 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001836452048578387, + "loss": 1.3075, + "step": 32340 + }, + { + "epoch": 1.73, + "eval_loss": 1.3931732177734375, + "eval_runtime": 49.939, + "eval_samples_per_second": 60.073, + "eval_steps_per_second": 1.882, + "step": 32350 + }, + { + "epoch": 1.73, + "learning_rate": 0.00018363446455566028, + "loss": 1.3341, + "step": 32360 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001836237242534819, + "loss": 1.2627, + "step": 32380 + }, + { + "epoch": 1.73, + "learning_rate": 0.00018361298395130348, + "loss": 1.3147, + "step": 32400 + }, + { + "epoch": 1.73, + "eval_loss": 1.3935468196868896, + "eval_runtime": 49.9064, + "eval_samples_per_second": 60.113, + "eval_steps_per_second": 1.884, + "step": 32400 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001836022436491251, + "loss": 1.3023, + "step": 32420 + }, + { + "epoch": 1.73, + "learning_rate": 0.00018359150334694668, + "loss": 1.3501, + "step": 32440 + }, + { + "epoch": 1.73, + "eval_loss": 1.394555687904358, + "eval_runtime": 49.9489, + "eval_samples_per_second": 60.061, + "eval_steps_per_second": 1.882, + "step": 32450 + }, + { + "epoch": 1.73, + "learning_rate": 0.00018358076304476827, + "loss": 1.3262, + "step": 32460 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018357002274258988, + "loss": 1.3019, + "step": 32480 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018355928244041147, + "loss": 1.3435, + "step": 32500 + }, + { + "epoch": 1.74, + "eval_loss": 1.393541693687439, + "eval_runtime": 49.8892, + "eval_samples_per_second": 60.133, + "eval_steps_per_second": 1.884, + "step": 32500 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018354854213823308, + "loss": 1.3309, + "step": 32520 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018353780183605467, + "loss": 1.3178, + "step": 32540 + }, + { + "epoch": 1.74, + "eval_loss": 1.393633246421814, + "eval_runtime": 49.9132, + "eval_samples_per_second": 60.104, + "eval_steps_per_second": 1.883, + "step": 32550 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018352706153387625, + "loss": 1.3195, + "step": 32560 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018351632123169787, + "loss": 1.3246, + "step": 32580 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018350558092951945, + "loss": 1.334, + "step": 32600 + }, + { + "epoch": 1.74, + "eval_loss": 1.3931056261062622, + "eval_runtime": 49.9067, + "eval_samples_per_second": 60.112, + "eval_steps_per_second": 1.884, + "step": 32600 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018349484062734107, + "loss": 1.3078, + "step": 32620 + }, + { + "epoch": 1.74, + "learning_rate": 0.00018348410032516265, + "loss": 1.3127, + "step": 32640 + }, + { + "epoch": 1.74, + "eval_loss": 1.3928685188293457, + "eval_runtime": 49.9522, + "eval_samples_per_second": 60.057, + "eval_steps_per_second": 1.882, + "step": 32650 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018347336002298424, + "loss": 1.335, + "step": 32660 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018346261972080585, + "loss": 1.3122, + "step": 32680 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018345187941862744, + "loss": 1.3307, + "step": 32700 + }, + { + "epoch": 1.75, + "eval_loss": 1.3928115367889404, + "eval_runtime": 49.8507, + "eval_samples_per_second": 60.18, + "eval_steps_per_second": 1.886, + "step": 32700 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018344113911644905, + "loss": 1.3359, + "step": 32720 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018343039881427064, + "loss": 1.3388, + "step": 32740 + }, + { + "epoch": 1.75, + "eval_loss": 1.394007921218872, + "eval_runtime": 49.8839, + "eval_samples_per_second": 60.14, + "eval_steps_per_second": 1.884, + "step": 32750 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018341965851209225, + "loss": 1.2486, + "step": 32760 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018340891820991384, + "loss": 1.3449, + "step": 32780 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018339817790773543, + "loss": 1.3296, + "step": 32800 + }, + { + "epoch": 1.75, + "eval_loss": 1.3931111097335815, + "eval_runtime": 49.8634, + "eval_samples_per_second": 60.164, + "eval_steps_per_second": 1.885, + "step": 32800 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018338743760555704, + "loss": 1.3501, + "step": 32820 + }, + { + "epoch": 1.75, + "learning_rate": 0.00018337669730337863, + "loss": 1.321, + "step": 32840 + }, + { + "epoch": 1.76, + "eval_loss": 1.3937820196151733, + "eval_runtime": 49.9247, + "eval_samples_per_second": 60.09, + "eval_steps_per_second": 1.883, + "step": 32850 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018336595700120024, + "loss": 1.3138, + "step": 32860 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018335521669902183, + "loss": 1.3292, + "step": 32880 + }, + { + "epoch": 1.76, + "learning_rate": 0.0001833444763968434, + "loss": 1.2905, + "step": 32900 + }, + { + "epoch": 1.76, + "eval_loss": 1.3924466371536255, + "eval_runtime": 49.9006, + "eval_samples_per_second": 60.12, + "eval_steps_per_second": 1.884, + "step": 32900 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018333373609466505, + "loss": 1.3164, + "step": 32920 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018332299579248664, + "loss": 1.3677, + "step": 32940 + }, + { + "epoch": 1.76, + "eval_loss": 1.3932099342346191, + "eval_runtime": 49.9247, + "eval_samples_per_second": 60.09, + "eval_steps_per_second": 1.883, + "step": 32950 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018331225549030823, + "loss": 1.2867, + "step": 32960 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018330151518812984, + "loss": 1.302, + "step": 32980 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018329077488595143, + "loss": 1.3096, + "step": 33000 + }, + { + "epoch": 1.76, + "eval_loss": 1.3928850889205933, + "eval_runtime": 49.889, + "eval_samples_per_second": 60.133, + "eval_steps_per_second": 1.884, + "step": 33000 + }, + { + "epoch": 1.76, + "learning_rate": 0.00018328003458377304, + "loss": 1.3185, + "step": 33020 + }, + { + "epoch": 1.77, + "learning_rate": 0.00018326929428159463, + "loss": 1.325, + "step": 33040 + }, + { + "epoch": 1.77, + "eval_loss": 1.3931729793548584, + "eval_runtime": 49.9041, + "eval_samples_per_second": 60.115, + "eval_steps_per_second": 1.884, + "step": 33050 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001832585539794162, + "loss": 1.2995, + "step": 33060 + }, + { + "epoch": 1.77, + "learning_rate": 0.00018324781367723783, + "loss": 1.3011, + "step": 33080 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001832370733750594, + "loss": 1.3019, + "step": 33100 + }, + { + "epoch": 1.77, + "eval_loss": 1.3940869569778442, + "eval_runtime": 49.8445, + "eval_samples_per_second": 60.187, + "eval_steps_per_second": 1.886, + "step": 33100 + }, + { + "epoch": 1.77, + "learning_rate": 0.00018322633307288103, + "loss": 1.3061, + "step": 33120 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001832155927707026, + "loss": 1.329, + "step": 33140 + }, + { + "epoch": 1.77, + "eval_loss": 1.393612265586853, + "eval_runtime": 49.8664, + "eval_samples_per_second": 60.161, + "eval_steps_per_second": 1.885, + "step": 33150 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001832048524685242, + "loss": 1.3326, + "step": 33160 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001831941121663458, + "loss": 1.343, + "step": 33180 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001831833718641674, + "loss": 1.3023, + "step": 33200 + }, + { + "epoch": 1.77, + "eval_loss": 1.393221139907837, + "eval_runtime": 49.7506, + "eval_samples_per_second": 60.301, + "eval_steps_per_second": 1.889, + "step": 33200 + }, + { + "epoch": 1.77, + "learning_rate": 0.000183172631561989, + "loss": 1.3442, + "step": 33220 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001831618912598106, + "loss": 1.3143, + "step": 33240 + }, + { + "epoch": 1.78, + "eval_loss": 1.3926904201507568, + "eval_runtime": 49.9123, + "eval_samples_per_second": 60.105, + "eval_steps_per_second": 1.883, + "step": 33250 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001831511509576322, + "loss": 1.3522, + "step": 33260 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001831404106554538, + "loss": 1.3147, + "step": 33280 + }, + { + "epoch": 1.78, + "learning_rate": 0.00018312967035327539, + "loss": 1.3086, + "step": 33300 + }, + { + "epoch": 1.78, + "eval_loss": 1.3926082849502563, + "eval_runtime": 49.9413, + "eval_samples_per_second": 60.071, + "eval_steps_per_second": 1.882, + "step": 33300 + }, + { + "epoch": 1.78, + "learning_rate": 0.000183118930051097, + "loss": 1.3294, + "step": 33320 + }, + { + "epoch": 1.78, + "learning_rate": 0.00018310818974891859, + "loss": 1.3597, + "step": 33340 + }, + { + "epoch": 1.78, + "eval_loss": 1.3932656049728394, + "eval_runtime": 49.9378, + "eval_samples_per_second": 60.075, + "eval_steps_per_second": 1.882, + "step": 33350 + }, + { + "epoch": 1.78, + "learning_rate": 0.0001830974494467402, + "loss": 1.2729, + "step": 33360 + }, + { + "epoch": 1.78, + "learning_rate": 0.00018308670914456179, + "loss": 1.3344, + "step": 33380 + }, + { + "epoch": 1.78, + "learning_rate": 0.00018307596884238337, + "loss": 1.3442, + "step": 33400 + }, + { + "epoch": 1.78, + "eval_loss": 1.39275062084198, + "eval_runtime": 49.874, + "eval_samples_per_second": 60.152, + "eval_steps_per_second": 1.885, + "step": 33400 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018306522854020499, + "loss": 1.3094, + "step": 33420 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018305448823802657, + "loss": 1.315, + "step": 33440 + }, + { + "epoch": 1.79, + "eval_loss": 1.392531394958496, + "eval_runtime": 49.9096, + "eval_samples_per_second": 60.109, + "eval_steps_per_second": 1.883, + "step": 33450 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018304374793584819, + "loss": 1.3088, + "step": 33460 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018303300763366977, + "loss": 1.3537, + "step": 33480 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018302226733149136, + "loss": 1.3193, + "step": 33500 + }, + { + "epoch": 1.79, + "eval_loss": 1.391752004623413, + "eval_runtime": 49.9064, + "eval_samples_per_second": 60.113, + "eval_steps_per_second": 1.884, + "step": 33500 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018301152702931297, + "loss": 1.305, + "step": 33520 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018300078672713459, + "loss": 1.3429, + "step": 33540 + }, + { + "epoch": 1.79, + "eval_loss": 1.3929482698440552, + "eval_runtime": 49.9527, + "eval_samples_per_second": 60.057, + "eval_steps_per_second": 1.882, + "step": 33550 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018299004642495617, + "loss": 1.2902, + "step": 33560 + }, + { + "epoch": 1.79, + "learning_rate": 0.00018297930612277779, + "loss": 1.3509, + "step": 33580 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018296856582059937, + "loss": 1.298, + "step": 33600 + }, + { + "epoch": 1.8, + "eval_loss": 1.3924405574798584, + "eval_runtime": 49.8811, + "eval_samples_per_second": 60.143, + "eval_steps_per_second": 1.884, + "step": 33600 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018295782551842099, + "loss": 1.3535, + "step": 33620 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018294708521624257, + "loss": 1.3262, + "step": 33640 + }, + { + "epoch": 1.8, + "eval_loss": 1.3920501470565796, + "eval_runtime": 49.9199, + "eval_samples_per_second": 60.096, + "eval_steps_per_second": 1.883, + "step": 33650 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018293634491406419, + "loss": 1.3319, + "step": 33660 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018292560461188577, + "loss": 1.3148, + "step": 33680 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018291486430970736, + "loss": 1.3394, + "step": 33700 + }, + { + "epoch": 1.8, + "eval_loss": 1.392612338066101, + "eval_runtime": 49.9021, + "eval_samples_per_second": 60.118, + "eval_steps_per_second": 1.884, + "step": 33700 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018290412400752897, + "loss": 1.3368, + "step": 33720 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018289338370535056, + "loss": 1.3229, + "step": 33740 + }, + { + "epoch": 1.8, + "eval_loss": 1.392616868019104, + "eval_runtime": 49.9073, + "eval_samples_per_second": 60.112, + "eval_steps_per_second": 1.883, + "step": 33750 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018288264340317217, + "loss": 1.2909, + "step": 33760 + }, + { + "epoch": 1.8, + "learning_rate": 0.00018287190310099376, + "loss": 1.3625, + "step": 33780 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018286116279881535, + "loss": 1.2939, + "step": 33800 + }, + { + "epoch": 1.81, + "eval_loss": 1.3921815156936646, + "eval_runtime": 49.9158, + "eval_samples_per_second": 60.101, + "eval_steps_per_second": 1.883, + "step": 33800 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018285042249663696, + "loss": 1.3279, + "step": 33820 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018283968219445855, + "loss": 1.3285, + "step": 33840 + }, + { + "epoch": 1.81, + "eval_loss": 1.3925936222076416, + "eval_runtime": 49.9425, + "eval_samples_per_second": 60.069, + "eval_steps_per_second": 1.882, + "step": 33850 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018282894189228016, + "loss": 1.2904, + "step": 33860 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018281820159010175, + "loss": 1.3316, + "step": 33880 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018280746128792333, + "loss": 1.2796, + "step": 33900 + }, + { + "epoch": 1.81, + "eval_loss": 1.3928660154342651, + "eval_runtime": 49.915, + "eval_samples_per_second": 60.102, + "eval_steps_per_second": 1.883, + "step": 33900 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018279672098574495, + "loss": 1.3583, + "step": 33920 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018278598068356653, + "loss": 1.3029, + "step": 33940 + }, + { + "epoch": 1.81, + "eval_loss": 1.3938392400741577, + "eval_runtime": 49.9412, + "eval_samples_per_second": 60.071, + "eval_steps_per_second": 1.882, + "step": 33950 + }, + { + "epoch": 1.81, + "learning_rate": 0.00018277524038138815, + "loss": 1.2918, + "step": 33960 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018276450007920973, + "loss": 1.2994, + "step": 33980 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018275375977703132, + "loss": 1.3251, + "step": 34000 + }, + { + "epoch": 1.82, + "eval_loss": 1.3923838138580322, + "eval_runtime": 49.9075, + "eval_samples_per_second": 60.111, + "eval_steps_per_second": 1.883, + "step": 34000 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018274301947485293, + "loss": 1.3138, + "step": 34020 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018273227917267452, + "loss": 1.2666, + "step": 34040 + }, + { + "epoch": 1.82, + "eval_loss": 1.3924466371536255, + "eval_runtime": 49.9932, + "eval_samples_per_second": 60.008, + "eval_steps_per_second": 1.88, + "step": 34050 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018272153887049613, + "loss": 1.3173, + "step": 34060 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018271079856831772, + "loss": 1.3349, + "step": 34080 + }, + { + "epoch": 1.82, + "learning_rate": 0.0001827000582661393, + "loss": 1.3318, + "step": 34100 + }, + { + "epoch": 1.82, + "eval_loss": 1.3935117721557617, + "eval_runtime": 49.9078, + "eval_samples_per_second": 60.111, + "eval_steps_per_second": 1.883, + "step": 34100 + }, + { + "epoch": 1.82, + "learning_rate": 0.00018268931796396092, + "loss": 1.2808, + "step": 34120 + }, + { + "epoch": 1.82, + "learning_rate": 0.0001826785776617825, + "loss": 1.3121, + "step": 34140 + }, + { + "epoch": 1.82, + "eval_loss": 1.3931491374969482, + "eval_runtime": 49.9165, + "eval_samples_per_second": 60.1, + "eval_steps_per_second": 1.883, + "step": 34150 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018266837437471304, + "loss": 1.3206, + "step": 34160 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018265763407253465, + "loss": 1.2922, + "step": 34180 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018264689377035624, + "loss": 1.351, + "step": 34200 + }, + { + "epoch": 1.83, + "eval_loss": 1.3933879137039185, + "eval_runtime": 49.872, + "eval_samples_per_second": 60.154, + "eval_steps_per_second": 1.885, + "step": 34200 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018263615346817783, + "loss": 1.3538, + "step": 34220 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018262541316599944, + "loss": 1.3316, + "step": 34240 + }, + { + "epoch": 1.83, + "eval_loss": 1.393222689628601, + "eval_runtime": 49.8959, + "eval_samples_per_second": 60.125, + "eval_steps_per_second": 1.884, + "step": 34250 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018261467286382103, + "loss": 1.3685, + "step": 34260 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018260393256164264, + "loss": 1.2776, + "step": 34280 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018259319225946423, + "loss": 1.2839, + "step": 34300 + }, + { + "epoch": 1.83, + "eval_loss": 1.3942714929580688, + "eval_runtime": 49.8823, + "eval_samples_per_second": 60.142, + "eval_steps_per_second": 1.884, + "step": 34300 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018258245195728584, + "loss": 1.3426, + "step": 34320 + }, + { + "epoch": 1.83, + "learning_rate": 0.00018257171165510743, + "loss": 1.3647, + "step": 34340 + }, + { + "epoch": 1.84, + "eval_loss": 1.393857479095459, + "eval_runtime": 49.9322, + "eval_samples_per_second": 60.081, + "eval_steps_per_second": 1.883, + "step": 34350 + }, + { + "epoch": 1.84, + "learning_rate": 0.000182560971352929, + "loss": 1.2837, + "step": 34360 + }, + { + "epoch": 1.84, + "learning_rate": 0.00018255023105075063, + "loss": 1.3124, + "step": 34380 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001825394907485722, + "loss": 1.3185, + "step": 34400 + }, + { + "epoch": 1.84, + "eval_loss": 1.3932018280029297, + "eval_runtime": 49.8908, + "eval_samples_per_second": 60.131, + "eval_steps_per_second": 1.884, + "step": 34400 + }, + { + "epoch": 1.84, + "learning_rate": 0.00018252875044639383, + "loss": 1.2794, + "step": 34420 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001825180101442154, + "loss": 1.3282, + "step": 34440 + }, + { + "epoch": 1.84, + "eval_loss": 1.3929858207702637, + "eval_runtime": 49.9429, + "eval_samples_per_second": 60.069, + "eval_steps_per_second": 1.882, + "step": 34450 + }, + { + "epoch": 1.84, + "learning_rate": 0.000182507269842037, + "loss": 1.306, + "step": 34460 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001824965295398586, + "loss": 1.2997, + "step": 34480 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001824857892376802, + "loss": 1.3054, + "step": 34500 + }, + { + "epoch": 1.84, + "eval_loss": 1.3928805589675903, + "eval_runtime": 49.8978, + "eval_samples_per_second": 60.123, + "eval_steps_per_second": 1.884, + "step": 34500 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001824750489355018, + "loss": 1.2943, + "step": 34520 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001824643086333234, + "loss": 1.3219, + "step": 34540 + }, + { + "epoch": 1.85, + "eval_loss": 1.3927184343338013, + "eval_runtime": 49.9209, + "eval_samples_per_second": 60.095, + "eval_steps_per_second": 1.883, + "step": 34550 + }, + { + "epoch": 1.85, + "learning_rate": 0.00018245356833114499, + "loss": 1.3242, + "step": 34560 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001824428280289666, + "loss": 1.3142, + "step": 34580 + }, + { + "epoch": 1.85, + "learning_rate": 0.00018243208772678819, + "loss": 1.3121, + "step": 34600 + }, + { + "epoch": 1.85, + "eval_loss": 1.39323890209198, + "eval_runtime": 49.894, + "eval_samples_per_second": 60.128, + "eval_steps_per_second": 1.884, + "step": 34600 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001824213474246098, + "loss": 1.3113, + "step": 34620 + }, + { + "epoch": 1.85, + "learning_rate": 0.00018241060712243139, + "loss": 1.3263, + "step": 34640 + }, + { + "epoch": 1.85, + "eval_loss": 1.393105149269104, + "eval_runtime": 49.9328, + "eval_samples_per_second": 60.081, + "eval_steps_per_second": 1.883, + "step": 34650 + }, + { + "epoch": 1.85, + "learning_rate": 0.000182399866820253, + "loss": 1.3155, + "step": 34660 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001823891265180746, + "loss": 1.3247, + "step": 34680 + }, + { + "epoch": 1.85, + "learning_rate": 0.0001823783862158962, + "loss": 1.332, + "step": 34700 + }, + { + "epoch": 1.85, + "eval_loss": 1.3934316635131836, + "eval_runtime": 49.9008, + "eval_samples_per_second": 60.119, + "eval_steps_per_second": 1.884, + "step": 34700 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018236764591371779, + "loss": 1.3021, + "step": 34720 + }, + { + "epoch": 1.86, + "learning_rate": 0.0001823569056115394, + "loss": 1.3075, + "step": 34740 + }, + { + "epoch": 1.86, + "eval_loss": 1.3924082517623901, + "eval_runtime": 49.9454, + "eval_samples_per_second": 60.066, + "eval_steps_per_second": 1.882, + "step": 34750 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018234616530936099, + "loss": 1.3134, + "step": 34760 + }, + { + "epoch": 1.86, + "learning_rate": 0.0001823354250071826, + "loss": 1.3305, + "step": 34780 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018232468470500419, + "loss": 1.292, + "step": 34800 + }, + { + "epoch": 1.86, + "eval_loss": 1.3928916454315186, + "eval_runtime": 49.9539, + "eval_samples_per_second": 60.055, + "eval_steps_per_second": 1.882, + "step": 34800 + }, + { + "epoch": 1.86, + "learning_rate": 0.0001823139444028258, + "loss": 1.3188, + "step": 34820 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018230320410064739, + "loss": 1.2984, + "step": 34840 + }, + { + "epoch": 1.86, + "eval_loss": 1.3925448656082153, + "eval_runtime": 49.9779, + "eval_samples_per_second": 60.027, + "eval_steps_per_second": 1.881, + "step": 34850 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018229246379846897, + "loss": 1.3219, + "step": 34860 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018228172349629059, + "loss": 1.3085, + "step": 34880 + }, + { + "epoch": 1.86, + "learning_rate": 0.00018227098319411217, + "loss": 1.2877, + "step": 34900 + }, + { + "epoch": 1.86, + "eval_loss": 1.392621397972107, + "eval_runtime": 49.9427, + "eval_samples_per_second": 60.069, + "eval_steps_per_second": 1.882, + "step": 34900 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018226024289193379, + "loss": 1.3283, + "step": 34920 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018224950258975537, + "loss": 1.3525, + "step": 34940 + }, + { + "epoch": 1.87, + "eval_loss": 1.3935717344284058, + "eval_runtime": 49.9434, + "eval_samples_per_second": 60.068, + "eval_steps_per_second": 1.882, + "step": 34950 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018223876228757696, + "loss": 1.3074, + "step": 34960 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018222802198539857, + "loss": 1.321, + "step": 34980 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018221728168322016, + "loss": 1.2963, + "step": 35000 + }, + { + "epoch": 1.87, + "eval_loss": 1.393733024597168, + "eval_runtime": 49.896, + "eval_samples_per_second": 60.125, + "eval_steps_per_second": 1.884, + "step": 35000 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018220654138104177, + "loss": 1.3222, + "step": 35020 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018219580107886336, + "loss": 1.3408, + "step": 35040 + }, + { + "epoch": 1.87, + "eval_loss": 1.3933889865875244, + "eval_runtime": 49.9267, + "eval_samples_per_second": 60.088, + "eval_steps_per_second": 1.883, + "step": 35050 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018218506077668495, + "loss": 1.2484, + "step": 35060 + }, + { + "epoch": 1.87, + "learning_rate": 0.00018217432047450656, + "loss": 1.3439, + "step": 35080 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018216358017232815, + "loss": 1.303, + "step": 35100 + }, + { + "epoch": 1.88, + "eval_loss": 1.3932467699050903, + "eval_runtime": 49.9093, + "eval_samples_per_second": 60.109, + "eval_steps_per_second": 1.883, + "step": 35100 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018215283987014976, + "loss": 1.3138, + "step": 35120 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018214209956797135, + "loss": 1.3562, + "step": 35140 + }, + { + "epoch": 1.88, + "eval_loss": 1.3933935165405273, + "eval_runtime": 49.9354, + "eval_samples_per_second": 60.078, + "eval_steps_per_second": 1.882, + "step": 35150 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018213135926579293, + "loss": 1.3609, + "step": 35160 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018212061896361455, + "loss": 1.305, + "step": 35180 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018210987866143613, + "loss": 1.265, + "step": 35200 + }, + { + "epoch": 1.88, + "eval_loss": 1.393653154373169, + "eval_runtime": 49.9016, + "eval_samples_per_second": 60.118, + "eval_steps_per_second": 1.884, + "step": 35200 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018209913835925775, + "loss": 1.339, + "step": 35220 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018208839805707933, + "loss": 1.3163, + "step": 35240 + }, + { + "epoch": 1.88, + "eval_loss": 1.3932678699493408, + "eval_runtime": 49.9449, + "eval_samples_per_second": 60.066, + "eval_steps_per_second": 1.882, + "step": 35250 + }, + { + "epoch": 1.88, + "learning_rate": 0.00018207765775490095, + "loss": 1.3401, + "step": 35260 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018206691745272256, + "loss": 1.3001, + "step": 35280 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018205617715054415, + "loss": 1.2924, + "step": 35300 + }, + { + "epoch": 1.89, + "eval_loss": 1.393690586090088, + "eval_runtime": 49.912, + "eval_samples_per_second": 60.106, + "eval_steps_per_second": 1.883, + "step": 35300 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018204543684836576, + "loss": 1.2857, + "step": 35320 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018203469654618735, + "loss": 1.3263, + "step": 35340 + }, + { + "epoch": 1.89, + "eval_loss": 1.3942912817001343, + "eval_runtime": 49.8166, + "eval_samples_per_second": 60.221, + "eval_steps_per_second": 1.887, + "step": 35350 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018202395624400893, + "loss": 1.2991, + "step": 35360 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018201321594183055, + "loss": 1.2858, + "step": 35380 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018200247563965213, + "loss": 1.2958, + "step": 35400 + }, + { + "epoch": 1.89, + "eval_loss": 1.3931233882904053, + "eval_runtime": 49.8931, + "eval_samples_per_second": 60.129, + "eval_steps_per_second": 1.884, + "step": 35400 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018199173533747375, + "loss": 1.3178, + "step": 35420 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018198099503529533, + "loss": 1.3069, + "step": 35440 + }, + { + "epoch": 1.89, + "eval_loss": 1.3929921388626099, + "eval_runtime": 49.9464, + "eval_samples_per_second": 60.064, + "eval_steps_per_second": 1.882, + "step": 35450 + }, + { + "epoch": 1.89, + "learning_rate": 0.00018197025473311692, + "loss": 1.3325, + "step": 35460 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018195951443093853, + "loss": 1.2918, + "step": 35480 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018194877412876012, + "loss": 1.2896, + "step": 35500 + }, + { + "epoch": 1.9, + "eval_loss": 1.393085241317749, + "eval_runtime": 49.8964, + "eval_samples_per_second": 60.125, + "eval_steps_per_second": 1.884, + "step": 35500 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018193803382658173, + "loss": 1.3011, + "step": 35520 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018192729352440332, + "loss": 1.2965, + "step": 35540 + }, + { + "epoch": 1.9, + "eval_loss": 1.3940566778182983, + "eval_runtime": 49.9327, + "eval_samples_per_second": 60.081, + "eval_steps_per_second": 1.883, + "step": 35550 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001819165532222249, + "loss": 1.3188, + "step": 35560 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018190581292004652, + "loss": 1.3119, + "step": 35580 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001818950726178681, + "loss": 1.3274, + "step": 35600 + }, + { + "epoch": 1.9, + "eval_loss": 1.3926782608032227, + "eval_runtime": 49.9185, + "eval_samples_per_second": 60.098, + "eval_steps_per_second": 1.883, + "step": 35600 + }, + { + "epoch": 1.9, + "learning_rate": 0.00018188433231568972, + "loss": 1.348, + "step": 35620 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001818735920135113, + "loss": 1.3444, + "step": 35640 + }, + { + "epoch": 1.9, + "eval_loss": 1.3924182653427124, + "eval_runtime": 49.9557, + "eval_samples_per_second": 60.053, + "eval_steps_per_second": 1.882, + "step": 35650 + }, + { + "epoch": 1.91, + "learning_rate": 0.0001818628517113329, + "loss": 1.2899, + "step": 35660 + }, + { + "epoch": 1.91, + "learning_rate": 0.0001818521114091545, + "loss": 1.2777, + "step": 35680 + }, + { + "epoch": 1.91, + "learning_rate": 0.0001818413711069761, + "loss": 1.3048, + "step": 35700 + }, + { + "epoch": 1.91, + "eval_loss": 1.393597960472107, + "eval_runtime": 49.9014, + "eval_samples_per_second": 60.119, + "eval_steps_per_second": 1.884, + "step": 35700 + }, + { + "epoch": 1.91, + "learning_rate": 0.0001818306308047977, + "loss": 1.3098, + "step": 35720 + }, + { + "epoch": 1.91, + "learning_rate": 0.0001818198905026193, + "loss": 1.3244, + "step": 35740 + }, + { + "epoch": 1.91, + "eval_loss": 1.3939756155014038, + "eval_runtime": 49.9147, + "eval_samples_per_second": 60.103, + "eval_steps_per_second": 1.883, + "step": 35750 + }, + { + "epoch": 1.91, + "learning_rate": 0.0001818091502004409, + "loss": 1.3246, + "step": 35760 + }, + { + "epoch": 1.91, + "learning_rate": 0.0001817984098982625, + "loss": 1.3302, + "step": 35780 + }, + { + "epoch": 1.91, + "learning_rate": 0.00018178766959608408, + "loss": 1.3189, + "step": 35800 + }, + { + "epoch": 1.91, + "eval_loss": 1.393617868423462, + "eval_runtime": 49.9014, + "eval_samples_per_second": 60.119, + "eval_steps_per_second": 1.884, + "step": 35800 + }, + { + "epoch": 1.91, + "learning_rate": 0.0001817769292939057, + "loss": 1.2832, + "step": 35820 + }, + { + "epoch": 1.91, + "learning_rate": 0.00018176618899172728, + "loss": 1.3016, + "step": 35840 + }, + { + "epoch": 1.92, + "eval_loss": 1.3936024904251099, + "eval_runtime": 49.9571, + "eval_samples_per_second": 60.052, + "eval_steps_per_second": 1.882, + "step": 35850 + }, + { + "epoch": 1.92, + "learning_rate": 0.0001817554486895489, + "loss": 1.3014, + "step": 35860 + }, + { + "epoch": 1.92, + "learning_rate": 0.00018174470838737048, + "loss": 1.2865, + "step": 35880 + }, + { + "epoch": 1.92, + "learning_rate": 0.00018173396808519206, + "loss": 1.2856, + "step": 35900 + }, + { + "epoch": 1.92, + "eval_loss": 1.393366813659668, + "eval_runtime": 49.8692, + "eval_samples_per_second": 60.157, + "eval_steps_per_second": 1.885, + "step": 35900 + }, + { + "epoch": 1.92, + "learning_rate": 0.0001817232277830137, + "loss": 1.3443, + "step": 35920 + }, + { + "epoch": 1.92, + "learning_rate": 0.0001817124874808353, + "loss": 1.3438, + "step": 35940 + }, + { + "epoch": 1.92, + "eval_loss": 1.3926905393600464, + "eval_runtime": 49.8999, + "eval_samples_per_second": 60.12, + "eval_steps_per_second": 1.884, + "step": 35950 + }, + { + "epoch": 1.92, + "learning_rate": 0.00018170174717865688, + "loss": 1.3326, + "step": 35960 + }, + { + "epoch": 1.92, + "learning_rate": 0.0001816910068764785, + "loss": 1.2869, + "step": 35980 + }, + { + "epoch": 1.92, + "learning_rate": 0.00018168026657430008, + "loss": 1.3337, + "step": 36000 + }, + { + "epoch": 1.92, + "eval_loss": 1.3924248218536377, + "eval_runtime": 49.869, + "eval_samples_per_second": 60.158, + "eval_steps_per_second": 1.885, + "step": 36000 + }, + { + "epoch": 1.92, + "learning_rate": 0.0001816695262721217, + "loss": 1.3459, + "step": 36020 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018165878596994328, + "loss": 1.3044, + "step": 36040 + }, + { + "epoch": 1.93, + "eval_loss": 1.3930872678756714, + "eval_runtime": 49.9165, + "eval_samples_per_second": 60.1, + "eval_steps_per_second": 1.883, + "step": 36050 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018164804566776486, + "loss": 1.3095, + "step": 36060 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018163730536558648, + "loss": 1.3382, + "step": 36080 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018162656506340806, + "loss": 1.3207, + "step": 36100 + }, + { + "epoch": 1.93, + "eval_loss": 1.3932032585144043, + "eval_runtime": 49.8839, + "eval_samples_per_second": 60.14, + "eval_steps_per_second": 1.884, + "step": 36100 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018161582476122968, + "loss": 1.3144, + "step": 36120 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018160508445905126, + "loss": 1.3294, + "step": 36140 + }, + { + "epoch": 1.93, + "eval_loss": 1.392491102218628, + "eval_runtime": 49.925, + "eval_samples_per_second": 60.09, + "eval_steps_per_second": 1.883, + "step": 36150 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018159434415687288, + "loss": 1.3302, + "step": 36160 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018158360385469446, + "loss": 1.3235, + "step": 36180 + }, + { + "epoch": 1.93, + "learning_rate": 0.00018157286355251605, + "loss": 1.3333, + "step": 36200 + }, + { + "epoch": 1.93, + "eval_loss": 1.3923717737197876, + "eval_runtime": 49.912, + "eval_samples_per_second": 60.106, + "eval_steps_per_second": 1.883, + "step": 36200 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018156212325033766, + "loss": 1.3188, + "step": 36220 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018155138294815925, + "loss": 1.3098, + "step": 36240 + }, + { + "epoch": 1.94, + "eval_loss": 1.3927510976791382, + "eval_runtime": 49.9002, + "eval_samples_per_second": 60.12, + "eval_steps_per_second": 1.884, + "step": 36250 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018154064264598086, + "loss": 1.3403, + "step": 36260 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018152990234380245, + "loss": 1.3093, + "step": 36280 + }, + { + "epoch": 1.94, + "learning_rate": 0.00018151916204162404, + "loss": 1.3128, + "step": 36300 + }, + { + "epoch": 1.94, + "eval_loss": 1.391266107559204, + "eval_runtime": 49.963, + "eval_samples_per_second": 60.044, + "eval_steps_per_second": 1.881, + "step": 36300 + } + ], + "max_steps": 374300, + "num_train_epochs": 20, + "total_flos": 4.634609115302619e+19, + "trial_name": null, + "trial_params": null +}