{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0009760716430288746, "global_step": 3100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 0.0002, "loss": 4.3689, "step": 10 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 3.0166, "step": 20 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.6587, "step": 30 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.7002, "step": 40 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.8247, "step": 50 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.7709, "step": 60 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.5964, "step": 70 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.5285, "step": 80 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.6166, "step": 90 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.7509, "step": 100 }, { "epoch": 0.0, "eval_loss": 2.828864097595215, "eval_runtime": 25015.5353, "eval_samples_per_second": 4.048, "eval_steps_per_second": 2.024, "step": 100 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.6819, "step": 110 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.5314, "step": 120 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4678, "step": 130 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.5079, "step": 140 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.6501, "step": 150 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.6981, "step": 160 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.5262, "step": 170 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4819, "step": 180 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4657, "step": 190 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.6072, "step": 200 }, { "epoch": 0.0, "eval_loss": 2.7019286155700684, "eval_runtime": 25020.9542, "eval_samples_per_second": 4.048, "eval_steps_per_second": 2.024, "step": 200 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.5656, "step": 210 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.5185, "step": 220 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4271, "step": 230 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4286, "step": 240 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.5728, "step": 250 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.5597, "step": 260 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4337, "step": 270 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4491, "step": 280 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4494, "step": 290 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.5075, "step": 300 }, { "epoch": 0.0, "eval_loss": 2.6475937366485596, "eval_runtime": 25026.694, "eval_samples_per_second": 4.047, "eval_steps_per_second": 2.023, "step": 300 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.5654, "step": 310 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.445, "step": 320 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4604, "step": 330 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4279, "step": 340 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.545, "step": 350 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.579, "step": 360 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4052, "step": 370 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3807, "step": 380 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4444, "step": 390 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4991, "step": 400 }, { "epoch": 0.0, "eval_loss": 2.606245279312134, "eval_runtime": 25030.0633, "eval_samples_per_second": 4.046, "eval_steps_per_second": 2.023, "step": 400 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.6451, "step": 410 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4054, "step": 420 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4127, "step": 430 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3986, "step": 440 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4556, "step": 450 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.531, "step": 460 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4173, "step": 470 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4098, "step": 480 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3917, "step": 490 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.5342, "step": 500 }, { "epoch": 0.0, "eval_loss": 2.553729772567749, "eval_runtime": 25028.6332, "eval_samples_per_second": 4.046, "eval_steps_per_second": 2.023, "step": 500 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.5335, "step": 510 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4117, "step": 520 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3891, "step": 530 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3963, "step": 540 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4859, "step": 550 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.5574, "step": 560 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4218, "step": 570 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3816, "step": 580 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3866, "step": 590 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4992, "step": 600 }, { "epoch": 0.0, "eval_loss": 2.5366227626800537, "eval_runtime": 25013.3524, "eval_samples_per_second": 4.049, "eval_steps_per_second": 2.024, "step": 600 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.5315, "step": 610 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4261, "step": 620 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3737, "step": 630 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.385, "step": 640 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.405, "step": 650 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.5182, "step": 660 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3498, "step": 670 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3539, "step": 680 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3276, "step": 690 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4602, "step": 700 }, { "epoch": 0.0, "eval_loss": 2.5101232528686523, "eval_runtime": 25025.24, "eval_samples_per_second": 4.047, "eval_steps_per_second": 2.023, "step": 700 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.5107, "step": 710 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3919, "step": 720 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3818, "step": 730 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3866, "step": 740 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4079, "step": 750 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4908, "step": 760 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3965, "step": 770 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.323, "step": 780 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3136, "step": 790 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3786, "step": 800 }, { "epoch": 0.0, "eval_loss": 2.50821590423584, "eval_runtime": 25013.6917, "eval_samples_per_second": 4.049, "eval_steps_per_second": 2.024, "step": 800 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.5106, "step": 810 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3971, "step": 820 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3305, "step": 830 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3711, "step": 840 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4803, "step": 850 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4738, "step": 860 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3712, "step": 870 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3463, "step": 880 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2843, "step": 890 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3725, "step": 900 }, { "epoch": 0.0, "eval_loss": 2.494638204574585, "eval_runtime": 25014.3413, "eval_samples_per_second": 4.049, "eval_steps_per_second": 2.024, "step": 900 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.5215, "step": 910 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3802, "step": 920 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3499, "step": 930 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3199, "step": 940 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3491, "step": 950 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4885, "step": 960 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3633, "step": 970 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3226, "step": 980 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2587, "step": 990 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3803, "step": 1000 }, { "epoch": 0.0, "eval_loss": 2.458460807800293, "eval_runtime": 25021.0121, "eval_samples_per_second": 4.048, "eval_steps_per_second": 2.024, "step": 1000 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3921, "step": 1010 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3305, "step": 1020 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2907, "step": 1030 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.319, "step": 1040 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4507, "step": 1050 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.467, "step": 1060 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3639, "step": 1070 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3189, "step": 1080 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2974, "step": 1090 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3331, "step": 1100 }, { "epoch": 0.0, "eval_loss": 2.454998254776001, "eval_runtime": 25014.8749, "eval_samples_per_second": 4.049, "eval_steps_per_second": 2.024, "step": 1100 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4715, "step": 1110 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3549, "step": 1120 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3196, "step": 1130 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2633, "step": 1140 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3661, "step": 1150 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4783, "step": 1160 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3531, "step": 1170 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2943, "step": 1180 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2409, "step": 1190 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3605, "step": 1200 }, { "epoch": 0.0, "eval_loss": 2.448063611984253, "eval_runtime": 25013.9619, "eval_samples_per_second": 4.049, "eval_steps_per_second": 2.024, "step": 1200 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4612, "step": 1210 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3746, "step": 1220 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.304, "step": 1230 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2925, "step": 1240 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2926, "step": 1250 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4145, "step": 1260 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3593, "step": 1270 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2417, "step": 1280 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.301, "step": 1290 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3336, "step": 1300 }, { "epoch": 0.0, "eval_loss": 2.424609661102295, "eval_runtime": 25025.8125, "eval_samples_per_second": 4.047, "eval_steps_per_second": 2.023, "step": 1300 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4126, "step": 1310 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.308, "step": 1320 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3052, "step": 1330 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2798, "step": 1340 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3208, "step": 1350 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3309, "step": 1360 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.331, "step": 1370 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3068, "step": 1380 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.1971, "step": 1390 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3114, "step": 1400 }, { "epoch": 0.0, "eval_loss": 2.4197137355804443, "eval_runtime": 25013.8484, "eval_samples_per_second": 4.049, "eval_steps_per_second": 2.024, "step": 1400 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4611, "step": 1410 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2463, "step": 1420 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2844, "step": 1430 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2544, "step": 1440 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3002, "step": 1450 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4263, "step": 1460 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.328, "step": 1470 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2609, "step": 1480 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2854, "step": 1490 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3257, "step": 1500 }, { "epoch": 0.0, "eval_loss": 2.407323122024536, "eval_runtime": 25024.2416, "eval_samples_per_second": 4.047, "eval_steps_per_second": 2.024, "step": 1500 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4174, "step": 1510 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2817, "step": 1520 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2796, "step": 1530 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2834, "step": 1540 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2803, "step": 1550 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.39, "step": 1560 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2672, "step": 1570 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3054, "step": 1580 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2576, "step": 1590 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3099, "step": 1600 }, { "epoch": 0.0, "eval_loss": 2.401474952697754, "eval_runtime": 25023.95, "eval_samples_per_second": 4.047, "eval_steps_per_second": 2.024, "step": 1600 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4225, "step": 1610 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3135, "step": 1620 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2705, "step": 1630 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2075, "step": 1640 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3811, "step": 1650 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4239, "step": 1660 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3319, "step": 1670 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3106, "step": 1680 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2231, "step": 1690 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.311, "step": 1700 }, { "epoch": 0.0, "eval_loss": 2.3863916397094727, "eval_runtime": 25021.8392, "eval_samples_per_second": 4.047, "eval_steps_per_second": 2.024, "step": 1700 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4332, "step": 1710 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3183, "step": 1720 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2177, "step": 1730 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3045, "step": 1740 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2627, "step": 1750 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4087, "step": 1760 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3026, "step": 1770 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3316, "step": 1780 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.256, "step": 1790 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3276, "step": 1800 }, { "epoch": 0.0, "eval_loss": 2.3831562995910645, "eval_runtime": 25030.8877, "eval_samples_per_second": 4.046, "eval_steps_per_second": 2.023, "step": 1800 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3993, "step": 1810 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3138, "step": 1820 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2652, "step": 1830 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2848, "step": 1840 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3009, "step": 1850 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3942, "step": 1860 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2951, "step": 1870 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2457, "step": 1880 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2514, "step": 1890 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2656, "step": 1900 }, { "epoch": 0.0, "eval_loss": 2.3777658939361572, "eval_runtime": 25024.5627, "eval_samples_per_second": 4.047, "eval_steps_per_second": 2.023, "step": 1900 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3702, "step": 1910 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2896, "step": 1920 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2635, "step": 1930 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2506, "step": 1940 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2108, "step": 1950 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3469, "step": 1960 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2863, "step": 1970 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3095, "step": 1980 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2594, "step": 1990 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3124, "step": 2000 }, { "epoch": 0.0, "eval_loss": 2.3710832595825195, "eval_runtime": 25009.9377, "eval_samples_per_second": 4.049, "eval_steps_per_second": 2.025, "step": 2000 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.42, "step": 2010 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2725, "step": 2020 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.243, "step": 2030 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.1696, "step": 2040 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2764, "step": 2050 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.4149, "step": 2060 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3201, "step": 2070 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2489, "step": 2080 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.1856, "step": 2090 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.295, "step": 2100 }, { "epoch": 0.0, "eval_loss": 2.3622617721557617, "eval_runtime": 25023.6158, "eval_samples_per_second": 4.047, "eval_steps_per_second": 2.024, "step": 2100 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3851, "step": 2110 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2984, "step": 2120 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.189, "step": 2130 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2425, "step": 2140 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.312, "step": 2150 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3269, "step": 2160 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3006, "step": 2170 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2974, "step": 2180 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.1995, "step": 2190 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3021, "step": 2200 }, { "epoch": 0.0, "eval_loss": 2.356457471847534, "eval_runtime": 25014.0777, "eval_samples_per_second": 4.049, "eval_steps_per_second": 2.024, "step": 2200 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3836, "step": 2210 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2795, "step": 2220 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.196, "step": 2230 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2284, "step": 2240 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2464, "step": 2250 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.406, "step": 2260 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2814, "step": 2270 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2595, "step": 2280 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2578, "step": 2290 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2471, "step": 2300 }, { "epoch": 0.0, "eval_loss": 2.334439516067505, "eval_runtime": 25021.2061, "eval_samples_per_second": 4.048, "eval_steps_per_second": 2.024, "step": 2300 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3846, "step": 2310 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2797, "step": 2320 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2136, "step": 2330 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2303, "step": 2340 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3312, "step": 2350 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3484, "step": 2360 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3012, "step": 2370 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2255, "step": 2380 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.1879, "step": 2390 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2432, "step": 2400 }, { "epoch": 0.0, "eval_loss": 2.3361294269561768, "eval_runtime": 25017.0273, "eval_samples_per_second": 4.048, "eval_steps_per_second": 2.024, "step": 2400 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3223, "step": 2410 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2756, "step": 2420 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2312, "step": 2430 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2296, "step": 2440 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2083, "step": 2450 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3684, "step": 2460 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2336, "step": 2470 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2262, "step": 2480 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.1644, "step": 2490 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3096, "step": 2500 }, { "epoch": 0.0, "eval_loss": 2.336559772491455, "eval_runtime": 25015.5454, "eval_samples_per_second": 4.048, "eval_steps_per_second": 2.024, "step": 2500 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3185, "step": 2510 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2768, "step": 2520 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2151, "step": 2530 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2017, "step": 2540 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2616, "step": 2550 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3333, "step": 2560 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2404, "step": 2570 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.25, "step": 2580 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.22, "step": 2590 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2884, "step": 2600 }, { "epoch": 0.0, "eval_loss": 2.332972526550293, "eval_runtime": 25018.5289, "eval_samples_per_second": 4.048, "eval_steps_per_second": 2.024, "step": 2600 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3739, "step": 2610 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2611, "step": 2620 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2323, "step": 2630 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.1285, "step": 2640 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2504, "step": 2650 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3694, "step": 2660 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2317, "step": 2670 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2714, "step": 2680 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.1926, "step": 2690 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2617, "step": 2700 }, { "epoch": 0.0, "eval_loss": 2.325580596923828, "eval_runtime": 25014.9463, "eval_samples_per_second": 4.049, "eval_steps_per_second": 2.024, "step": 2700 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3665, "step": 2710 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2604, "step": 2720 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2129, "step": 2730 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2166, "step": 2740 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2418, "step": 2750 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3793, "step": 2760 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2684, "step": 2770 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.1758, "step": 2780 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2116, "step": 2790 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2753, "step": 2800 }, { "epoch": 0.0, "eval_loss": 2.3111133575439453, "eval_runtime": 25017.4138, "eval_samples_per_second": 4.048, "eval_steps_per_second": 2.024, "step": 2800 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3754, "step": 2810 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2508, "step": 2820 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.1793, "step": 2830 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.1444, "step": 2840 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2093, "step": 2850 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2917, "step": 2860 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2487, "step": 2870 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.1931, "step": 2880 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.1862, "step": 2890 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3007, "step": 2900 }, { "epoch": 0.0, "eval_loss": 2.3113691806793213, "eval_runtime": 25009.312, "eval_samples_per_second": 4.049, "eval_steps_per_second": 2.025, "step": 2900 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3637, "step": 2910 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2385, "step": 2920 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2092, "step": 2930 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.1755, "step": 2940 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2523, "step": 2950 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3127, "step": 2960 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2634, "step": 2970 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2487, "step": 2980 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.1269, "step": 2990 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.1926, "step": 3000 }, { "epoch": 0.0, "eval_loss": 2.2998297214508057, "eval_runtime": 25014.9441, "eval_samples_per_second": 4.049, "eval_steps_per_second": 2.024, "step": 3000 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3595, "step": 3010 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2325, "step": 3020 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.1953, "step": 3030 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.184, "step": 3040 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.1961, "step": 3050 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.3776, "step": 3060 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2627, "step": 3070 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2189, "step": 3080 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.1835, "step": 3090 }, { "epoch": 0.0, "learning_rate": 0.0002, "loss": 2.2283, "step": 3100 }, { "epoch": 0.0, "eval_loss": 2.2964084148406982, "eval_runtime": 25012.2645, "eval_samples_per_second": 4.049, "eval_steps_per_second": 2.024, "step": 3100 } ], "max_steps": 10000, "num_train_epochs": 1, "total_flos": 5.849399834018734e+18, "trial_name": null, "trial_params": null }