{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 9.95e-05, "loss": 7.5958, "step": 100 }, { "epoch": 0.01, "learning_rate": 9.900000000000001e-05, "loss": 0.3186, "step": 200 }, { "epoch": 0.01, "learning_rate": 9.850000000000001e-05, "loss": 0.2868, "step": 300 }, { "epoch": 0.02, "learning_rate": 9.8e-05, "loss": 0.2434, "step": 400 }, { "epoch": 0.03, "learning_rate": 9.75e-05, "loss": 0.1965, "step": 500 }, { "epoch": 0.03, "learning_rate": 9.7e-05, "loss": 0.1758, "step": 600 }, { "epoch": 0.04, "learning_rate": 9.65e-05, "loss": 0.1722, "step": 700 }, { "epoch": 0.04, "learning_rate": 9.6e-05, "loss": 0.1605, "step": 800 }, { "epoch": 0.04, "learning_rate": 9.55e-05, "loss": 0.1588, "step": 900 }, { "epoch": 0.05, "learning_rate": 9.5e-05, "loss": 0.1493, "step": 1000 }, { "epoch": 0.06, "learning_rate": 9.449999999999999e-05, "loss": 0.1429, "step": 1100 }, { "epoch": 0.06, "learning_rate": 9.4e-05, "loss": 0.1375, "step": 1200 }, { "epoch": 0.07, "learning_rate": 9.350000000000001e-05, "loss": 0.1378, "step": 1300 }, { "epoch": 0.07, "learning_rate": 9.300000000000001e-05, "loss": 0.1372, "step": 1400 }, { "epoch": 0.07, "learning_rate": 9.250000000000001e-05, "loss": 0.1291, "step": 1500 }, { "epoch": 0.08, "learning_rate": 9.200000000000001e-05, "loss": 0.129, "step": 1600 }, { "epoch": 0.09, "learning_rate": 9.15e-05, "loss": 0.1302, "step": 1700 }, { "epoch": 0.09, "learning_rate": 9.1e-05, "loss": 0.1255, "step": 1800 }, { "epoch": 0.1, "learning_rate": 9.05e-05, "loss": 0.1256, "step": 1900 }, { "epoch": 0.1, "learning_rate": 9e-05, "loss": 0.1214, "step": 2000 }, { "epoch": 0.1, "learning_rate": 8.950000000000001e-05, "loss": 0.1199, "step": 2100 }, { "epoch": 0.11, "learning_rate": 8.900000000000001e-05, "loss": 0.1193, "step": 2200 }, { "epoch": 0.12, "learning_rate": 8.850000000000001e-05, "loss": 0.1208, "step": 2300 }, { "epoch": 0.12, "learning_rate": 8.800000000000001e-05, "loss": 0.118, "step": 2400 }, { "epoch": 0.12, "learning_rate": 8.75e-05, "loss": 0.1167, "step": 2500 }, { "epoch": 0.13, "learning_rate": 8.7e-05, "loss": 0.1136, "step": 2600 }, { "epoch": 0.14, "learning_rate": 8.65e-05, "loss": 0.1164, "step": 2700 }, { "epoch": 0.14, "learning_rate": 8.6e-05, "loss": 0.1124, "step": 2800 }, { "epoch": 0.14, "learning_rate": 8.55e-05, "loss": 0.1138, "step": 2900 }, { "epoch": 0.15, "learning_rate": 8.5e-05, "loss": 0.114, "step": 3000 }, { "epoch": 0.15, "learning_rate": 8.450000000000001e-05, "loss": 0.1137, "step": 3100 }, { "epoch": 0.16, "learning_rate": 8.4e-05, "loss": 0.1096, "step": 3200 }, { "epoch": 0.17, "learning_rate": 8.35e-05, "loss": 0.1087, "step": 3300 }, { "epoch": 0.17, "learning_rate": 8.3e-05, "loss": 0.1079, "step": 3400 }, { "epoch": 0.17, "learning_rate": 8.25e-05, "loss": 0.1085, "step": 3500 }, { "epoch": 0.18, "learning_rate": 8.2e-05, "loss": 0.112, "step": 3600 }, { "epoch": 0.18, "learning_rate": 8.15e-05, "loss": 0.1071, "step": 3700 }, { "epoch": 0.19, "learning_rate": 8.1e-05, "loss": 0.1077, "step": 3800 }, { "epoch": 0.2, "learning_rate": 8.05e-05, "loss": 0.1059, "step": 3900 }, { "epoch": 0.2, "learning_rate": 8e-05, "loss": 0.1105, "step": 4000 }, { "epoch": 0.2, "learning_rate": 7.950000000000001e-05, "loss": 0.107, "step": 4100 }, { "epoch": 0.21, "learning_rate": 7.900000000000001e-05, "loss": 0.1064, "step": 4200 }, { "epoch": 0.21, "learning_rate": 7.850000000000001e-05, "loss": 0.1049, "step": 4300 }, { "epoch": 0.22, "learning_rate": 7.800000000000001e-05, "loss": 0.1043, "step": 4400 }, { "epoch": 0.23, "learning_rate": 7.75e-05, "loss": 0.1033, "step": 4500 }, { "epoch": 0.23, "learning_rate": 7.7e-05, "loss": 0.103, "step": 4600 }, { "epoch": 0.23, "learning_rate": 7.65e-05, "loss": 0.1033, "step": 4700 }, { "epoch": 0.24, "learning_rate": 7.6e-05, "loss": 0.1014, "step": 4800 }, { "epoch": 0.24, "learning_rate": 7.55e-05, "loss": 0.1038, "step": 4900 }, { "epoch": 0.25, "learning_rate": 7.500000000000001e-05, "loss": 0.1011, "step": 5000 }, { "epoch": 0.26, "learning_rate": 7.450000000000001e-05, "loss": 0.1011, "step": 5100 }, { "epoch": 0.26, "learning_rate": 7.4e-05, "loss": 0.1006, "step": 5200 }, { "epoch": 0.27, "learning_rate": 7.35e-05, "loss": 0.1024, "step": 5300 }, { "epoch": 0.27, "learning_rate": 7.3e-05, "loss": 0.1003, "step": 5400 }, { "epoch": 0.28, "learning_rate": 7.25e-05, "loss": 0.0997, "step": 5500 }, { "epoch": 0.28, "learning_rate": 7.2e-05, "loss": 0.1019, "step": 5600 }, { "epoch": 0.28, "learning_rate": 7.15e-05, "loss": 0.1025, "step": 5700 }, { "epoch": 0.29, "learning_rate": 7.1e-05, "loss": 0.0994, "step": 5800 }, { "epoch": 0.29, "learning_rate": 7.05e-05, "loss": 0.0982, "step": 5900 }, { "epoch": 0.3, "learning_rate": 7e-05, "loss": 0.0998, "step": 6000 }, { "epoch": 0.3, "learning_rate": 6.95e-05, "loss": 0.0966, "step": 6100 }, { "epoch": 0.31, "learning_rate": 6.9e-05, "loss": 0.096, "step": 6200 }, { "epoch": 0.32, "learning_rate": 6.850000000000001e-05, "loss": 0.0972, "step": 6300 }, { "epoch": 0.32, "learning_rate": 6.800000000000001e-05, "loss": 0.0973, "step": 6400 }, { "epoch": 0.33, "learning_rate": 6.750000000000001e-05, "loss": 0.0996, "step": 6500 }, { "epoch": 0.33, "learning_rate": 6.7e-05, "loss": 0.0961, "step": 6600 }, { "epoch": 0.34, "learning_rate": 6.65e-05, "loss": 0.0964, "step": 6700 }, { "epoch": 0.34, "learning_rate": 6.6e-05, "loss": 0.0966, "step": 6800 }, { "epoch": 0.34, "learning_rate": 6.55e-05, "loss": 0.0959, "step": 6900 }, { "epoch": 0.35, "learning_rate": 6.500000000000001e-05, "loss": 0.0975, "step": 7000 }, { "epoch": 0.35, "learning_rate": 6.450000000000001e-05, "loss": 0.0983, "step": 7100 }, { "epoch": 0.36, "learning_rate": 6.400000000000001e-05, "loss": 0.0989, "step": 7200 }, { "epoch": 0.36, "learning_rate": 6.35e-05, "loss": 0.095, "step": 7300 }, { "epoch": 0.37, "learning_rate": 6.3e-05, "loss": 0.0943, "step": 7400 }, { "epoch": 0.38, "learning_rate": 6.25e-05, "loss": 0.097, "step": 7500 }, { "epoch": 0.38, "learning_rate": 6.2e-05, "loss": 0.094, "step": 7600 }, { "epoch": 0.39, "learning_rate": 6.15e-05, "loss": 0.0942, "step": 7700 }, { "epoch": 0.39, "learning_rate": 6.1e-05, "loss": 0.0966, "step": 7800 }, { "epoch": 0.4, "learning_rate": 6.05e-05, "loss": 0.0934, "step": 7900 }, { "epoch": 0.4, "learning_rate": 6e-05, "loss": 0.0941, "step": 8000 }, { "epoch": 0.41, "learning_rate": 5.95e-05, "loss": 0.0947, "step": 8100 }, { "epoch": 0.41, "learning_rate": 5.9e-05, "loss": 0.0929, "step": 8200 }, { "epoch": 0.41, "learning_rate": 5.85e-05, "loss": 0.0911, "step": 8300 }, { "epoch": 0.42, "learning_rate": 5.8e-05, "loss": 0.0962, "step": 8400 }, { "epoch": 0.42, "learning_rate": 5.7499999999999995e-05, "loss": 0.0953, "step": 8500 }, { "epoch": 0.43, "learning_rate": 5.6999999999999996e-05, "loss": 0.0926, "step": 8600 }, { "epoch": 0.43, "learning_rate": 5.65e-05, "loss": 0.0957, "step": 8700 }, { "epoch": 0.44, "learning_rate": 5.6000000000000006e-05, "loss": 0.0935, "step": 8800 }, { "epoch": 0.45, "learning_rate": 5.550000000000001e-05, "loss": 0.0931, "step": 8900 }, { "epoch": 0.45, "learning_rate": 5.500000000000001e-05, "loss": 0.0937, "step": 9000 }, { "epoch": 0.46, "learning_rate": 5.45e-05, "loss": 0.0937, "step": 9100 }, { "epoch": 0.46, "learning_rate": 5.4000000000000005e-05, "loss": 0.0929, "step": 9200 }, { "epoch": 0.47, "learning_rate": 5.3500000000000006e-05, "loss": 0.0913, "step": 9300 }, { "epoch": 0.47, "learning_rate": 5.300000000000001e-05, "loss": 0.0929, "step": 9400 }, { "epoch": 0.47, "learning_rate": 5.25e-05, "loss": 0.0896, "step": 9500 }, { "epoch": 0.48, "learning_rate": 5.2000000000000004e-05, "loss": 0.0923, "step": 9600 }, { "epoch": 0.48, "learning_rate": 5.1500000000000005e-05, "loss": 0.0909, "step": 9700 }, { "epoch": 0.49, "learning_rate": 5.1000000000000006e-05, "loss": 0.0925, "step": 9800 }, { "epoch": 0.49, "learning_rate": 5.05e-05, "loss": 0.0934, "step": 9900 }, { "epoch": 0.5, "learning_rate": 5e-05, "loss": 0.0915, "step": 10000 }, { "epoch": 0.51, "learning_rate": 4.9500000000000004e-05, "loss": 0.0891, "step": 10100 }, { "epoch": 0.51, "learning_rate": 4.9e-05, "loss": 0.0898, "step": 10200 }, { "epoch": 0.52, "learning_rate": 4.85e-05, "loss": 0.0931, "step": 10300 }, { "epoch": 0.52, "learning_rate": 4.8e-05, "loss": 0.0905, "step": 10400 }, { "epoch": 0.53, "learning_rate": 4.75e-05, "loss": 0.0886, "step": 10500 }, { "epoch": 0.53, "learning_rate": 4.7e-05, "loss": 0.0872, "step": 10600 }, { "epoch": 0.54, "learning_rate": 4.6500000000000005e-05, "loss": 0.09, "step": 10700 }, { "epoch": 0.54, "learning_rate": 4.600000000000001e-05, "loss": 0.0894, "step": 10800 }, { "epoch": 0.55, "learning_rate": 4.55e-05, "loss": 0.0878, "step": 10900 }, { "epoch": 0.55, "learning_rate": 4.5e-05, "loss": 0.0903, "step": 11000 }, { "epoch": 0.56, "learning_rate": 4.4500000000000004e-05, "loss": 0.0904, "step": 11100 }, { "epoch": 0.56, "learning_rate": 4.4000000000000006e-05, "loss": 0.0905, "step": 11200 }, { "epoch": 0.56, "learning_rate": 4.35e-05, "loss": 0.0877, "step": 11300 }, { "epoch": 0.57, "learning_rate": 4.3e-05, "loss": 0.0888, "step": 11400 }, { "epoch": 0.57, "learning_rate": 4.25e-05, "loss": 0.0904, "step": 11500 }, { "epoch": 0.58, "learning_rate": 4.2e-05, "loss": 0.0903, "step": 11600 }, { "epoch": 0.58, "learning_rate": 4.15e-05, "loss": 0.0865, "step": 11700 }, { "epoch": 0.59, "learning_rate": 4.1e-05, "loss": 0.0895, "step": 11800 }, { "epoch": 0.59, "learning_rate": 4.05e-05, "loss": 0.0888, "step": 11900 }, { "epoch": 0.6, "learning_rate": 4e-05, "loss": 0.0884, "step": 12000 }, { "epoch": 0.6, "learning_rate": 3.9500000000000005e-05, "loss": 0.0897, "step": 12100 }, { "epoch": 0.61, "learning_rate": 3.9000000000000006e-05, "loss": 0.0891, "step": 12200 }, { "epoch": 0.61, "learning_rate": 3.85e-05, "loss": 0.0904, "step": 12300 }, { "epoch": 0.62, "learning_rate": 3.8e-05, "loss": 0.0864, "step": 12400 }, { "epoch": 0.62, "learning_rate": 3.7500000000000003e-05, "loss": 0.0871, "step": 12500 }, { "epoch": 0.63, "learning_rate": 3.7e-05, "loss": 0.0859, "step": 12600 }, { "epoch": 0.64, "learning_rate": 3.65e-05, "loss": 0.0878, "step": 12700 }, { "epoch": 0.64, "learning_rate": 3.6e-05, "loss": 0.0862, "step": 12800 }, { "epoch": 0.65, "learning_rate": 3.55e-05, "loss": 0.085, "step": 12900 }, { "epoch": 0.65, "learning_rate": 3.5e-05, "loss": 0.0885, "step": 13000 }, { "epoch": 0.66, "learning_rate": 3.45e-05, "loss": 0.0889, "step": 13100 }, { "epoch": 0.66, "learning_rate": 3.4000000000000007e-05, "loss": 0.0876, "step": 13200 }, { "epoch": 0.67, "learning_rate": 3.35e-05, "loss": 0.0882, "step": 13300 }, { "epoch": 0.67, "learning_rate": 3.3e-05, "loss": 0.0884, "step": 13400 }, { "epoch": 0.68, "learning_rate": 3.2500000000000004e-05, "loss": 0.0848, "step": 13500 }, { "epoch": 0.68, "learning_rate": 3.2000000000000005e-05, "loss": 0.0859, "step": 13600 }, { "epoch": 0.69, "learning_rate": 3.15e-05, "loss": 0.0895, "step": 13700 }, { "epoch": 0.69, "learning_rate": 3.1e-05, "loss": 0.0898, "step": 13800 }, { "epoch": 0.69, "learning_rate": 3.05e-05, "loss": 0.0849, "step": 13900 }, { "epoch": 0.7, "learning_rate": 3e-05, "loss": 0.0866, "step": 14000 }, { "epoch": 0.7, "learning_rate": 2.95e-05, "loss": 0.0839, "step": 14100 }, { "epoch": 0.71, "learning_rate": 2.9e-05, "loss": 0.0855, "step": 14200 }, { "epoch": 0.71, "learning_rate": 2.8499999999999998e-05, "loss": 0.0848, "step": 14300 }, { "epoch": 0.72, "learning_rate": 2.8000000000000003e-05, "loss": 0.0889, "step": 14400 }, { "epoch": 0.72, "learning_rate": 2.7500000000000004e-05, "loss": 0.0861, "step": 14500 }, { "epoch": 0.73, "learning_rate": 2.7000000000000002e-05, "loss": 0.0838, "step": 14600 }, { "epoch": 0.73, "learning_rate": 2.6500000000000004e-05, "loss": 0.0878, "step": 14700 }, { "epoch": 0.74, "learning_rate": 2.6000000000000002e-05, "loss": 0.0866, "step": 14800 }, { "epoch": 0.74, "learning_rate": 2.5500000000000003e-05, "loss": 0.086, "step": 14900 }, { "epoch": 0.75, "learning_rate": 2.5e-05, "loss": 0.0841, "step": 15000 }, { "epoch": 0.76, "learning_rate": 2.45e-05, "loss": 0.0864, "step": 15100 }, { "epoch": 0.76, "learning_rate": 2.4e-05, "loss": 0.0855, "step": 15200 }, { "epoch": 0.77, "learning_rate": 2.35e-05, "loss": 0.0861, "step": 15300 }, { "epoch": 0.77, "learning_rate": 2.3000000000000003e-05, "loss": 0.0856, "step": 15400 }, { "epoch": 0.78, "learning_rate": 2.25e-05, "loss": 0.0872, "step": 15500 }, { "epoch": 0.78, "learning_rate": 2.2000000000000003e-05, "loss": 0.0838, "step": 15600 }, { "epoch": 0.79, "learning_rate": 2.15e-05, "loss": 0.0851, "step": 15700 }, { "epoch": 0.79, "learning_rate": 2.1e-05, "loss": 0.0865, "step": 15800 }, { "epoch": 0.8, "learning_rate": 2.05e-05, "loss": 0.0866, "step": 15900 }, { "epoch": 0.8, "learning_rate": 2e-05, "loss": 0.0868, "step": 16000 }, { "epoch": 0.81, "learning_rate": 1.9500000000000003e-05, "loss": 0.087, "step": 16100 }, { "epoch": 0.81, "learning_rate": 1.9e-05, "loss": 0.0844, "step": 16200 }, { "epoch": 0.81, "learning_rate": 1.85e-05, "loss": 0.0835, "step": 16300 }, { "epoch": 0.82, "learning_rate": 1.8e-05, "loss": 0.0863, "step": 16400 }, { "epoch": 0.82, "learning_rate": 1.75e-05, "loss": 0.0854, "step": 16500 }, { "epoch": 0.83, "learning_rate": 1.7000000000000003e-05, "loss": 0.0857, "step": 16600 }, { "epoch": 0.83, "learning_rate": 1.65e-05, "loss": 0.0844, "step": 16700 }, { "epoch": 0.84, "learning_rate": 1.6000000000000003e-05, "loss": 0.0864, "step": 16800 }, { "epoch": 0.84, "learning_rate": 1.55e-05, "loss": 0.0867, "step": 16900 }, { "epoch": 0.85, "learning_rate": 1.5e-05, "loss": 0.0849, "step": 17000 }, { "epoch": 0.85, "learning_rate": 1.45e-05, "loss": 0.0815, "step": 17100 }, { "epoch": 0.86, "learning_rate": 1.4000000000000001e-05, "loss": 0.0823, "step": 17200 }, { "epoch": 0.86, "learning_rate": 1.3500000000000001e-05, "loss": 0.0872, "step": 17300 }, { "epoch": 0.87, "learning_rate": 1.3000000000000001e-05, "loss": 0.084, "step": 17400 }, { "epoch": 0.88, "learning_rate": 1.25e-05, "loss": 0.0812, "step": 17500 }, { "epoch": 0.88, "learning_rate": 1.2e-05, "loss": 0.0842, "step": 17600 }, { "epoch": 0.89, "learning_rate": 1.1500000000000002e-05, "loss": 0.087, "step": 17700 }, { "epoch": 0.89, "learning_rate": 1.1000000000000001e-05, "loss": 0.0843, "step": 17800 }, { "epoch": 0.9, "learning_rate": 1.05e-05, "loss": 0.0833, "step": 17900 }, { "epoch": 0.9, "learning_rate": 1e-05, "loss": 0.0836, "step": 18000 }, { "epoch": 0.91, "learning_rate": 9.5e-06, "loss": 0.0864, "step": 18100 }, { "epoch": 0.91, "learning_rate": 9e-06, "loss": 0.0847, "step": 18200 }, { "epoch": 0.92, "learning_rate": 8.500000000000002e-06, "loss": 0.084, "step": 18300 }, { "epoch": 0.92, "learning_rate": 8.000000000000001e-06, "loss": 0.085, "step": 18400 }, { "epoch": 0.93, "learning_rate": 7.5e-06, "loss": 0.0847, "step": 18500 }, { "epoch": 0.93, "learning_rate": 7.000000000000001e-06, "loss": 0.0847, "step": 18600 }, { "epoch": 0.94, "learning_rate": 6.5000000000000004e-06, "loss": 0.085, "step": 18700 }, { "epoch": 0.94, "learning_rate": 6e-06, "loss": 0.0836, "step": 18800 }, { "epoch": 0.94, "learning_rate": 5.500000000000001e-06, "loss": 0.0838, "step": 18900 }, { "epoch": 0.95, "learning_rate": 5e-06, "loss": 0.0822, "step": 19000 }, { "epoch": 0.95, "learning_rate": 4.5e-06, "loss": 0.0821, "step": 19100 }, { "epoch": 0.96, "learning_rate": 4.000000000000001e-06, "loss": 0.0852, "step": 19200 }, { "epoch": 0.96, "learning_rate": 3.5000000000000004e-06, "loss": 0.0829, "step": 19300 }, { "epoch": 0.97, "learning_rate": 3e-06, "loss": 0.085, "step": 19400 }, { "epoch": 0.97, "learning_rate": 2.5e-06, "loss": 0.0839, "step": 19500 }, { "epoch": 0.98, "learning_rate": 2.0000000000000003e-06, "loss": 0.0829, "step": 19600 }, { "epoch": 0.98, "learning_rate": 1.5e-06, "loss": 0.0849, "step": 19700 }, { "epoch": 0.99, "learning_rate": 1.0000000000000002e-06, "loss": 0.0842, "step": 19800 }, { "epoch": 0.99, "learning_rate": 5.000000000000001e-07, "loss": 0.0837, "step": 19900 }, { "epoch": 1.0, "learning_rate": 0.0, "loss": 0.0797, "step": 20000 }, { "epoch": 1.0, "step": 20000, "total_flos": 0, "train_runtime": 84238.4398, "train_samples_per_second": 0.237 } ], "max_steps": 20000, "num_train_epochs": 1, "total_flos": 0, "trial_name": null, "trial_params": null }