{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9618467457518436, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 2.6595744680851064e-05, "loss": 2.0914, "step": 25 }, { "epoch": 0.02, "learning_rate": 5.319148936170213e-05, "loss": 1.8503, "step": 50 }, { "epoch": 0.02, "learning_rate": 7.978723404255319e-05, "loss": 1.3555, "step": 75 }, { "epoch": 0.03, "learning_rate": 0.00010638297872340425, "loss": 1.0528, "step": 100 }, { "epoch": 0.03, "eval_loss": 1.3641345500946045, "eval_runtime": 76.7384, "eval_samples_per_second": 7.089, "eval_steps_per_second": 0.886, "step": 100 }, { "epoch": 0.04, "learning_rate": 0.00013297872340425532, "loss": 0.9497, "step": 125 }, { "epoch": 0.05, "learning_rate": 0.00015957446808510637, "loss": 1.0081, "step": 150 }, { "epoch": 0.06, "learning_rate": 0.00018617021276595746, "loss": 0.9365, "step": 175 }, { "epoch": 0.06, "learning_rate": 0.00019999805857737166, "loss": 0.9364, "step": 200 }, { "epoch": 0.06, "eval_loss": 1.2751965522766113, "eval_runtime": 76.7874, "eval_samples_per_second": 7.084, "eval_steps_per_second": 0.886, "step": 200 }, { "epoch": 0.07, "learning_rate": 0.0001999815435109719, "loss": 0.9226, "step": 225 }, { "epoch": 0.08, "learning_rate": 0.00019994817911010517, "loss": 0.8396, "step": 250 }, { "epoch": 0.09, "learning_rate": 0.0001998979709974887, "loss": 0.8687, "step": 275 }, { "epoch": 0.1, "learning_rate": 0.00019983092763441697, "loss": 0.8593, "step": 300 }, { "epoch": 0.1, "eval_loss": 1.2590458393096924, "eval_runtime": 76.8321, "eval_samples_per_second": 7.08, "eval_steps_per_second": 0.885, "step": 300 }, { "epoch": 0.1, "learning_rate": 0.0001997470603193358, "loss": 0.8064, "step": 325 }, { "epoch": 0.11, "learning_rate": 0.0001996463831859381, "loss": 0.7851, "step": 350 }, { "epoch": 0.12, "learning_rate": 0.00019952891320078236, "loss": 0.8516, "step": 375 }, { "epoch": 0.13, "learning_rate": 0.0001993946701604329, "loss": 0.8161, "step": 400 }, { "epoch": 0.13, "eval_loss": 1.1581201553344727, "eval_runtime": 76.7611, "eval_samples_per_second": 7.087, "eval_steps_per_second": 0.886, "step": 400 }, { "epoch": 0.14, "learning_rate": 0.00019924367668812417, "loss": 0.838, "step": 425 }, { "epoch": 0.14, "learning_rate": 0.00019907595822994773, "loss": 0.8271, "step": 450 }, { "epoch": 0.15, "learning_rate": 0.00019889154305056426, "loss": 0.8005, "step": 475 }, { "epoch": 0.16, "learning_rate": 0.00019869046222844008, "loss": 0.7916, "step": 500 }, { "epoch": 0.16, "eval_loss": 1.1132324934005737, "eval_runtime": 76.7669, "eval_samples_per_second": 7.086, "eval_steps_per_second": 0.886, "step": 500 }, { "epoch": 0.17, "learning_rate": 0.00019847274965060984, "loss": 0.7991, "step": 525 }, { "epoch": 0.18, "learning_rate": 0.00019823844200696564, "loss": 0.8387, "step": 550 }, { "epoch": 0.18, "learning_rate": 0.00019798757878407386, "loss": 0.8196, "step": 575 }, { "epoch": 0.19, "learning_rate": 0.00019772020225852077, "loss": 0.7508, "step": 600 }, { "epoch": 0.19, "eval_loss": 1.0564008951187134, "eval_runtime": 76.7274, "eval_samples_per_second": 7.09, "eval_steps_per_second": 0.886, "step": 600 }, { "epoch": 0.2, "learning_rate": 0.0001974363574897878, "loss": 0.8356, "step": 625 }, { "epoch": 0.21, "learning_rate": 0.00019713609231265805, "loss": 0.7036, "step": 650 }, { "epoch": 0.22, "learning_rate": 0.0001968194573291549, "loss": 0.8082, "step": 675 }, { "epoch": 0.22, "learning_rate": 0.00019648650590001418, "loss": 0.7974, "step": 700 }, { "epoch": 0.22, "eval_loss": 1.1469172239303589, "eval_runtime": 76.886, "eval_samples_per_second": 7.075, "eval_steps_per_second": 0.884, "step": 700 }, { "epoch": 0.23, "learning_rate": 0.00019613729413569194, "loss": 0.7804, "step": 725 }, { "epoch": 0.24, "learning_rate": 0.00019577188088690803, "loss": 0.7398, "step": 750 }, { "epoch": 0.25, "learning_rate": 0.00019539032773472866, "loss": 0.7178, "step": 775 }, { "epoch": 0.26, "learning_rate": 0.00019499269898018832, "loss": 0.7303, "step": 800 }, { "epoch": 0.26, "eval_loss": 1.1135411262512207, "eval_runtime": 76.7509, "eval_samples_per_second": 7.088, "eval_steps_per_second": 0.886, "step": 800 }, { "epoch": 0.26, "learning_rate": 0.00019457906163345353, "loss": 0.7133, "step": 825 }, { "epoch": 0.27, "learning_rate": 0.00019414948540253, "loss": 0.744, "step": 850 }, { "epoch": 0.28, "learning_rate": 0.000193704042681515, "loss": 0.7195, "step": 875 }, { "epoch": 0.29, "learning_rate": 0.00019324280853839744, "loss": 0.6909, "step": 900 }, { "epoch": 0.29, "eval_loss": 1.1239628791809082, "eval_runtime": 76.7964, "eval_samples_per_second": 7.084, "eval_steps_per_second": 0.885, "step": 900 }, { "epoch": 0.3, "learning_rate": 0.00019276586070240682, "loss": 0.7061, "step": 925 }, { "epoch": 0.3, "learning_rate": 0.00019227327955091412, "loss": 0.694, "step": 950 }, { "epoch": 0.31, "learning_rate": 0.00019176514809588616, "loss": 0.707, "step": 975 }, { "epoch": 0.32, "learning_rate": 0.0001912415519698961, "loss": 0.7267, "step": 1000 }, { "epoch": 0.32, "eval_loss": 1.0490331649780273, "eval_runtime": 76.7671, "eval_samples_per_second": 7.086, "eval_steps_per_second": 0.886, "step": 1000 }, { "epoch": 0.33, "learning_rate": 0.00019070257941169224, "loss": 0.694, "step": 1025 }, { "epoch": 0.34, "learning_rate": 0.00019014832125132763, "loss": 0.638, "step": 1050 }, { "epoch": 0.34, "learning_rate": 0.000189578870894853, "loss": 0.6951, "step": 1075 }, { "epoch": 0.35, "learning_rate": 0.00018899432430857552, "loss": 0.6649, "step": 1100 }, { "epoch": 0.35, "eval_loss": 1.127469539642334, "eval_runtime": 76.7816, "eval_samples_per_second": 7.085, "eval_steps_per_second": 0.886, "step": 1100 }, { "epoch": 0.36, "learning_rate": 0.00018839478000288627, "loss": 0.7284, "step": 1125 }, { "epoch": 0.37, "learning_rate": 0.00018778033901565856, "loss": 0.6459, "step": 1150 }, { "epoch": 0.38, "learning_rate": 0.0001871511048952208, "loss": 0.6616, "step": 1175 }, { "epoch": 0.38, "learning_rate": 0.0001865071836829061, "loss": 0.6306, "step": 1200 }, { "epoch": 0.38, "eval_loss": 1.0801633596420288, "eval_runtime": 76.7811, "eval_samples_per_second": 7.085, "eval_steps_per_second": 0.886, "step": 1200 }, { "epoch": 0.39, "learning_rate": 0.00018584868389518152, "loss": 0.7158, "step": 1225 }, { "epoch": 0.4, "learning_rate": 0.00018517571650536056, "loss": 0.601, "step": 1250 }, { "epoch": 0.41, "learning_rate": 0.0001844883949249013, "loss": 0.6362, "step": 1275 }, { "epoch": 0.42, "learning_rate": 0.00018378683498429402, "loss": 0.5795, "step": 1300 }, { "epoch": 0.42, "eval_loss": 1.1198811531066895, "eval_runtime": 76.7858, "eval_samples_per_second": 7.085, "eval_steps_per_second": 0.886, "step": 1300 }, { "epoch": 0.42, "learning_rate": 0.00018307115491354064, "loss": 0.6037, "step": 1325 }, { "epoch": 0.43, "learning_rate": 0.0001823414753222303, "loss": 0.5866, "step": 1350 }, { "epoch": 0.44, "learning_rate": 0.00018159791917921362, "loss": 0.6389, "step": 1375 }, { "epoch": 0.45, "learning_rate": 0.00018084061179187946, "loss": 0.6039, "step": 1400 }, { "epoch": 0.45, "eval_loss": 1.1011444330215454, "eval_runtime": 76.8374, "eval_samples_per_second": 7.08, "eval_steps_per_second": 0.885, "step": 1400 }, { "epoch": 0.46, "learning_rate": 0.00018006968078503746, "loss": 0.5734, "step": 1425 }, { "epoch": 0.46, "learning_rate": 0.0001792852560794103, "loss": 0.565, "step": 1450 }, { "epoch": 0.47, "learning_rate": 0.00017848746986973883, "loss": 0.6221, "step": 1475 }, { "epoch": 0.48, "learning_rate": 0.00017767645660250384, "loss": 0.6266, "step": 1500 }, { "epoch": 0.48, "eval_loss": 1.0841057300567627, "eval_runtime": 76.7968, "eval_samples_per_second": 7.084, "eval_steps_per_second": 0.885, "step": 1500 }, { "epoch": 0.49, "learning_rate": 0.00017685235295326893, "loss": 0.5798, "step": 1525 }, { "epoch": 0.5, "learning_rate": 0.00017601529780364688, "loss": 0.5628, "step": 1550 }, { "epoch": 0.5, "learning_rate": 0.0001751654322178951, "loss": 0.5943, "step": 1575 }, { "epoch": 0.51, "learning_rate": 0.00017430289941914258, "loss": 0.57, "step": 1600 }, { "epoch": 0.51, "eval_loss": 1.067173957824707, "eval_runtime": 76.7443, "eval_samples_per_second": 7.088, "eval_steps_per_second": 0.886, "step": 1600 }, { "epoch": 0.52, "learning_rate": 0.00017342784476525347, "loss": 0.6021, "step": 1625 }, { "epoch": 0.53, "learning_rate": 0.00017254041572433058, "loss": 0.5314, "step": 1650 }, { "epoch": 0.54, "learning_rate": 0.00017164076184986345, "loss": 0.5162, "step": 1675 }, { "epoch": 0.55, "learning_rate": 0.00017072903475552503, "loss": 0.5693, "step": 1700 }, { "epoch": 0.55, "eval_loss": 1.059133768081665, "eval_runtime": 76.8125, "eval_samples_per_second": 7.082, "eval_steps_per_second": 0.885, "step": 1700 }, { "epoch": 0.55, "learning_rate": 0.00016980538808962102, "loss": 0.5466, "step": 1725 }, { "epoch": 0.56, "learning_rate": 0.00016886997750919619, "loss": 0.4934, "step": 1750 }, { "epoch": 0.57, "learning_rate": 0.0001679229606538026, "loss": 0.5666, "step": 1775 }, { "epoch": 0.58, "learning_rate": 0.00016696449711893344, "loss": 0.4821, "step": 1800 }, { "epoch": 0.58, "eval_loss": 1.0875073671340942, "eval_runtime": 76.8354, "eval_samples_per_second": 7.08, "eval_steps_per_second": 0.885, "step": 1800 }, { "epoch": 0.59, "learning_rate": 0.00016599474842912722, "loss": 0.5816, "step": 1825 }, { "epoch": 0.59, "learning_rate": 0.0001650138780107469, "loss": 0.5616, "step": 1850 }, { "epoch": 0.6, "learning_rate": 0.00016402205116443868, "loss": 0.5239, "step": 1875 }, { "epoch": 0.61, "learning_rate": 0.00016301943503727468, "loss": 0.4953, "step": 1900 }, { "epoch": 0.61, "eval_loss": 1.0953130722045898, "eval_runtime": 76.7571, "eval_samples_per_second": 7.087, "eval_steps_per_second": 0.886, "step": 1900 }, { "epoch": 0.62, "learning_rate": 0.00016200619859458467, "loss": 0.5118, "step": 1925 }, { "epoch": 0.63, "learning_rate": 0.00016098251259148125, "loss": 0.5194, "step": 1950 }, { "epoch": 0.63, "learning_rate": 0.00015994854954408337, "loss": 0.5503, "step": 1975 }, { "epoch": 0.64, "learning_rate": 0.00015890448370044317, "loss": 0.4673, "step": 2000 }, { "epoch": 0.64, "eval_loss": 1.1145678758621216, "eval_runtime": 76.7971, "eval_samples_per_second": 7.084, "eval_steps_per_second": 0.885, "step": 2000 }, { "epoch": 0.65, "learning_rate": 0.0001578504910111811, "loss": 0.5086, "step": 2025 }, { "epoch": 0.66, "learning_rate": 0.00015678674909983364, "loss": 0.4686, "step": 2050 }, { "epoch": 0.67, "learning_rate": 0.00015571343723291963, "loss": 0.5025, "step": 2075 }, { "epoch": 0.67, "learning_rate": 0.00015463073628972936, "loss": 0.5223, "step": 2100 }, { "epoch": 0.67, "eval_loss": 1.075910210609436, "eval_runtime": 76.8054, "eval_samples_per_second": 7.083, "eval_steps_per_second": 0.885, "step": 2100 }, { "epoch": 0.68, "learning_rate": 0.000153538828731842, "loss": 0.4457, "step": 2125 }, { "epoch": 0.69, "learning_rate": 0.00015243789857237645, "loss": 0.471, "step": 2150 }, { "epoch": 0.7, "learning_rate": 0.00015132813134498054, "loss": 0.4692, "step": 2175 }, { "epoch": 0.71, "learning_rate": 0.000150209714072564, "loss": 0.4661, "step": 2200 }, { "epoch": 0.71, "eval_loss": 1.0707464218139648, "eval_runtime": 76.7556, "eval_samples_per_second": 7.087, "eval_steps_per_second": 0.886, "step": 2200 }, { "epoch": 0.71, "learning_rate": 0.00014908283523578077, "loss": 0.442, "step": 2225 }, { "epoch": 0.72, "learning_rate": 0.00014794768474126518, "loss": 0.5075, "step": 2250 }, { "epoch": 0.73, "learning_rate": 0.00014680445388962814, "loss": 0.4969, "step": 2275 }, { "epoch": 0.74, "learning_rate": 0.00014565333534321826, "loss": 0.4642, "step": 2300 }, { "epoch": 0.74, "eval_loss": 1.0463252067565918, "eval_runtime": 76.7954, "eval_samples_per_second": 7.084, "eval_steps_per_second": 0.885, "step": 2300 }, { "epoch": 0.75, "learning_rate": 0.0001444945230936536, "loss": 0.4412, "step": 2325 }, { "epoch": 0.75, "learning_rate": 0.00014332821242912927, "loss": 0.4657, "step": 2350 }, { "epoch": 0.76, "learning_rate": 0.0001421545999015066, "loss": 0.4311, "step": 2375 }, { "epoch": 0.77, "learning_rate": 0.00014097388329318943, "loss": 0.45, "step": 2400 }, { "epoch": 0.77, "eval_loss": 1.076710820198059, "eval_runtime": 76.7708, "eval_samples_per_second": 7.086, "eval_steps_per_second": 0.886, "step": 2400 }, { "epoch": 0.78, "learning_rate": 0.00013978626158379285, "loss": 0.4336, "step": 2425 }, { "epoch": 0.79, "learning_rate": 0.00013859193491661036, "loss": 0.4002, "step": 2450 }, { "epoch": 0.79, "learning_rate": 0.0001373911045648846, "loss": 0.4141, "step": 2475 }, { "epoch": 0.8, "learning_rate": 0.00013618397289788832, "loss": 0.452, "step": 2500 }, { "epoch": 0.8, "eval_loss": 1.085994005203247, "eval_runtime": 76.8556, "eval_samples_per_second": 7.078, "eval_steps_per_second": 0.885, "step": 2500 }, { "epoch": 0.81, "learning_rate": 0.00013497074334681966, "loss": 0.3574, "step": 2525 }, { "epoch": 0.82, "learning_rate": 0.00013375162037051954, "loss": 0.4107, "step": 2550 }, { "epoch": 0.83, "learning_rate": 0.000132526809421015, "loss": 0.4616, "step": 2575 }, { "epoch": 0.83, "learning_rate": 0.0001312965169088957, "loss": 0.4397, "step": 2600 }, { "epoch": 0.83, "eval_loss": 1.088114619255066, "eval_runtime": 76.7923, "eval_samples_per_second": 7.084, "eval_steps_per_second": 0.886, "step": 2600 }, { "epoch": 0.84, "learning_rate": 0.00013006095016852848, "loss": 0.3993, "step": 2625 }, { "epoch": 0.85, "learning_rate": 0.0001288203174231168, "loss": 0.3973, "step": 2650 }, { "epoch": 0.86, "learning_rate": 0.00012757482774960964, "loss": 0.3843, "step": 2675 }, { "epoch": 0.87, "learning_rate": 0.00012632469104346722, "loss": 0.3932, "step": 2700 }, { "epoch": 0.87, "eval_loss": 1.0900908708572388, "eval_runtime": 76.8201, "eval_samples_per_second": 7.081, "eval_steps_per_second": 0.885, "step": 2700 }, { "epoch": 0.87, "learning_rate": 0.00012507011798328823, "loss": 0.367, "step": 2725 }, { "epoch": 0.88, "learning_rate": 0.00012381131999530563, "loss": 0.4149, "step": 2750 }, { "epoch": 0.89, "learning_rate": 0.000122548509217756, "loss": 0.3864, "step": 2775 }, { "epoch": 0.9, "learning_rate": 0.00012128189846512894, "loss": 0.3654, "step": 2800 }, { "epoch": 0.9, "eval_loss": 1.0753364562988281, "eval_runtime": 76.8365, "eval_samples_per_second": 7.08, "eval_steps_per_second": 0.885, "step": 2800 }, { "epoch": 0.91, "learning_rate": 0.00012001170119230286, "loss": 0.387, "step": 2825 }, { "epoch": 0.91, "learning_rate": 0.00011873813145857249, "loss": 0.3551, "step": 2850 }, { "epoch": 0.92, "learning_rate": 0.00011746140389157462, "loss": 0.385, "step": 2875 }, { "epoch": 0.93, "learning_rate": 0.00011618173365111824, "loss": 0.3227, "step": 2900 }, { "epoch": 0.93, "eval_loss": 1.0998799800872803, "eval_runtime": 76.6275, "eval_samples_per_second": 7.099, "eval_steps_per_second": 0.887, "step": 2900 }, { "epoch": 0.94, "learning_rate": 0.00011489933639292469, "loss": 0.3459, "step": 2925 }, { "epoch": 0.95, "learning_rate": 0.00011361442823228453, "loss": 0.3505, "step": 2950 }, { "epoch": 0.95, "learning_rate": 0.00011232722570763667, "loss": 0.3644, "step": 2975 }, { "epoch": 0.96, "learning_rate": 0.00011103794574407657, "loss": 0.3238, "step": 3000 }, { "epoch": 0.96, "eval_loss": 1.1243908405303955, "eval_runtime": 76.7936, "eval_samples_per_second": 7.084, "eval_steps_per_second": 0.885, "step": 3000 } ], "max_steps": 6238, "num_train_epochs": 2, "total_flos": 6.802205860248576e+16, "trial_name": null, "trial_params": null }