{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "global_step": 102820, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "learning_rate": 0.0001, "loss": 2.9981, "step": 500 }, { "epoch": 0.05, "eval_loss": 0.7217285633087158, "eval_runtime": 1.4105, "eval_samples_per_second": 708.956, "eval_steps_per_second": 22.687, "step": 500 }, { "epoch": 0.1, "learning_rate": 9.95113369820172e-05, "loss": 0.3596, "step": 1000 }, { "epoch": 0.1, "eval_loss": 0.683592677116394, "eval_runtime": 1.425, "eval_samples_per_second": 701.743, "eval_steps_per_second": 22.456, "step": 1000 }, { "epoch": 0.15, "learning_rate": 9.90226739640344e-05, "loss": 0.3481, "step": 1500 }, { "epoch": 0.15, "eval_loss": 0.6578707695007324, "eval_runtime": 1.4239, "eval_samples_per_second": 702.301, "eval_steps_per_second": 22.474, "step": 1500 }, { "epoch": 0.19, "learning_rate": 9.85340109460516e-05, "loss": 0.3381, "step": 2000 }, { "epoch": 0.19, "eval_loss": 0.6479542255401611, "eval_runtime": 1.42, "eval_samples_per_second": 704.242, "eval_steps_per_second": 22.536, "step": 2000 }, { "epoch": 0.24, "learning_rate": 9.80453479280688e-05, "loss": 0.3289, "step": 2500 }, { "epoch": 0.24, "eval_loss": 0.6387728452682495, "eval_runtime": 1.4329, "eval_samples_per_second": 697.89, "eval_steps_per_second": 22.332, "step": 2500 }, { "epoch": 0.29, "learning_rate": 9.7556684910086e-05, "loss": 0.3279, "step": 3000 }, { "epoch": 0.29, "eval_loss": 0.6298181414604187, "eval_runtime": 1.4061, "eval_samples_per_second": 711.206, "eval_steps_per_second": 22.759, "step": 3000 }, { "epoch": 0.34, "learning_rate": 9.706802189210322e-05, "loss": 0.3217, "step": 3500 }, { "epoch": 0.34, "eval_loss": 0.62165367603302, "eval_runtime": 1.421, "eval_samples_per_second": 703.748, "eval_steps_per_second": 22.52, "step": 3500 }, { "epoch": 0.39, "learning_rate": 9.65793588741204e-05, "loss": 0.3165, "step": 4000 }, { "epoch": 0.39, "eval_loss": 0.616775393486023, "eval_runtime": 1.4244, "eval_samples_per_second": 702.031, "eval_steps_per_second": 22.465, "step": 4000 }, { "epoch": 0.44, "learning_rate": 9.609069585613761e-05, "loss": 0.3188, "step": 4500 }, { "epoch": 0.44, "eval_loss": 0.6099900007247925, "eval_runtime": 1.4195, "eval_samples_per_second": 704.489, "eval_steps_per_second": 22.544, "step": 4500 }, { "epoch": 0.49, "learning_rate": 9.560203283815481e-05, "loss": 0.3022, "step": 5000 }, { "epoch": 0.49, "eval_loss": 0.6031085252761841, "eval_runtime": 1.4165, "eval_samples_per_second": 705.968, "eval_steps_per_second": 22.591, "step": 5000 }, { "epoch": 0.53, "learning_rate": 9.511336982017201e-05, "loss": 0.3167, "step": 5500 }, { "epoch": 0.53, "eval_loss": 0.5986317992210388, "eval_runtime": 1.413, "eval_samples_per_second": 707.704, "eval_steps_per_second": 22.647, "step": 5500 }, { "epoch": 0.58, "learning_rate": 9.462470680218921e-05, "loss": 0.3123, "step": 6000 }, { "epoch": 0.58, "eval_loss": 0.594712495803833, "eval_runtime": 1.4076, "eval_samples_per_second": 710.451, "eval_steps_per_second": 22.734, "step": 6000 }, { "epoch": 0.63, "learning_rate": 9.413604378420641e-05, "loss": 0.3102, "step": 6500 }, { "epoch": 0.63, "eval_loss": 0.589926540851593, "eval_runtime": 1.4215, "eval_samples_per_second": 703.5, "eval_steps_per_second": 22.512, "step": 6500 }, { "epoch": 0.68, "learning_rate": 9.364738076622361e-05, "loss": 0.3029, "step": 7000 }, { "epoch": 0.68, "eval_loss": 0.5852935910224915, "eval_runtime": 1.4145, "eval_samples_per_second": 706.961, "eval_steps_per_second": 22.623, "step": 7000 }, { "epoch": 0.73, "learning_rate": 9.315871774824082e-05, "loss": 0.2999, "step": 7500 }, { "epoch": 0.73, "eval_loss": 0.5810648798942566, "eval_runtime": 1.4374, "eval_samples_per_second": 695.724, "eval_steps_per_second": 22.263, "step": 7500 }, { "epoch": 0.78, "learning_rate": 9.267005473025801e-05, "loss": 0.2898, "step": 8000 }, { "epoch": 0.78, "eval_loss": 0.5774234533309937, "eval_runtime": 1.422, "eval_samples_per_second": 703.256, "eval_steps_per_second": 22.504, "step": 8000 }, { "epoch": 0.83, "learning_rate": 9.218139171227522e-05, "loss": 0.2924, "step": 8500 }, { "epoch": 0.83, "eval_loss": 0.5741690993309021, "eval_runtime": 1.4369, "eval_samples_per_second": 695.963, "eval_steps_per_second": 22.271, "step": 8500 }, { "epoch": 0.88, "learning_rate": 9.169272869429242e-05, "loss": 0.2965, "step": 9000 }, { "epoch": 0.88, "eval_loss": 0.5683675408363342, "eval_runtime": 1.4175, "eval_samples_per_second": 705.474, "eval_steps_per_second": 22.575, "step": 9000 }, { "epoch": 0.92, "learning_rate": 9.120406567630962e-05, "loss": 0.3003, "step": 9500 }, { "epoch": 0.92, "eval_loss": 0.5660465359687805, "eval_runtime": 1.4225, "eval_samples_per_second": 703.011, "eval_steps_per_second": 22.496, "step": 9500 }, { "epoch": 0.97, "learning_rate": 9.071540265832682e-05, "loss": 0.2877, "step": 10000 }, { "epoch": 0.97, "eval_loss": 0.5636941194534302, "eval_runtime": 1.4249, "eval_samples_per_second": 701.786, "eval_steps_per_second": 22.457, "step": 10000 }, { "epoch": 1.02, "learning_rate": 9.022673964034402e-05, "loss": 0.28, "step": 10500 }, { "epoch": 1.02, "eval_loss": 0.5614505410194397, "eval_runtime": 1.418, "eval_samples_per_second": 705.226, "eval_steps_per_second": 22.567, "step": 10500 }, { "epoch": 1.07, "learning_rate": 8.973807662236122e-05, "loss": 0.2596, "step": 11000 }, { "epoch": 1.07, "eval_loss": 0.560372531414032, "eval_runtime": 1.419, "eval_samples_per_second": 704.735, "eval_steps_per_second": 22.552, "step": 11000 }, { "epoch": 1.12, "learning_rate": 8.924941360437843e-05, "loss": 0.2629, "step": 11500 }, { "epoch": 1.12, "eval_loss": 0.5570399165153503, "eval_runtime": 1.422, "eval_samples_per_second": 703.254, "eval_steps_per_second": 22.504, "step": 11500 }, { "epoch": 1.17, "learning_rate": 8.876075058639562e-05, "loss": 0.2588, "step": 12000 }, { "epoch": 1.17, "eval_loss": 0.5555282831192017, "eval_runtime": 1.4115, "eval_samples_per_second": 708.456, "eval_steps_per_second": 22.671, "step": 12000 }, { "epoch": 1.22, "learning_rate": 8.827208756841283e-05, "loss": 0.2623, "step": 12500 }, { "epoch": 1.22, "eval_loss": 0.5514973402023315, "eval_runtime": 1.4195, "eval_samples_per_second": 704.486, "eval_steps_per_second": 22.544, "step": 12500 }, { "epoch": 1.26, "learning_rate": 8.778342455043003e-05, "loss": 0.2553, "step": 13000 }, { "epoch": 1.26, "eval_loss": 0.5486002564430237, "eval_runtime": 1.42, "eval_samples_per_second": 704.24, "eval_steps_per_second": 22.536, "step": 13000 }, { "epoch": 1.31, "learning_rate": 8.729476153244723e-05, "loss": 0.262, "step": 13500 }, { "epoch": 1.31, "eval_loss": 0.5457084774971008, "eval_runtime": 1.4135, "eval_samples_per_second": 707.481, "eval_steps_per_second": 22.639, "step": 13500 }, { "epoch": 1.36, "learning_rate": 8.680609851446443e-05, "loss": 0.2613, "step": 14000 }, { "epoch": 1.36, "eval_loss": 0.5417377948760986, "eval_runtime": 1.4156, "eval_samples_per_second": 706.429, "eval_steps_per_second": 22.606, "step": 14000 }, { "epoch": 1.41, "learning_rate": 8.631743549648163e-05, "loss": 0.2679, "step": 14500 }, { "epoch": 1.41, "eval_loss": 0.5402753949165344, "eval_runtime": 1.4211, "eval_samples_per_second": 703.665, "eval_steps_per_second": 22.517, "step": 14500 }, { "epoch": 1.46, "learning_rate": 8.582877247849883e-05, "loss": 0.2537, "step": 15000 }, { "epoch": 1.46, "eval_loss": 0.5380659699440002, "eval_runtime": 1.4234, "eval_samples_per_second": 702.521, "eval_steps_per_second": 22.481, "step": 15000 }, { "epoch": 1.51, "learning_rate": 8.534010946051603e-05, "loss": 0.2502, "step": 15500 }, { "epoch": 1.51, "eval_loss": 0.532864511013031, "eval_runtime": 1.4508, "eval_samples_per_second": 689.292, "eval_steps_per_second": 22.057, "step": 15500 }, { "epoch": 1.56, "learning_rate": 8.485144644253323e-05, "loss": 0.2594, "step": 16000 }, { "epoch": 1.56, "eval_loss": 0.5308486223220825, "eval_runtime": 1.4175, "eval_samples_per_second": 705.473, "eval_steps_per_second": 22.575, "step": 16000 }, { "epoch": 1.6, "learning_rate": 8.436278342455043e-05, "loss": 0.2495, "step": 16500 }, { "epoch": 1.6, "eval_loss": 0.5297316312789917, "eval_runtime": 1.416, "eval_samples_per_second": 706.219, "eval_steps_per_second": 22.599, "step": 16500 }, { "epoch": 1.65, "learning_rate": 8.387412040656764e-05, "loss": 0.2499, "step": 17000 }, { "epoch": 1.65, "eval_loss": 0.5281020402908325, "eval_runtime": 1.4056, "eval_samples_per_second": 711.457, "eval_steps_per_second": 22.767, "step": 17000 }, { "epoch": 1.7, "learning_rate": 8.338545738858483e-05, "loss": 0.2578, "step": 17500 }, { "epoch": 1.7, "eval_loss": 0.5247856378555298, "eval_runtime": 1.4135, "eval_samples_per_second": 707.459, "eval_steps_per_second": 22.639, "step": 17500 }, { "epoch": 1.75, "learning_rate": 8.289679437060204e-05, "loss": 0.2497, "step": 18000 }, { "epoch": 1.75, "eval_loss": 0.5230081677436829, "eval_runtime": 1.4438, "eval_samples_per_second": 692.611, "eval_steps_per_second": 22.164, "step": 18000 }, { "epoch": 1.8, "learning_rate": 8.240813135261924e-05, "loss": 0.2565, "step": 18500 }, { "epoch": 1.8, "eval_loss": 0.5200880765914917, "eval_runtime": 1.4398, "eval_samples_per_second": 694.522, "eval_steps_per_second": 22.225, "step": 18500 }, { "epoch": 1.85, "learning_rate": 8.191946833463644e-05, "loss": 0.2523, "step": 19000 }, { "epoch": 1.85, "eval_loss": 0.5170234441757202, "eval_runtime": 1.4299, "eval_samples_per_second": 699.349, "eval_steps_per_second": 22.379, "step": 19000 }, { "epoch": 1.9, "learning_rate": 8.143080531665364e-05, "loss": 0.2501, "step": 19500 }, { "epoch": 1.9, "eval_loss": 0.5144360065460205, "eval_runtime": 1.4239, "eval_samples_per_second": 702.275, "eval_steps_per_second": 22.473, "step": 19500 }, { "epoch": 1.95, "learning_rate": 8.094214229867084e-05, "loss": 0.2488, "step": 20000 }, { "epoch": 1.95, "eval_loss": 0.5127125382423401, "eval_runtime": 1.4244, "eval_samples_per_second": 702.027, "eval_steps_per_second": 22.465, "step": 20000 }, { "epoch": 1.99, "learning_rate": 8.045347928068804e-05, "loss": 0.2465, "step": 20500 }, { "epoch": 1.99, "eval_loss": 0.511444091796875, "eval_runtime": 1.422, "eval_samples_per_second": 703.257, "eval_steps_per_second": 22.504, "step": 20500 }, { "epoch": 2.04, "learning_rate": 7.996481626270525e-05, "loss": 0.2282, "step": 21000 }, { "epoch": 2.04, "eval_loss": 0.512248158454895, "eval_runtime": 1.419, "eval_samples_per_second": 704.734, "eval_steps_per_second": 22.551, "step": 21000 }, { "epoch": 2.09, "learning_rate": 7.947615324472244e-05, "loss": 0.2251, "step": 21500 }, { "epoch": 2.09, "eval_loss": 0.5102840065956116, "eval_runtime": 1.414, "eval_samples_per_second": 707.209, "eval_steps_per_second": 22.631, "step": 21500 }, { "epoch": 2.14, "learning_rate": 7.898749022673965e-05, "loss": 0.2172, "step": 22000 }, { "epoch": 2.14, "eval_loss": 0.5100817680358887, "eval_runtime": 1.4279, "eval_samples_per_second": 700.321, "eval_steps_per_second": 22.41, "step": 22000 }, { "epoch": 2.19, "learning_rate": 7.849882720875685e-05, "loss": 0.2143, "step": 22500 }, { "epoch": 2.19, "eval_loss": 0.509198784828186, "eval_runtime": 1.4398, "eval_samples_per_second": 694.524, "eval_steps_per_second": 22.225, "step": 22500 }, { "epoch": 2.24, "learning_rate": 7.801016419077405e-05, "loss": 0.2215, "step": 23000 }, { "epoch": 2.24, "eval_loss": 0.5054255127906799, "eval_runtime": 1.4289, "eval_samples_per_second": 699.834, "eval_steps_per_second": 22.395, "step": 23000 }, { "epoch": 2.29, "learning_rate": 7.752150117279125e-05, "loss": 0.2175, "step": 23500 }, { "epoch": 2.29, "eval_loss": 0.5042800307273865, "eval_runtime": 1.4234, "eval_samples_per_second": 702.521, "eval_steps_per_second": 22.481, "step": 23500 }, { "epoch": 2.33, "learning_rate": 7.703283815480845e-05, "loss": 0.2181, "step": 24000 }, { "epoch": 2.33, "eval_loss": 0.5006986260414124, "eval_runtime": 1.419, "eval_samples_per_second": 704.732, "eval_steps_per_second": 22.551, "step": 24000 }, { "epoch": 2.38, "learning_rate": 7.654417513682565e-05, "loss": 0.2229, "step": 24500 }, { "epoch": 2.38, "eval_loss": 0.49812304973602295, "eval_runtime": 1.4369, "eval_samples_per_second": 695.962, "eval_steps_per_second": 22.271, "step": 24500 }, { "epoch": 2.43, "learning_rate": 7.605551211884286e-05, "loss": 0.2103, "step": 25000 }, { "epoch": 2.43, "eval_loss": 0.49662986397743225, "eval_runtime": 1.4066, "eval_samples_per_second": 710.955, "eval_steps_per_second": 22.751, "step": 25000 }, { "epoch": 2.48, "learning_rate": 7.556684910086005e-05, "loss": 0.2195, "step": 25500 }, { "epoch": 2.48, "eval_loss": 0.4949464499950409, "eval_runtime": 1.4145, "eval_samples_per_second": 706.961, "eval_steps_per_second": 22.623, "step": 25500 }, { "epoch": 2.53, "learning_rate": 7.507818608287726e-05, "loss": 0.2197, "step": 26000 }, { "epoch": 2.53, "eval_loss": 0.49265730381011963, "eval_runtime": 1.4441, "eval_samples_per_second": 692.484, "eval_steps_per_second": 22.159, "step": 26000 }, { "epoch": 2.58, "learning_rate": 7.458952306489444e-05, "loss": 0.2163, "step": 26500 }, { "epoch": 2.58, "eval_loss": 0.4933662414550781, "eval_runtime": 1.4196, "eval_samples_per_second": 704.41, "eval_steps_per_second": 22.541, "step": 26500 }, { "epoch": 2.63, "learning_rate": 7.410086004691166e-05, "loss": 0.2203, "step": 27000 }, { "epoch": 2.63, "eval_loss": 0.4912818670272827, "eval_runtime": 1.4225, "eval_samples_per_second": 703.011, "eval_steps_per_second": 22.496, "step": 27000 }, { "epoch": 2.67, "learning_rate": 7.361219702892886e-05, "loss": 0.2131, "step": 27500 }, { "epoch": 2.67, "eval_loss": 0.49019622802734375, "eval_runtime": 1.4165, "eval_samples_per_second": 705.97, "eval_steps_per_second": 22.591, "step": 27500 }, { "epoch": 2.72, "learning_rate": 7.312353401094606e-05, "loss": 0.2192, "step": 28000 }, { "epoch": 2.72, "eval_loss": 0.48748642206192017, "eval_runtime": 1.419, "eval_samples_per_second": 704.732, "eval_steps_per_second": 22.551, "step": 28000 }, { "epoch": 2.77, "learning_rate": 7.263487099296326e-05, "loss": 0.216, "step": 28500 }, { "epoch": 2.77, "eval_loss": 0.4867847263813019, "eval_runtime": 1.4078, "eval_samples_per_second": 710.341, "eval_steps_per_second": 22.731, "step": 28500 }, { "epoch": 2.82, "learning_rate": 7.214620797498047e-05, "loss": 0.2151, "step": 29000 }, { "epoch": 2.82, "eval_loss": 0.4847819209098816, "eval_runtime": 1.4175, "eval_samples_per_second": 705.473, "eval_steps_per_second": 22.575, "step": 29000 }, { "epoch": 2.87, "learning_rate": 7.165754495699765e-05, "loss": 0.2134, "step": 29500 }, { "epoch": 2.87, "eval_loss": 0.48309269547462463, "eval_runtime": 1.4264, "eval_samples_per_second": 701.053, "eval_steps_per_second": 22.434, "step": 29500 }, { "epoch": 2.92, "learning_rate": 7.116888193901487e-05, "loss": 0.215, "step": 30000 }, { "epoch": 2.92, "eval_loss": 0.4808345437049866, "eval_runtime": 1.4354, "eval_samples_per_second": 696.688, "eval_steps_per_second": 22.294, "step": 30000 }, { "epoch": 2.97, "learning_rate": 7.068021892103205e-05, "loss": 0.2149, "step": 30500 }, { "epoch": 2.97, "eval_loss": 0.48003917932510376, "eval_runtime": 1.4294, "eval_samples_per_second": 699.594, "eval_steps_per_second": 22.387, "step": 30500 }, { "epoch": 3.01, "learning_rate": 7.019155590304925e-05, "loss": 0.2081, "step": 31000 }, { "epoch": 3.01, "eval_loss": 0.4828941524028778, "eval_runtime": 1.4264, "eval_samples_per_second": 701.051, "eval_steps_per_second": 22.434, "step": 31000 }, { "epoch": 3.06, "learning_rate": 6.970289288506647e-05, "loss": 0.1851, "step": 31500 }, { "epoch": 3.06, "eval_loss": 0.48561614751815796, "eval_runtime": 1.4155, "eval_samples_per_second": 706.463, "eval_steps_per_second": 22.607, "step": 31500 }, { "epoch": 3.11, "learning_rate": 6.921422986708365e-05, "loss": 0.1888, "step": 32000 }, { "epoch": 3.11, "eval_loss": 0.48478779196739197, "eval_runtime": 1.4155, "eval_samples_per_second": 706.465, "eval_steps_per_second": 22.607, "step": 32000 }, { "epoch": 3.16, "learning_rate": 6.872556684910086e-05, "loss": 0.1916, "step": 32500 }, { "epoch": 3.16, "eval_loss": 0.4795476198196411, "eval_runtime": 1.4239, "eval_samples_per_second": 702.273, "eval_steps_per_second": 22.473, "step": 32500 }, { "epoch": 3.21, "learning_rate": 6.823690383111806e-05, "loss": 0.1932, "step": 33000 }, { "epoch": 3.21, "eval_loss": 0.47898271679878235, "eval_runtime": 1.42, "eval_samples_per_second": 704.241, "eval_steps_per_second": 22.536, "step": 33000 }, { "epoch": 3.26, "learning_rate": 6.774824081313526e-05, "loss": 0.1882, "step": 33500 }, { "epoch": 3.26, "eval_loss": 0.48221901059150696, "eval_runtime": 1.4234, "eval_samples_per_second": 702.521, "eval_steps_per_second": 22.481, "step": 33500 }, { "epoch": 3.31, "learning_rate": 6.725957779515246e-05, "loss": 0.1845, "step": 34000 }, { "epoch": 3.31, "eval_loss": 0.479130357503891, "eval_runtime": 1.4215, "eval_samples_per_second": 703.503, "eval_steps_per_second": 22.512, "step": 34000 }, { "epoch": 3.36, "learning_rate": 6.677091477716966e-05, "loss": 0.1895, "step": 34500 }, { "epoch": 3.36, "eval_loss": 0.4773789644241333, "eval_runtime": 1.4294, "eval_samples_per_second": 699.592, "eval_steps_per_second": 22.387, "step": 34500 }, { "epoch": 3.4, "learning_rate": 6.628225175918686e-05, "loss": 0.1909, "step": 35000 }, { "epoch": 3.4, "eval_loss": 0.4763247072696686, "eval_runtime": 1.409, "eval_samples_per_second": 709.702, "eval_steps_per_second": 22.71, "step": 35000 }, { "epoch": 3.45, "learning_rate": 6.579358874120408e-05, "loss": 0.1841, "step": 35500 }, { "epoch": 3.45, "eval_loss": 0.4759540259838104, "eval_runtime": 1.421, "eval_samples_per_second": 703.747, "eval_steps_per_second": 22.52, "step": 35500 }, { "epoch": 3.5, "learning_rate": 6.530492572322126e-05, "loss": 0.1882, "step": 36000 }, { "epoch": 3.5, "eval_loss": 0.4739590585231781, "eval_runtime": 1.417, "eval_samples_per_second": 705.724, "eval_steps_per_second": 22.583, "step": 36000 }, { "epoch": 3.55, "learning_rate": 6.481626270523847e-05, "loss": 0.1902, "step": 36500 }, { "epoch": 3.55, "eval_loss": 0.47059980034828186, "eval_runtime": 1.4215, "eval_samples_per_second": 703.504, "eval_steps_per_second": 22.512, "step": 36500 }, { "epoch": 3.6, "learning_rate": 6.432759968725567e-05, "loss": 0.1924, "step": 37000 }, { "epoch": 3.6, "eval_loss": 0.46917036175727844, "eval_runtime": 1.4388, "eval_samples_per_second": 695.005, "eval_steps_per_second": 22.24, "step": 37000 }, { "epoch": 3.65, "learning_rate": 6.383893666927287e-05, "loss": 0.1845, "step": 37500 }, { "epoch": 3.65, "eval_loss": 0.46856725215911865, "eval_runtime": 1.4135, "eval_samples_per_second": 707.458, "eval_steps_per_second": 22.639, "step": 37500 }, { "epoch": 3.7, "learning_rate": 6.335027365129007e-05, "loss": 0.1892, "step": 38000 }, { "epoch": 3.7, "eval_loss": 0.46638262271881104, "eval_runtime": 1.4632, "eval_samples_per_second": 683.445, "eval_steps_per_second": 21.87, "step": 38000 }, { "epoch": 3.74, "learning_rate": 6.286161063330727e-05, "loss": 0.1849, "step": 38500 }, { "epoch": 3.74, "eval_loss": 0.46737831830978394, "eval_runtime": 1.4319, "eval_samples_per_second": 698.377, "eval_steps_per_second": 22.348, "step": 38500 }, { "epoch": 3.79, "learning_rate": 6.237294761532447e-05, "loss": 0.1883, "step": 39000 }, { "epoch": 3.79, "eval_loss": 0.46423232555389404, "eval_runtime": 1.4249, "eval_samples_per_second": 701.787, "eval_steps_per_second": 22.457, "step": 39000 }, { "epoch": 3.84, "learning_rate": 6.188428459734168e-05, "loss": 0.1821, "step": 39500 }, { "epoch": 3.84, "eval_loss": 0.4651487171649933, "eval_runtime": 1.4135, "eval_samples_per_second": 707.454, "eval_steps_per_second": 22.639, "step": 39500 }, { "epoch": 3.89, "learning_rate": 6.139562157935887e-05, "loss": 0.1905, "step": 40000 }, { "epoch": 3.89, "eval_loss": 0.462035208940506, "eval_runtime": 1.417, "eval_samples_per_second": 705.722, "eval_steps_per_second": 22.583, "step": 40000 }, { "epoch": 3.94, "learning_rate": 6.090695856137608e-05, "loss": 0.185, "step": 40500 }, { "epoch": 3.94, "eval_loss": 0.4627071022987366, "eval_runtime": 1.4179, "eval_samples_per_second": 705.261, "eval_steps_per_second": 22.568, "step": 40500 }, { "epoch": 3.99, "learning_rate": 6.0418295543393276e-05, "loss": 0.19, "step": 41000 }, { "epoch": 3.99, "eval_loss": 0.4600967466831207, "eval_runtime": 1.422, "eval_samples_per_second": 703.255, "eval_steps_per_second": 22.504, "step": 41000 }, { "epoch": 4.04, "learning_rate": 5.992963252541048e-05, "loss": 0.1734, "step": 41500 }, { "epoch": 4.04, "eval_loss": 0.46835413575172424, "eval_runtime": 1.4105, "eval_samples_per_second": 708.959, "eval_steps_per_second": 22.687, "step": 41500 }, { "epoch": 4.08, "learning_rate": 5.944096950742768e-05, "loss": 0.1665, "step": 42000 }, { "epoch": 4.08, "eval_loss": 0.4674428403377533, "eval_runtime": 1.4224, "eval_samples_per_second": 703.013, "eval_steps_per_second": 22.496, "step": 42000 }, { "epoch": 4.13, "learning_rate": 5.895230648944489e-05, "loss": 0.1621, "step": 42500 }, { "epoch": 4.13, "eval_loss": 0.46939995884895325, "eval_runtime": 1.418, "eval_samples_per_second": 705.227, "eval_steps_per_second": 22.567, "step": 42500 }, { "epoch": 4.18, "learning_rate": 5.846364347146208e-05, "loss": 0.1633, "step": 43000 }, { "epoch": 4.18, "eval_loss": 0.4672936499118805, "eval_runtime": 1.4229, "eval_samples_per_second": 702.769, "eval_steps_per_second": 22.489, "step": 43000 }, { "epoch": 4.23, "learning_rate": 5.797498045347929e-05, "loss": 0.1612, "step": 43500 }, { "epoch": 4.23, "eval_loss": 0.4673324525356293, "eval_runtime": 1.42, "eval_samples_per_second": 704.241, "eval_steps_per_second": 22.536, "step": 43500 }, { "epoch": 4.28, "learning_rate": 5.7486317435496486e-05, "loss": 0.1644, "step": 44000 }, { "epoch": 4.28, "eval_loss": 0.4646117091178894, "eval_runtime": 1.4244, "eval_samples_per_second": 702.031, "eval_steps_per_second": 22.465, "step": 44000 }, { "epoch": 4.33, "learning_rate": 5.699765441751369e-05, "loss": 0.1655, "step": 44500 }, { "epoch": 4.33, "eval_loss": 0.46449655294418335, "eval_runtime": 1.4359, "eval_samples_per_second": 696.446, "eval_steps_per_second": 22.286, "step": 44500 }, { "epoch": 4.38, "learning_rate": 5.6508991399530885e-05, "loss": 0.1627, "step": 45000 }, { "epoch": 4.38, "eval_loss": 0.4624975025653839, "eval_runtime": 1.4244, "eval_samples_per_second": 702.032, "eval_steps_per_second": 22.465, "step": 45000 }, { "epoch": 4.43, "learning_rate": 5.6020328381548085e-05, "loss": 0.1675, "step": 45500 }, { "epoch": 4.43, "eval_loss": 0.46210145950317383, "eval_runtime": 1.4274, "eval_samples_per_second": 700.566, "eval_steps_per_second": 22.418, "step": 45500 }, { "epoch": 4.47, "learning_rate": 5.553166536356529e-05, "loss": 0.1648, "step": 46000 }, { "epoch": 4.47, "eval_loss": 0.4633449614048004, "eval_runtime": 1.4487, "eval_samples_per_second": 690.281, "eval_steps_per_second": 22.089, "step": 46000 }, { "epoch": 4.52, "learning_rate": 5.5043002345582483e-05, "loss": 0.1691, "step": 46500 }, { "epoch": 4.52, "eval_loss": 0.4609707295894623, "eval_runtime": 1.4403, "eval_samples_per_second": 694.283, "eval_steps_per_second": 22.217, "step": 46500 }, { "epoch": 4.57, "learning_rate": 5.455433932759969e-05, "loss": 0.1642, "step": 47000 }, { "epoch": 4.57, "eval_loss": 0.46095407009124756, "eval_runtime": 1.4319, "eval_samples_per_second": 698.378, "eval_steps_per_second": 22.348, "step": 47000 }, { "epoch": 4.62, "learning_rate": 5.406567630961689e-05, "loss": 0.1666, "step": 47500 }, { "epoch": 4.62, "eval_loss": 0.46066999435424805, "eval_runtime": 1.4264, "eval_samples_per_second": 701.053, "eval_steps_per_second": 22.434, "step": 47500 }, { "epoch": 4.67, "learning_rate": 5.3577013291634095e-05, "loss": 0.167, "step": 48000 }, { "epoch": 4.67, "eval_loss": 0.45587822794914246, "eval_runtime": 1.4344, "eval_samples_per_second": 697.169, "eval_steps_per_second": 22.309, "step": 48000 }, { "epoch": 4.72, "learning_rate": 5.308835027365129e-05, "loss": 0.1691, "step": 48500 }, { "epoch": 4.72, "eval_loss": 0.45380640029907227, "eval_runtime": 1.4344, "eval_samples_per_second": 697.166, "eval_steps_per_second": 22.309, "step": 48500 }, { "epoch": 4.77, "learning_rate": 5.2599687255668494e-05, "loss": 0.1674, "step": 49000 }, { "epoch": 4.77, "eval_loss": 0.45414891839027405, "eval_runtime": 1.4319, "eval_samples_per_second": 698.38, "eval_steps_per_second": 22.348, "step": 49000 }, { "epoch": 4.81, "learning_rate": 5.2111024237685694e-05, "loss": 0.1613, "step": 49500 }, { "epoch": 4.81, "eval_loss": 0.45553380250930786, "eval_runtime": 1.4458, "eval_samples_per_second": 691.661, "eval_steps_per_second": 22.133, "step": 49500 }, { "epoch": 4.86, "learning_rate": 5.16223612197029e-05, "loss": 0.1613, "step": 50000 }, { "epoch": 4.86, "eval_loss": 0.4527079463005066, "eval_runtime": 1.4195, "eval_samples_per_second": 704.487, "eval_steps_per_second": 22.544, "step": 50000 }, { "epoch": 4.91, "learning_rate": 5.113369820172009e-05, "loss": 0.1639, "step": 50500 }, { "epoch": 4.91, "eval_loss": 0.44933873414993286, "eval_runtime": 1.416, "eval_samples_per_second": 706.217, "eval_steps_per_second": 22.599, "step": 50500 }, { "epoch": 4.96, "learning_rate": 5.06450351837373e-05, "loss": 0.1685, "step": 51000 }, { "epoch": 4.96, "eval_loss": 0.44989633560180664, "eval_runtime": 1.4289, "eval_samples_per_second": 699.837, "eval_steps_per_second": 22.395, "step": 51000 }, { "epoch": 5.01, "learning_rate": 5.01563721657545e-05, "loss": 0.1629, "step": 51500 }, { "epoch": 5.01, "eval_loss": 0.45488646626472473, "eval_runtime": 1.4239, "eval_samples_per_second": 702.276, "eval_steps_per_second": 22.473, "step": 51500 }, { "epoch": 5.06, "learning_rate": 4.9667709147771705e-05, "loss": 0.1484, "step": 52000 }, { "epoch": 5.06, "eval_loss": 0.4580441415309906, "eval_runtime": 1.416, "eval_samples_per_second": 706.223, "eval_steps_per_second": 22.599, "step": 52000 }, { "epoch": 5.11, "learning_rate": 4.9179046129788904e-05, "loss": 0.1468, "step": 52500 }, { "epoch": 5.11, "eval_loss": 0.4577222168445587, "eval_runtime": 1.4304, "eval_samples_per_second": 699.106, "eval_steps_per_second": 22.371, "step": 52500 }, { "epoch": 5.15, "learning_rate": 4.86903831118061e-05, "loss": 0.147, "step": 53000 }, { "epoch": 5.15, "eval_loss": 0.4562654197216034, "eval_runtime": 1.4284, "eval_samples_per_second": 700.079, "eval_steps_per_second": 22.403, "step": 53000 }, { "epoch": 5.2, "learning_rate": 4.8201720093823296e-05, "loss": 0.1486, "step": 53500 }, { "epoch": 5.2, "eval_loss": 0.4565419852733612, "eval_runtime": 1.4264, "eval_samples_per_second": 701.053, "eval_steps_per_second": 22.434, "step": 53500 }, { "epoch": 5.25, "learning_rate": 4.77130570758405e-05, "loss": 0.1461, "step": 54000 }, { "epoch": 5.25, "eval_loss": 0.45437780022621155, "eval_runtime": 1.4279, "eval_samples_per_second": 700.322, "eval_steps_per_second": 22.41, "step": 54000 }, { "epoch": 5.3, "learning_rate": 4.72243940578577e-05, "loss": 0.1435, "step": 54500 }, { "epoch": 5.3, "eval_loss": 0.4550324082374573, "eval_runtime": 1.4289, "eval_samples_per_second": 699.835, "eval_steps_per_second": 22.395, "step": 54500 }, { "epoch": 5.35, "learning_rate": 4.67357310398749e-05, "loss": 0.1463, "step": 55000 }, { "epoch": 5.35, "eval_loss": 0.4553817808628082, "eval_runtime": 1.4304, "eval_samples_per_second": 699.106, "eval_steps_per_second": 22.371, "step": 55000 }, { "epoch": 5.4, "learning_rate": 4.62470680218921e-05, "loss": 0.1495, "step": 55500 }, { "epoch": 5.4, "eval_loss": 0.45491766929626465, "eval_runtime": 1.4195, "eval_samples_per_second": 704.489, "eval_steps_per_second": 22.544, "step": 55500 }, { "epoch": 5.45, "learning_rate": 4.575840500390931e-05, "loss": 0.143, "step": 56000 }, { "epoch": 5.45, "eval_loss": 0.45461103320121765, "eval_runtime": 1.4225, "eval_samples_per_second": 703.01, "eval_steps_per_second": 22.496, "step": 56000 }, { "epoch": 5.5, "learning_rate": 4.5269741985926506e-05, "loss": 0.1473, "step": 56500 }, { "epoch": 5.5, "eval_loss": 0.4515800178050995, "eval_runtime": 1.4373, "eval_samples_per_second": 695.726, "eval_steps_per_second": 22.263, "step": 56500 }, { "epoch": 5.54, "learning_rate": 4.4781078967943706e-05, "loss": 0.1481, "step": 57000 }, { "epoch": 5.54, "eval_loss": 0.45015862584114075, "eval_runtime": 1.4259, "eval_samples_per_second": 701.296, "eval_steps_per_second": 22.441, "step": 57000 }, { "epoch": 5.59, "learning_rate": 4.4292415949960905e-05, "loss": 0.1494, "step": 57500 }, { "epoch": 5.59, "eval_loss": 0.4483198821544647, "eval_runtime": 1.4359, "eval_samples_per_second": 696.444, "eval_steps_per_second": 22.286, "step": 57500 }, { "epoch": 5.64, "learning_rate": 4.380375293197811e-05, "loss": 0.1413, "step": 58000 }, { "epoch": 5.64, "eval_loss": 0.4498542249202728, "eval_runtime": 1.4473, "eval_samples_per_second": 690.949, "eval_steps_per_second": 22.11, "step": 58000 }, { "epoch": 5.69, "learning_rate": 4.331508991399531e-05, "loss": 0.1498, "step": 58500 }, { "epoch": 5.69, "eval_loss": 0.447781503200531, "eval_runtime": 1.4225, "eval_samples_per_second": 703.012, "eval_steps_per_second": 22.496, "step": 58500 }, { "epoch": 5.74, "learning_rate": 4.282642689601251e-05, "loss": 0.146, "step": 59000 }, { "epoch": 5.74, "eval_loss": 0.4458942115306854, "eval_runtime": 1.413, "eval_samples_per_second": 707.707, "eval_steps_per_second": 22.647, "step": 59000 }, { "epoch": 5.79, "learning_rate": 4.233776387802971e-05, "loss": 0.1455, "step": 59500 }, { "epoch": 5.79, "eval_loss": 0.44468608498573303, "eval_runtime": 1.4239, "eval_samples_per_second": 702.274, "eval_steps_per_second": 22.473, "step": 59500 }, { "epoch": 5.84, "learning_rate": 4.1849100860046916e-05, "loss": 0.1439, "step": 60000 }, { "epoch": 5.84, "eval_loss": 0.446806401014328, "eval_runtime": 1.4224, "eval_samples_per_second": 703.012, "eval_steps_per_second": 22.496, "step": 60000 }, { "epoch": 5.88, "learning_rate": 4.1360437842064116e-05, "loss": 0.1472, "step": 60500 }, { "epoch": 5.88, "eval_loss": 0.44735094904899597, "eval_runtime": 1.4315, "eval_samples_per_second": 698.575, "eval_steps_per_second": 22.354, "step": 60500 }, { "epoch": 5.93, "learning_rate": 4.0871774824081315e-05, "loss": 0.1481, "step": 61000 }, { "epoch": 5.93, "eval_loss": 0.4440445303916931, "eval_runtime": 1.4374, "eval_samples_per_second": 695.724, "eval_steps_per_second": 22.263, "step": 61000 }, { "epoch": 5.98, "learning_rate": 4.0383111806098515e-05, "loss": 0.1462, "step": 61500 }, { "epoch": 5.98, "eval_loss": 0.44478389620780945, "eval_runtime": 1.4258, "eval_samples_per_second": 701.358, "eval_steps_per_second": 22.443, "step": 61500 }, { "epoch": 6.03, "learning_rate": 3.989444878811572e-05, "loss": 0.1335, "step": 62000 }, { "epoch": 6.03, "eval_loss": 0.450600266456604, "eval_runtime": 1.4249, "eval_samples_per_second": 701.787, "eval_steps_per_second": 22.457, "step": 62000 }, { "epoch": 6.08, "learning_rate": 3.940578577013292e-05, "loss": 0.1279, "step": 62500 }, { "epoch": 6.08, "eval_loss": 0.45286017656326294, "eval_runtime": 1.4175, "eval_samples_per_second": 705.475, "eval_steps_per_second": 22.575, "step": 62500 }, { "epoch": 6.13, "learning_rate": 3.891712275215012e-05, "loss": 0.1318, "step": 63000 }, { "epoch": 6.13, "eval_loss": 0.4531707763671875, "eval_runtime": 1.4314, "eval_samples_per_second": 698.621, "eval_steps_per_second": 22.356, "step": 63000 }, { "epoch": 6.18, "learning_rate": 3.842845973416732e-05, "loss": 0.1297, "step": 63500 }, { "epoch": 6.18, "eval_loss": 0.4539336562156677, "eval_runtime": 1.422, "eval_samples_per_second": 703.255, "eval_steps_per_second": 22.504, "step": 63500 }, { "epoch": 6.22, "learning_rate": 3.7939796716184525e-05, "loss": 0.1314, "step": 64000 }, { "epoch": 6.22, "eval_loss": 0.4507242441177368, "eval_runtime": 1.4354, "eval_samples_per_second": 696.689, "eval_steps_per_second": 22.294, "step": 64000 }, { "epoch": 6.27, "learning_rate": 3.7451133698201725e-05, "loss": 0.1295, "step": 64500 }, { "epoch": 6.27, "eval_loss": 0.45251962542533875, "eval_runtime": 1.4215, "eval_samples_per_second": 703.503, "eval_steps_per_second": 22.512, "step": 64500 }, { "epoch": 6.32, "learning_rate": 3.6962470680218924e-05, "loss": 0.1311, "step": 65000 }, { "epoch": 6.32, "eval_loss": 0.45232245326042175, "eval_runtime": 1.4364, "eval_samples_per_second": 696.202, "eval_steps_per_second": 22.278, "step": 65000 }, { "epoch": 6.37, "learning_rate": 3.6473807662236124e-05, "loss": 0.1303, "step": 65500 }, { "epoch": 6.37, "eval_loss": 0.45104601979255676, "eval_runtime": 1.421, "eval_samples_per_second": 703.749, "eval_steps_per_second": 22.52, "step": 65500 }, { "epoch": 6.42, "learning_rate": 3.598514464425333e-05, "loss": 0.1289, "step": 66000 }, { "epoch": 6.42, "eval_loss": 0.44871556758880615, "eval_runtime": 1.4274, "eval_samples_per_second": 700.563, "eval_steps_per_second": 22.418, "step": 66000 }, { "epoch": 6.47, "learning_rate": 3.549648162627053e-05, "loss": 0.1375, "step": 66500 }, { "epoch": 6.47, "eval_loss": 0.4471152126789093, "eval_runtime": 1.4185, "eval_samples_per_second": 704.978, "eval_steps_per_second": 22.559, "step": 66500 }, { "epoch": 6.52, "learning_rate": 3.500781860828773e-05, "loss": 0.1295, "step": 67000 }, { "epoch": 6.52, "eval_loss": 0.44929370284080505, "eval_runtime": 1.414, "eval_samples_per_second": 707.208, "eval_steps_per_second": 22.631, "step": 67000 }, { "epoch": 6.56, "learning_rate": 3.451915559030492e-05, "loss": 0.1291, "step": 67500 }, { "epoch": 6.56, "eval_loss": 0.44738081097602844, "eval_runtime": 1.4095, "eval_samples_per_second": 709.453, "eval_steps_per_second": 22.703, "step": 67500 }, { "epoch": 6.61, "learning_rate": 3.403049257232213e-05, "loss": 0.1297, "step": 68000 }, { "epoch": 6.61, "eval_loss": 0.4483153820037842, "eval_runtime": 1.4314, "eval_samples_per_second": 698.622, "eval_steps_per_second": 22.356, "step": 68000 }, { "epoch": 6.66, "learning_rate": 3.354182955433933e-05, "loss": 0.1354, "step": 68500 }, { "epoch": 6.66, "eval_loss": 0.4452635943889618, "eval_runtime": 1.4074, "eval_samples_per_second": 710.532, "eval_steps_per_second": 22.737, "step": 68500 }, { "epoch": 6.71, "learning_rate": 3.305316653635653e-05, "loss": 0.1316, "step": 69000 }, { "epoch": 6.71, "eval_loss": 0.4459252953529358, "eval_runtime": 1.4149, "eval_samples_per_second": 706.773, "eval_steps_per_second": 22.617, "step": 69000 }, { "epoch": 6.76, "learning_rate": 3.2564503518373726e-05, "loss": 0.1303, "step": 69500 }, { "epoch": 6.76, "eval_loss": 0.4454708397388458, "eval_runtime": 1.4195, "eval_samples_per_second": 704.485, "eval_steps_per_second": 22.544, "step": 69500 }, { "epoch": 6.81, "learning_rate": 3.207584050039093e-05, "loss": 0.1352, "step": 70000 }, { "epoch": 6.81, "eval_loss": 0.4453655481338501, "eval_runtime": 1.4036, "eval_samples_per_second": 712.466, "eval_steps_per_second": 22.799, "step": 70000 }, { "epoch": 6.86, "learning_rate": 3.158717748240813e-05, "loss": 0.1278, "step": 70500 }, { "epoch": 6.86, "eval_loss": 0.44641512632369995, "eval_runtime": 1.4284, "eval_samples_per_second": 700.077, "eval_steps_per_second": 22.402, "step": 70500 }, { "epoch": 6.91, "learning_rate": 3.109851446442533e-05, "loss": 0.127, "step": 71000 }, { "epoch": 6.91, "eval_loss": 0.446814626455307, "eval_runtime": 1.4239, "eval_samples_per_second": 702.277, "eval_steps_per_second": 22.473, "step": 71000 }, { "epoch": 6.95, "learning_rate": 3.060985144644253e-05, "loss": 0.1337, "step": 71500 }, { "epoch": 6.95, "eval_loss": 0.44566431641578674, "eval_runtime": 1.421, "eval_samples_per_second": 703.749, "eval_steps_per_second": 22.52, "step": 71500 }, { "epoch": 7.0, "learning_rate": 3.0121188428459734e-05, "loss": 0.1322, "step": 72000 }, { "epoch": 7.0, "eval_loss": 0.4453600347042084, "eval_runtime": 1.3961, "eval_samples_per_second": 716.266, "eval_steps_per_second": 22.921, "step": 72000 }, { "epoch": 7.05, "learning_rate": 2.9632525410476936e-05, "loss": 0.1171, "step": 72500 }, { "epoch": 7.05, "eval_loss": 0.45082348585128784, "eval_runtime": 1.4021, "eval_samples_per_second": 713.223, "eval_steps_per_second": 22.823, "step": 72500 }, { "epoch": 7.1, "learning_rate": 2.9143862392494136e-05, "loss": 0.1201, "step": 73000 }, { "epoch": 7.1, "eval_loss": 0.45133039355278015, "eval_runtime": 1.4284, "eval_samples_per_second": 700.08, "eval_steps_per_second": 22.403, "step": 73000 }, { "epoch": 7.15, "learning_rate": 2.865519937451134e-05, "loss": 0.1119, "step": 73500 }, { "epoch": 7.15, "eval_loss": 0.4528238773345947, "eval_runtime": 1.4066, "eval_samples_per_second": 710.952, "eval_steps_per_second": 22.75, "step": 73500 }, { "epoch": 7.2, "learning_rate": 2.8166536356528538e-05, "loss": 0.1178, "step": 74000 }, { "epoch": 7.2, "eval_loss": 0.4517793655395508, "eval_runtime": 1.4279, "eval_samples_per_second": 700.319, "eval_steps_per_second": 22.41, "step": 74000 }, { "epoch": 7.25, "learning_rate": 2.767787333854574e-05, "loss": 0.1172, "step": 74500 }, { "epoch": 7.25, "eval_loss": 0.45097029209136963, "eval_runtime": 1.4105, "eval_samples_per_second": 708.955, "eval_steps_per_second": 22.687, "step": 74500 }, { "epoch": 7.29, "learning_rate": 2.718921032056294e-05, "loss": 0.1229, "step": 75000 }, { "epoch": 7.29, "eval_loss": 0.4481058418750763, "eval_runtime": 1.4051, "eval_samples_per_second": 711.694, "eval_steps_per_second": 22.774, "step": 75000 }, { "epoch": 7.34, "learning_rate": 2.6700547302580143e-05, "loss": 0.12, "step": 75500 }, { "epoch": 7.34, "eval_loss": 0.4482279419898987, "eval_runtime": 1.411, "eval_samples_per_second": 708.702, "eval_steps_per_second": 22.678, "step": 75500 }, { "epoch": 7.39, "learning_rate": 2.6211884284597343e-05, "loss": 0.1158, "step": 76000 }, { "epoch": 7.39, "eval_loss": 0.4505749046802521, "eval_runtime": 1.4021, "eval_samples_per_second": 713.221, "eval_steps_per_second": 22.823, "step": 76000 }, { "epoch": 7.44, "learning_rate": 2.5723221266614546e-05, "loss": 0.1212, "step": 76500 }, { "epoch": 7.44, "eval_loss": 0.4481782913208008, "eval_runtime": 1.4165, "eval_samples_per_second": 705.97, "eval_steps_per_second": 22.591, "step": 76500 }, { "epoch": 7.49, "learning_rate": 2.5234558248631745e-05, "loss": 0.1189, "step": 77000 }, { "epoch": 7.49, "eval_loss": 0.44906875491142273, "eval_runtime": 1.4185, "eval_samples_per_second": 704.978, "eval_steps_per_second": 22.559, "step": 77000 }, { "epoch": 7.54, "learning_rate": 2.4745895230648948e-05, "loss": 0.1225, "step": 77500 }, { "epoch": 7.54, "eval_loss": 0.4473673701286316, "eval_runtime": 1.408, "eval_samples_per_second": 710.203, "eval_steps_per_second": 22.727, "step": 77500 }, { "epoch": 7.59, "learning_rate": 2.4257232212666147e-05, "loss": 0.1206, "step": 78000 }, { "epoch": 7.59, "eval_loss": 0.4478332996368408, "eval_runtime": 1.4299, "eval_samples_per_second": 699.349, "eval_steps_per_second": 22.379, "step": 78000 }, { "epoch": 7.63, "learning_rate": 2.376856919468335e-05, "loss": 0.1205, "step": 78500 }, { "epoch": 7.63, "eval_loss": 0.4450225234031677, "eval_runtime": 1.4139, "eval_samples_per_second": 707.288, "eval_steps_per_second": 22.633, "step": 78500 }, { "epoch": 7.68, "learning_rate": 2.3279906176700546e-05, "loss": 0.1237, "step": 79000 }, { "epoch": 7.68, "eval_loss": 0.44548895955085754, "eval_runtime": 1.418, "eval_samples_per_second": 705.229, "eval_steps_per_second": 22.567, "step": 79000 }, { "epoch": 7.73, "learning_rate": 2.279124315871775e-05, "loss": 0.1211, "step": 79500 }, { "epoch": 7.73, "eval_loss": 0.4440496861934662, "eval_runtime": 1.4428, "eval_samples_per_second": 693.091, "eval_steps_per_second": 22.179, "step": 79500 }, { "epoch": 7.78, "learning_rate": 2.230258014073495e-05, "loss": 0.1167, "step": 80000 }, { "epoch": 7.78, "eval_loss": 0.44403979182243347, "eval_runtime": 1.415, "eval_samples_per_second": 706.709, "eval_steps_per_second": 22.615, "step": 80000 }, { "epoch": 7.83, "learning_rate": 2.181391712275215e-05, "loss": 0.1195, "step": 80500 }, { "epoch": 7.83, "eval_loss": 0.44386279582977295, "eval_runtime": 1.411, "eval_samples_per_second": 708.704, "eval_steps_per_second": 22.679, "step": 80500 }, { "epoch": 7.88, "learning_rate": 2.132525410476935e-05, "loss": 0.1236, "step": 81000 }, { "epoch": 7.88, "eval_loss": 0.4440469443798065, "eval_runtime": 1.413, "eval_samples_per_second": 707.702, "eval_steps_per_second": 22.646, "step": 81000 }, { "epoch": 7.93, "learning_rate": 2.0836591086786554e-05, "loss": 0.1196, "step": 81500 }, { "epoch": 7.93, "eval_loss": 0.4431215822696686, "eval_runtime": 1.4071, "eval_samples_per_second": 710.704, "eval_steps_per_second": 22.743, "step": 81500 }, { "epoch": 7.98, "learning_rate": 2.0347928068803753e-05, "loss": 0.1154, "step": 82000 }, { "epoch": 7.98, "eval_loss": 0.44494298100471497, "eval_runtime": 1.3991, "eval_samples_per_second": 714.74, "eval_steps_per_second": 22.872, "step": 82000 }, { "epoch": 8.02, "learning_rate": 1.9859265050820956e-05, "loss": 0.1146, "step": 82500 }, { "epoch": 8.02, "eval_loss": 0.44438567757606506, "eval_runtime": 1.4136, "eval_samples_per_second": 707.391, "eval_steps_per_second": 22.637, "step": 82500 }, { "epoch": 8.07, "learning_rate": 1.9370602032838155e-05, "loss": 0.111, "step": 83000 }, { "epoch": 8.07, "eval_loss": 0.4510573744773865, "eval_runtime": 1.4011, "eval_samples_per_second": 713.727, "eval_steps_per_second": 22.839, "step": 83000 }, { "epoch": 8.12, "learning_rate": 1.8881939014855358e-05, "loss": 0.1107, "step": 83500 }, { "epoch": 8.12, "eval_loss": 0.45002079010009766, "eval_runtime": 1.4195, "eval_samples_per_second": 704.487, "eval_steps_per_second": 22.544, "step": 83500 }, { "epoch": 8.17, "learning_rate": 1.8393275996872558e-05, "loss": 0.1069, "step": 84000 }, { "epoch": 8.17, "eval_loss": 0.4500637352466583, "eval_runtime": 1.4026, "eval_samples_per_second": 712.972, "eval_steps_per_second": 22.815, "step": 84000 }, { "epoch": 8.22, "learning_rate": 1.790461297888976e-05, "loss": 0.1091, "step": 84500 }, { "epoch": 8.22, "eval_loss": 0.450139582157135, "eval_runtime": 1.423, "eval_samples_per_second": 702.764, "eval_steps_per_second": 22.488, "step": 84500 }, { "epoch": 8.27, "learning_rate": 1.741594996090696e-05, "loss": 0.1107, "step": 85000 }, { "epoch": 8.27, "eval_loss": 0.4503149390220642, "eval_runtime": 1.407, "eval_samples_per_second": 710.707, "eval_steps_per_second": 22.743, "step": 85000 }, { "epoch": 8.32, "learning_rate": 1.6927286942924163e-05, "loss": 0.11, "step": 85500 }, { "epoch": 8.32, "eval_loss": 0.4499202370643616, "eval_runtime": 1.3956, "eval_samples_per_second": 716.522, "eval_steps_per_second": 22.929, "step": 85500 }, { "epoch": 8.36, "learning_rate": 1.6438623924941362e-05, "loss": 0.1084, "step": 86000 }, { "epoch": 8.36, "eval_loss": 0.4492938220500946, "eval_runtime": 1.4061, "eval_samples_per_second": 711.208, "eval_steps_per_second": 22.759, "step": 86000 }, { "epoch": 8.41, "learning_rate": 1.5949960906958562e-05, "loss": 0.1142, "step": 86500 }, { "epoch": 8.41, "eval_loss": 0.449333518743515, "eval_runtime": 1.417, "eval_samples_per_second": 705.721, "eval_steps_per_second": 22.583, "step": 86500 }, { "epoch": 8.46, "learning_rate": 1.546129788897576e-05, "loss": 0.1091, "step": 87000 }, { "epoch": 8.46, "eval_loss": 0.4506095051765442, "eval_runtime": 1.412, "eval_samples_per_second": 708.206, "eval_steps_per_second": 22.663, "step": 87000 }, { "epoch": 8.51, "learning_rate": 1.4972634870992962e-05, "loss": 0.1072, "step": 87500 }, { "epoch": 8.51, "eval_loss": 0.44918569922447205, "eval_runtime": 1.4021, "eval_samples_per_second": 713.221, "eval_steps_per_second": 22.823, "step": 87500 }, { "epoch": 8.56, "learning_rate": 1.4483971853010164e-05, "loss": 0.1128, "step": 88000 }, { "epoch": 8.56, "eval_loss": 0.44889140129089355, "eval_runtime": 1.414, "eval_samples_per_second": 707.21, "eval_steps_per_second": 22.631, "step": 88000 }, { "epoch": 8.61, "learning_rate": 1.3995308835027365e-05, "loss": 0.1123, "step": 88500 }, { "epoch": 8.61, "eval_loss": 0.44764241576194763, "eval_runtime": 1.4076, "eval_samples_per_second": 710.451, "eval_steps_per_second": 22.734, "step": 88500 }, { "epoch": 8.66, "learning_rate": 1.3506645817044566e-05, "loss": 0.1093, "step": 89000 }, { "epoch": 8.66, "eval_loss": 0.44862595200538635, "eval_runtime": 1.3991, "eval_samples_per_second": 714.742, "eval_steps_per_second": 22.872, "step": 89000 }, { "epoch": 8.7, "learning_rate": 1.3017982799061767e-05, "loss": 0.1111, "step": 89500 }, { "epoch": 8.7, "eval_loss": 0.4482482969760895, "eval_runtime": 1.3981, "eval_samples_per_second": 715.242, "eval_steps_per_second": 22.888, "step": 89500 }, { "epoch": 8.75, "learning_rate": 1.2529319781078968e-05, "loss": 0.1086, "step": 90000 }, { "epoch": 8.75, "eval_loss": 0.44752514362335205, "eval_runtime": 1.4036, "eval_samples_per_second": 712.46, "eval_steps_per_second": 22.799, "step": 90000 }, { "epoch": 8.8, "learning_rate": 1.204065676309617e-05, "loss": 0.11, "step": 90500 }, { "epoch": 8.8, "eval_loss": 0.44697925448417664, "eval_runtime": 1.401, "eval_samples_per_second": 713.773, "eval_steps_per_second": 22.841, "step": 90500 }, { "epoch": 8.85, "learning_rate": 1.155199374511337e-05, "loss": 0.1118, "step": 91000 }, { "epoch": 8.85, "eval_loss": 0.4479255974292755, "eval_runtime": 1.4107, "eval_samples_per_second": 708.879, "eval_steps_per_second": 22.684, "step": 91000 }, { "epoch": 8.9, "learning_rate": 1.1063330727130572e-05, "loss": 0.1078, "step": 91500 }, { "epoch": 8.9, "eval_loss": 0.4473107159137726, "eval_runtime": 1.4011, "eval_samples_per_second": 713.727, "eval_steps_per_second": 22.839, "step": 91500 }, { "epoch": 8.95, "learning_rate": 1.0574667709147771e-05, "loss": 0.1083, "step": 92000 }, { "epoch": 8.95, "eval_loss": 0.44750386476516724, "eval_runtime": 1.4056, "eval_samples_per_second": 711.457, "eval_steps_per_second": 22.767, "step": 92000 }, { "epoch": 9.0, "learning_rate": 1.0086004691164972e-05, "loss": 0.1127, "step": 92500 }, { "epoch": 9.0, "eval_loss": 0.4473421573638916, "eval_runtime": 1.4175, "eval_samples_per_second": 705.475, "eval_steps_per_second": 22.575, "step": 92500 }, { "epoch": 9.04, "learning_rate": 9.597341673182173e-06, "loss": 0.1034, "step": 93000 }, { "epoch": 9.04, "eval_loss": 0.4490604102611542, "eval_runtime": 1.4334, "eval_samples_per_second": 697.652, "eval_steps_per_second": 22.325, "step": 93000 }, { "epoch": 9.09, "learning_rate": 9.108678655199375e-06, "loss": 0.1022, "step": 93500 }, { "epoch": 9.09, "eval_loss": 0.45033252239227295, "eval_runtime": 1.4284, "eval_samples_per_second": 700.078, "eval_steps_per_second": 22.403, "step": 93500 }, { "epoch": 9.14, "learning_rate": 8.620015637216576e-06, "loss": 0.1048, "step": 94000 }, { "epoch": 9.14, "eval_loss": 0.45117974281311035, "eval_runtime": 1.412, "eval_samples_per_second": 708.205, "eval_steps_per_second": 22.663, "step": 94000 }, { "epoch": 9.19, "learning_rate": 8.131352619233777e-06, "loss": 0.1018, "step": 94500 }, { "epoch": 9.19, "eval_loss": 0.45118698477745056, "eval_runtime": 1.421, "eval_samples_per_second": 703.749, "eval_steps_per_second": 22.52, "step": 94500 }, { "epoch": 9.24, "learning_rate": 7.642689601250978e-06, "loss": 0.1064, "step": 95000 }, { "epoch": 9.24, "eval_loss": 0.4503132402896881, "eval_runtime": 1.4151, "eval_samples_per_second": 706.64, "eval_steps_per_second": 22.612, "step": 95000 }, { "epoch": 9.29, "learning_rate": 7.154026583268178e-06, "loss": 0.1055, "step": 95500 }, { "epoch": 9.29, "eval_loss": 0.4488275647163391, "eval_runtime": 1.417, "eval_samples_per_second": 705.722, "eval_steps_per_second": 22.583, "step": 95500 }, { "epoch": 9.34, "learning_rate": 6.665363565285379e-06, "loss": 0.1125, "step": 96000 }, { "epoch": 9.34, "eval_loss": 0.44890880584716797, "eval_runtime": 1.4006, "eval_samples_per_second": 713.983, "eval_steps_per_second": 22.847, "step": 96000 }, { "epoch": 9.39, "learning_rate": 6.1767005473025806e-06, "loss": 0.1002, "step": 96500 }, { "epoch": 9.39, "eval_loss": 0.44921454787254333, "eval_runtime": 1.4134, "eval_samples_per_second": 707.5, "eval_steps_per_second": 22.64, "step": 96500 }, { "epoch": 9.43, "learning_rate": 5.688037529319782e-06, "loss": 0.1043, "step": 97000 }, { "epoch": 9.43, "eval_loss": 0.4495786130428314, "eval_runtime": 1.408, "eval_samples_per_second": 710.206, "eval_steps_per_second": 22.727, "step": 97000 }, { "epoch": 9.48, "learning_rate": 5.199374511336982e-06, "loss": 0.102, "step": 97500 }, { "epoch": 9.48, "eval_loss": 0.44972699880599976, "eval_runtime": 1.4141, "eval_samples_per_second": 707.139, "eval_steps_per_second": 22.628, "step": 97500 }, { "epoch": 9.53, "learning_rate": 4.710711493354183e-06, "loss": 0.1059, "step": 98000 }, { "epoch": 9.53, "eval_loss": 0.44945281744003296, "eval_runtime": 1.4001, "eval_samples_per_second": 714.235, "eval_steps_per_second": 22.856, "step": 98000 }, { "epoch": 9.58, "learning_rate": 4.222048475371384e-06, "loss": 0.1012, "step": 98500 }, { "epoch": 9.58, "eval_loss": 0.44921252131462097, "eval_runtime": 1.4051, "eval_samples_per_second": 711.709, "eval_steps_per_second": 22.775, "step": 98500 }, { "epoch": 9.63, "learning_rate": 3.733385457388585e-06, "loss": 0.1066, "step": 99000 }, { "epoch": 9.63, "eval_loss": 0.44877171516418457, "eval_runtime": 1.42, "eval_samples_per_second": 704.241, "eval_steps_per_second": 22.536, "step": 99000 }, { "epoch": 9.68, "learning_rate": 3.244722439405786e-06, "loss": 0.1044, "step": 99500 }, { "epoch": 9.68, "eval_loss": 0.4489387273788452, "eval_runtime": 1.4031, "eval_samples_per_second": 712.712, "eval_steps_per_second": 22.807, "step": 99500 }, { "epoch": 9.73, "learning_rate": 2.756059421422987e-06, "loss": 0.1049, "step": 100000 }, { "epoch": 9.73, "eval_loss": 0.4488721191883087, "eval_runtime": 1.4041, "eval_samples_per_second": 712.211, "eval_steps_per_second": 22.791, "step": 100000 }, { "epoch": 9.77, "learning_rate": 2.2673964034401876e-06, "loss": 0.1038, "step": 100500 }, { "epoch": 9.77, "eval_loss": 0.44900763034820557, "eval_runtime": 1.4001, "eval_samples_per_second": 714.232, "eval_steps_per_second": 22.855, "step": 100500 }, { "epoch": 9.82, "learning_rate": 1.7787333854573888e-06, "loss": 0.1057, "step": 101000 }, { "epoch": 9.82, "eval_loss": 0.44878125190734863, "eval_runtime": 1.409, "eval_samples_per_second": 709.704, "eval_steps_per_second": 22.711, "step": 101000 }, { "epoch": 9.87, "learning_rate": 1.2900703674745897e-06, "loss": 0.1035, "step": 101500 }, { "epoch": 9.87, "eval_loss": 0.448639452457428, "eval_runtime": 1.4105, "eval_samples_per_second": 708.955, "eval_steps_per_second": 22.687, "step": 101500 }, { "epoch": 9.92, "learning_rate": 8.014073494917906e-07, "loss": 0.1058, "step": 102000 }, { "epoch": 9.92, "eval_loss": 0.4486231207847595, "eval_runtime": 1.3971, "eval_samples_per_second": 715.758, "eval_steps_per_second": 22.904, "step": 102000 }, { "epoch": 9.97, "learning_rate": 3.1274433150899144e-07, "loss": 0.1013, "step": 102500 }, { "epoch": 9.97, "eval_loss": 0.44872137904167175, "eval_runtime": 1.4115, "eval_samples_per_second": 708.454, "eval_steps_per_second": 22.671, "step": 102500 } ], "max_steps": 102820, "num_train_epochs": 10, "total_flos": 6.16302255744e+16, "trial_name": null, "trial_params": null }