|
{ |
|
"best_metric": 0.20996317267417908, |
|
"best_model_checkpoint": "./model_outputs/checkpoint-9750", |
|
"epoch": 1.9240878505041734, |
|
"eval_steps": 50, |
|
"global_step": 9750, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.009866408824516053, |
|
"grad_norm": 0.5440589189529419, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 1.8768, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.009866408824516053, |
|
"eval_loss": 1.8065277338027954, |
|
"eval_runtime": 0.199, |
|
"eval_samples_per_second": 25.12, |
|
"eval_steps_per_second": 5.024, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.019732817649032106, |
|
"grad_norm": 0.5126028656959534, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 1.6013, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.019732817649032106, |
|
"eval_loss": 1.4184890985488892, |
|
"eval_runtime": 0.1842, |
|
"eval_samples_per_second": 27.145, |
|
"eval_steps_per_second": 5.429, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.02959922647354816, |
|
"grad_norm": 0.8953598141670227, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3394, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.02959922647354816, |
|
"eval_loss": 1.2211127281188965, |
|
"eval_runtime": 0.1845, |
|
"eval_samples_per_second": 27.095, |
|
"eval_steps_per_second": 5.419, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.03946563529806421, |
|
"grad_norm": 1.0633267164230347, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 1.2225, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.03946563529806421, |
|
"eval_loss": 1.1311841011047363, |
|
"eval_runtime": 0.1844, |
|
"eval_samples_per_second": 27.117, |
|
"eval_steps_per_second": 5.423, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.049332044122580264, |
|
"grad_norm": 1.340325951576233, |
|
"learning_rate": 4.166666666666667e-05, |
|
"loss": 1.1527, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.049332044122580264, |
|
"eval_loss": 1.0357060432434082, |
|
"eval_runtime": 0.1844, |
|
"eval_samples_per_second": 27.116, |
|
"eval_steps_per_second": 5.423, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.05919845294709632, |
|
"grad_norm": 1.1488324403762817, |
|
"learning_rate": 5e-05, |
|
"loss": 1.098, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.05919845294709632, |
|
"eval_loss": 0.9791008830070496, |
|
"eval_runtime": 0.1852, |
|
"eval_samples_per_second": 27.002, |
|
"eval_steps_per_second": 5.4, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.06906486177161236, |
|
"grad_norm": 1.3122004270553589, |
|
"learning_rate": 5.833333333333334e-05, |
|
"loss": 1.0511, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.06906486177161236, |
|
"eval_loss": 0.9438408017158508, |
|
"eval_runtime": 0.1844, |
|
"eval_samples_per_second": 27.119, |
|
"eval_steps_per_second": 5.424, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.07893127059612842, |
|
"grad_norm": 1.4071924686431885, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 1.0327, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.07893127059612842, |
|
"eval_loss": 0.9280341863632202, |
|
"eval_runtime": 0.1839, |
|
"eval_samples_per_second": 27.193, |
|
"eval_steps_per_second": 5.439, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.08879767942064447, |
|
"grad_norm": 1.447715163230896, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.9924, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.08879767942064447, |
|
"eval_loss": 0.9004782438278198, |
|
"eval_runtime": 0.1846, |
|
"eval_samples_per_second": 27.09, |
|
"eval_steps_per_second": 5.418, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.09866408824516053, |
|
"grad_norm": 1.6754121780395508, |
|
"learning_rate": 8.333333333333334e-05, |
|
"loss": 0.9313, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.09866408824516053, |
|
"eval_loss": 0.7985298037528992, |
|
"eval_runtime": 0.1849, |
|
"eval_samples_per_second": 27.04, |
|
"eval_steps_per_second": 5.408, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.10853049706967657, |
|
"grad_norm": 1.7146815061569214, |
|
"learning_rate": 9.166666666666667e-05, |
|
"loss": 0.7311, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.10853049706967657, |
|
"eval_loss": 0.5294134020805359, |
|
"eval_runtime": 0.1845, |
|
"eval_samples_per_second": 27.107, |
|
"eval_steps_per_second": 5.421, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.11839690589419263, |
|
"grad_norm": 1.3303102254867554, |
|
"learning_rate": 0.0001, |
|
"loss": 0.5309, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.11839690589419263, |
|
"eval_loss": 0.42509689927101135, |
|
"eval_runtime": 0.1847, |
|
"eval_samples_per_second": 27.075, |
|
"eval_steps_per_second": 5.415, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.1282633147187087, |
|
"grad_norm": 0.9372690320014954, |
|
"learning_rate": 0.00010833333333333333, |
|
"loss": 0.4645, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.1282633147187087, |
|
"eval_loss": 0.37647953629493713, |
|
"eval_runtime": 0.1851, |
|
"eval_samples_per_second": 27.009, |
|
"eval_steps_per_second": 5.402, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.13812972354322473, |
|
"grad_norm": 0.9396585822105408, |
|
"learning_rate": 0.00011666666666666668, |
|
"loss": 0.4258, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.13812972354322473, |
|
"eval_loss": 0.3551653325557709, |
|
"eval_runtime": 0.1853, |
|
"eval_samples_per_second": 26.982, |
|
"eval_steps_per_second": 5.396, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.14799613236774078, |
|
"grad_norm": 0.6549014449119568, |
|
"learning_rate": 0.000125, |
|
"loss": 0.4101, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.14799613236774078, |
|
"eval_loss": 0.3371706008911133, |
|
"eval_runtime": 0.1856, |
|
"eval_samples_per_second": 26.941, |
|
"eval_steps_per_second": 5.388, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.15786254119225684, |
|
"grad_norm": 0.5201817750930786, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 0.3877, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.15786254119225684, |
|
"eval_loss": 0.3209473192691803, |
|
"eval_runtime": 0.1844, |
|
"eval_samples_per_second": 27.12, |
|
"eval_steps_per_second": 5.424, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.1677289500167729, |
|
"grad_norm": 0.44689691066741943, |
|
"learning_rate": 0.00014166666666666668, |
|
"loss": 0.3856, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.1677289500167729, |
|
"eval_loss": 0.3198350965976715, |
|
"eval_runtime": 0.1841, |
|
"eval_samples_per_second": 27.157, |
|
"eval_steps_per_second": 5.431, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.17759535884128894, |
|
"grad_norm": 0.4665811359882355, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 0.3815, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.17759535884128894, |
|
"eval_loss": 0.31239432096481323, |
|
"eval_runtime": 0.1858, |
|
"eval_samples_per_second": 26.91, |
|
"eval_steps_per_second": 5.382, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.187461767665805, |
|
"grad_norm": 0.40847715735435486, |
|
"learning_rate": 0.00015833333333333332, |
|
"loss": 0.3678, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.187461767665805, |
|
"eval_loss": 0.31479328870773315, |
|
"eval_runtime": 0.1847, |
|
"eval_samples_per_second": 27.067, |
|
"eval_steps_per_second": 5.413, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.19732817649032106, |
|
"grad_norm": 0.4807054400444031, |
|
"learning_rate": 0.0001666666666666667, |
|
"loss": 0.3663, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.19732817649032106, |
|
"eval_loss": 0.317302405834198, |
|
"eval_runtime": 0.1849, |
|
"eval_samples_per_second": 27.036, |
|
"eval_steps_per_second": 5.407, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.20719458531483712, |
|
"grad_norm": 0.40431880950927734, |
|
"learning_rate": 0.000175, |
|
"loss": 0.3618, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.20719458531483712, |
|
"eval_loss": 0.30631712079048157, |
|
"eval_runtime": 0.1843, |
|
"eval_samples_per_second": 27.129, |
|
"eval_steps_per_second": 5.426, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.21706099413935315, |
|
"grad_norm": 0.3184848725795746, |
|
"learning_rate": 0.00018333333333333334, |
|
"loss": 0.3585, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.21706099413935315, |
|
"eval_loss": 0.3047851026058197, |
|
"eval_runtime": 0.1842, |
|
"eval_samples_per_second": 27.139, |
|
"eval_steps_per_second": 5.428, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.2269274029638692, |
|
"grad_norm": 0.29751336574554443, |
|
"learning_rate": 0.00019166666666666667, |
|
"loss": 0.3545, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.2269274029638692, |
|
"eval_loss": 0.30102381110191345, |
|
"eval_runtime": 0.1859, |
|
"eval_samples_per_second": 26.901, |
|
"eval_steps_per_second": 5.38, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.23679381178838527, |
|
"grad_norm": 0.3572952449321747, |
|
"learning_rate": 0.0002, |
|
"loss": 0.354, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.23679381178838527, |
|
"eval_loss": 0.2969129979610443, |
|
"eval_runtime": 0.185, |
|
"eval_samples_per_second": 27.03, |
|
"eval_steps_per_second": 5.406, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.24666022061290133, |
|
"grad_norm": 0.397197961807251, |
|
"learning_rate": 0.00019998454365958754, |
|
"loss": 0.3505, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.24666022061290133, |
|
"eval_loss": 0.2986981272697449, |
|
"eval_runtime": 0.1868, |
|
"eval_samples_per_second": 26.768, |
|
"eval_steps_per_second": 5.354, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.2565266294374174, |
|
"grad_norm": 0.31810271739959717, |
|
"learning_rate": 0.00019993817941631932, |
|
"loss": 0.3469, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.2565266294374174, |
|
"eval_loss": 0.2904866635799408, |
|
"eval_runtime": 0.361, |
|
"eval_samples_per_second": 13.851, |
|
"eval_steps_per_second": 2.77, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.26639303826193345, |
|
"grad_norm": 0.3378036618232727, |
|
"learning_rate": 0.00019986092160262587, |
|
"loss": 0.3465, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.26639303826193345, |
|
"eval_loss": 0.29250431060791016, |
|
"eval_runtime": 0.1881, |
|
"eval_samples_per_second": 26.582, |
|
"eval_steps_per_second": 5.316, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.27625944708644945, |
|
"grad_norm": 0.25885072350502014, |
|
"learning_rate": 0.00019975279410096856, |
|
"loss": 0.3427, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.27625944708644945, |
|
"eval_loss": 0.2970002293586731, |
|
"eval_runtime": 0.1858, |
|
"eval_samples_per_second": 26.911, |
|
"eval_steps_per_second": 5.382, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.2861258559109655, |
|
"grad_norm": 0.37681177258491516, |
|
"learning_rate": 0.00019961383033645683, |
|
"loss": 0.3423, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.2861258559109655, |
|
"eval_loss": 0.28687533736228943, |
|
"eval_runtime": 0.1869, |
|
"eval_samples_per_second": 26.751, |
|
"eval_steps_per_second": 5.35, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.29599226473548157, |
|
"grad_norm": 0.33231380581855774, |
|
"learning_rate": 0.00019944407326651575, |
|
"loss": 0.3388, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.29599226473548157, |
|
"eval_loss": 0.28474920988082886, |
|
"eval_runtime": 0.1871, |
|
"eval_samples_per_second": 26.718, |
|
"eval_steps_per_second": 5.344, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.30585867355999763, |
|
"grad_norm": 0.3391518294811249, |
|
"learning_rate": 0.00019924357536760644, |
|
"loss": 0.3316, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.30585867355999763, |
|
"eval_loss": 0.27619096636772156, |
|
"eval_runtime": 0.2, |
|
"eval_samples_per_second": 25.0, |
|
"eval_steps_per_second": 5.0, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.3157250823845137, |
|
"grad_norm": 0.3423280715942383, |
|
"learning_rate": 0.0001990123986190045, |
|
"loss": 0.3376, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.3157250823845137, |
|
"eval_loss": 0.2841600179672241, |
|
"eval_runtime": 0.1876, |
|
"eval_samples_per_second": 26.647, |
|
"eval_steps_per_second": 5.329, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.32559149120902975, |
|
"grad_norm": 0.37309399247169495, |
|
"learning_rate": 0.00019875061448364033, |
|
"loss": 0.3357, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.32559149120902975, |
|
"eval_loss": 0.2811121344566345, |
|
"eval_runtime": 0.1875, |
|
"eval_samples_per_second": 26.669, |
|
"eval_steps_per_second": 5.334, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.3354579000335458, |
|
"grad_norm": 0.33649030327796936, |
|
"learning_rate": 0.00019845830388600822, |
|
"loss": 0.3294, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.3354579000335458, |
|
"eval_loss": 0.27847614884376526, |
|
"eval_runtime": 0.1869, |
|
"eval_samples_per_second": 26.754, |
|
"eval_steps_per_second": 5.351, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.34532430885806187, |
|
"grad_norm": 0.38383936882019043, |
|
"learning_rate": 0.00019813555718715012, |
|
"loss": 0.334, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.34532430885806187, |
|
"eval_loss": 0.2873836159706116, |
|
"eval_runtime": 0.1865, |
|
"eval_samples_per_second": 26.806, |
|
"eval_steps_per_second": 5.361, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.3551907176825779, |
|
"grad_norm": 0.3089899718761444, |
|
"learning_rate": 0.000197782474156723, |
|
"loss": 0.3275, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.3551907176825779, |
|
"eval_loss": 0.2808569669723511, |
|
"eval_runtime": 0.1864, |
|
"eval_samples_per_second": 26.824, |
|
"eval_steps_per_second": 5.365, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.36505712650709393, |
|
"grad_norm": 0.3028016686439514, |
|
"learning_rate": 0.0001973991639421571, |
|
"loss": 0.3303, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.36505712650709393, |
|
"eval_loss": 0.27654123306274414, |
|
"eval_runtime": 0.1859, |
|
"eval_samples_per_second": 26.894, |
|
"eval_steps_per_second": 5.379, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.37492353533161, |
|
"grad_norm": 0.2978042960166931, |
|
"learning_rate": 0.0001969857450349156, |
|
"loss": 0.3304, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.37492353533161, |
|
"eval_loss": 0.270946741104126, |
|
"eval_runtime": 0.1856, |
|
"eval_samples_per_second": 26.941, |
|
"eval_steps_per_second": 5.388, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.38478994415612605, |
|
"grad_norm": 0.3676602244377136, |
|
"learning_rate": 0.00019654234523386576, |
|
"loss": 0.3319, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.38478994415612605, |
|
"eval_loss": 0.26677393913269043, |
|
"eval_runtime": 0.1857, |
|
"eval_samples_per_second": 26.927, |
|
"eval_steps_per_second": 5.385, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.3946563529806421, |
|
"grad_norm": 0.243007093667984, |
|
"learning_rate": 0.00019606910160577286, |
|
"loss": 0.3253, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.3946563529806421, |
|
"eval_loss": 0.26983901858329773, |
|
"eval_runtime": 0.186, |
|
"eval_samples_per_second": 26.888, |
|
"eval_steps_per_second": 5.378, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.40452276180515817, |
|
"grad_norm": 0.2672829329967499, |
|
"learning_rate": 0.00019556616044292917, |
|
"loss": 0.3269, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.40452276180515817, |
|
"eval_loss": 0.2751110792160034, |
|
"eval_runtime": 0.187, |
|
"eval_samples_per_second": 26.74, |
|
"eval_steps_per_second": 5.348, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.41438917062967423, |
|
"grad_norm": 0.26455241441726685, |
|
"learning_rate": 0.00019503367721793112, |
|
"loss": 0.3269, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.41438917062967423, |
|
"eval_loss": 0.2611739933490753, |
|
"eval_runtime": 0.1862, |
|
"eval_samples_per_second": 26.854, |
|
"eval_steps_per_second": 5.371, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.4242555794541903, |
|
"grad_norm": 0.260708749294281, |
|
"learning_rate": 0.00019447181653561851, |
|
"loss": 0.3264, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.4242555794541903, |
|
"eval_loss": 0.25694364309310913, |
|
"eval_runtime": 0.1866, |
|
"eval_samples_per_second": 26.801, |
|
"eval_steps_per_second": 5.36, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.4341219882787063, |
|
"grad_norm": 0.3702332079410553, |
|
"learning_rate": 0.00019388075208219072, |
|
"loss": 0.3205, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.4341219882787063, |
|
"eval_loss": 0.2542697489261627, |
|
"eval_runtime": 0.1859, |
|
"eval_samples_per_second": 26.897, |
|
"eval_steps_per_second": 5.379, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.44398839710322235, |
|
"grad_norm": 0.2797427177429199, |
|
"learning_rate": 0.00019326066657151568, |
|
"loss": 0.3251, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.44398839710322235, |
|
"eval_loss": 0.2655983865261078, |
|
"eval_runtime": 0.186, |
|
"eval_samples_per_second": 26.886, |
|
"eval_steps_per_second": 5.377, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.4538548059277384, |
|
"grad_norm": 0.27329573035240173, |
|
"learning_rate": 0.00019261175168864823, |
|
"loss": 0.3203, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.4538548059277384, |
|
"eval_loss": 0.2548620402812958, |
|
"eval_runtime": 0.1859, |
|
"eval_samples_per_second": 26.897, |
|
"eval_steps_per_second": 5.379, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.4637212147522545, |
|
"grad_norm": 0.2299811691045761, |
|
"learning_rate": 0.00019193420803057483, |
|
"loss": 0.3171, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.4637212147522545, |
|
"eval_loss": 0.25404179096221924, |
|
"eval_runtime": 0.1869, |
|
"eval_samples_per_second": 26.755, |
|
"eval_steps_per_second": 5.351, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.47358762357677053, |
|
"grad_norm": 0.43620172142982483, |
|
"learning_rate": 0.00019122824504420402, |
|
"loss": 0.3192, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.47358762357677053, |
|
"eval_loss": 0.2546399235725403, |
|
"eval_runtime": 0.1862, |
|
"eval_samples_per_second": 26.85, |
|
"eval_steps_per_second": 5.37, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.4834540324012866, |
|
"grad_norm": 0.2562660574913025, |
|
"learning_rate": 0.0001904940809616205, |
|
"loss": 0.3204, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.4834540324012866, |
|
"eval_loss": 0.25131484866142273, |
|
"eval_runtime": 0.1861, |
|
"eval_samples_per_second": 26.861, |
|
"eval_steps_per_second": 5.372, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.49332044122580265, |
|
"grad_norm": 0.21843916177749634, |
|
"learning_rate": 0.0001897319427326239, |
|
"loss": 0.3211, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.49332044122580265, |
|
"eval_loss": 0.2540796399116516, |
|
"eval_runtime": 0.1862, |
|
"eval_samples_per_second": 26.851, |
|
"eval_steps_per_second": 5.37, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.5031868500503187, |
|
"grad_norm": 0.2513650059700012, |
|
"learning_rate": 0.00018894206595457228, |
|
"loss": 0.3172, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.5031868500503187, |
|
"eval_loss": 0.254707932472229, |
|
"eval_runtime": 0.1862, |
|
"eval_samples_per_second": 26.851, |
|
"eval_steps_per_second": 5.37, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.5130532588748348, |
|
"grad_norm": 0.2910959720611572, |
|
"learning_rate": 0.00018812469479955306, |
|
"loss": 0.319, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.5130532588748348, |
|
"eval_loss": 0.2549395263195038, |
|
"eval_runtime": 0.186, |
|
"eval_samples_per_second": 26.877, |
|
"eval_steps_per_second": 5.375, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.5229196676993508, |
|
"grad_norm": 0.36300212144851685, |
|
"learning_rate": 0.00018728008193890248, |
|
"loss": 0.3141, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.5229196676993508, |
|
"eval_loss": 0.25633907318115234, |
|
"eval_runtime": 0.1866, |
|
"eval_samples_per_second": 26.789, |
|
"eval_steps_per_second": 5.358, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.5327860765238669, |
|
"grad_norm": 0.27469244599342346, |
|
"learning_rate": 0.00018640848846509836, |
|
"loss": 0.3165, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.5327860765238669, |
|
"eval_loss": 0.25254935026168823, |
|
"eval_runtime": 0.1868, |
|
"eval_samples_per_second": 26.772, |
|
"eval_steps_per_second": 5.354, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.5426524853483828, |
|
"grad_norm": 0.24992810189723969, |
|
"learning_rate": 0.00018551018381104942, |
|
"loss": 0.3148, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.5426524853483828, |
|
"eval_loss": 0.2522678077220917, |
|
"eval_runtime": 0.187, |
|
"eval_samples_per_second": 26.739, |
|
"eval_steps_per_second": 5.348, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.5525188941728989, |
|
"grad_norm": 0.22494597733020782, |
|
"learning_rate": 0.00018458544566680613, |
|
"loss": 0.3178, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.5525188941728989, |
|
"eval_loss": 0.2535417973995209, |
|
"eval_runtime": 0.1861, |
|
"eval_samples_per_second": 26.866, |
|
"eval_steps_per_second": 5.373, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.562385302997415, |
|
"grad_norm": 0.2611429989337921, |
|
"learning_rate": 0.0001836345598937195, |
|
"loss": 0.3151, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.562385302997415, |
|
"eval_loss": 0.2576417326927185, |
|
"eval_runtime": 0.1858, |
|
"eval_samples_per_second": 26.917, |
|
"eval_steps_per_second": 5.383, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.572251711821931, |
|
"grad_norm": 0.2593046724796295, |
|
"learning_rate": 0.00018265782043607362, |
|
"loss": 0.3155, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.572251711821931, |
|
"eval_loss": 0.24795055389404297, |
|
"eval_runtime": 0.1873, |
|
"eval_samples_per_second": 26.699, |
|
"eval_steps_per_second": 5.34, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.5821181206464471, |
|
"grad_norm": 0.2805608808994293, |
|
"learning_rate": 0.00018165552923021946, |
|
"loss": 0.3084, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.5821181206464471, |
|
"eval_loss": 0.2520078122615814, |
|
"eval_runtime": 0.186, |
|
"eval_samples_per_second": 26.883, |
|
"eval_steps_per_second": 5.377, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.5919845294709631, |
|
"grad_norm": 0.2355504184961319, |
|
"learning_rate": 0.00018062799611123843, |
|
"loss": 0.3135, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.5919845294709631, |
|
"eval_loss": 0.2500859797000885, |
|
"eval_runtime": 0.1863, |
|
"eval_samples_per_second": 26.834, |
|
"eval_steps_per_second": 5.367, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.6018509382954792, |
|
"grad_norm": 0.23624199628829956, |
|
"learning_rate": 0.00017957553871716405, |
|
"loss": 0.3063, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.6018509382954792, |
|
"eval_loss": 0.24056918919086456, |
|
"eval_runtime": 0.1859, |
|
"eval_samples_per_second": 26.894, |
|
"eval_steps_per_second": 5.379, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.6117173471199953, |
|
"grad_norm": 0.2305937111377716, |
|
"learning_rate": 0.00017849848239079126, |
|
"loss": 0.3076, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.6117173471199953, |
|
"eval_loss": 0.2453293353319168, |
|
"eval_runtime": 0.1852, |
|
"eval_samples_per_second": 26.996, |
|
"eval_steps_per_second": 5.399, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.6215837559445113, |
|
"grad_norm": 0.28438571095466614, |
|
"learning_rate": 0.00017739716007910458, |
|
"loss": 0.309, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.6215837559445113, |
|
"eval_loss": 0.241606667637825, |
|
"eval_runtime": 0.1859, |
|
"eval_samples_per_second": 26.891, |
|
"eval_steps_per_second": 5.378, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.6314501647690274, |
|
"grad_norm": 0.31287047266960144, |
|
"learning_rate": 0.00017627191223035512, |
|
"loss": 0.3123, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.6314501647690274, |
|
"eval_loss": 0.24313075840473175, |
|
"eval_runtime": 0.1856, |
|
"eval_samples_per_second": 26.939, |
|
"eval_steps_per_second": 5.388, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.6413165735935434, |
|
"grad_norm": 0.2634814381599426, |
|
"learning_rate": 0.00017512308668881876, |
|
"loss": 0.3093, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.6413165735935434, |
|
"eval_loss": 0.24376794695854187, |
|
"eval_runtime": 0.1861, |
|
"eval_samples_per_second": 26.872, |
|
"eval_steps_per_second": 5.374, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.6511829824180595, |
|
"grad_norm": 0.22829458117485046, |
|
"learning_rate": 0.00017395103858726846, |
|
"loss": 0.3093, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.6511829824180595, |
|
"eval_loss": 0.2428242713212967, |
|
"eval_runtime": 0.1881, |
|
"eval_samples_per_second": 26.589, |
|
"eval_steps_per_second": 5.318, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.6610493912425756, |
|
"grad_norm": 0.26152193546295166, |
|
"learning_rate": 0.00017275613023719298, |
|
"loss": 0.3113, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.6610493912425756, |
|
"eval_loss": 0.24297213554382324, |
|
"eval_runtime": 0.1864, |
|
"eval_samples_per_second": 26.821, |
|
"eval_steps_per_second": 5.364, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.6709158000670916, |
|
"grad_norm": 0.21779882907867432, |
|
"learning_rate": 0.00017153873101679668, |
|
"loss": 0.3085, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.6709158000670916, |
|
"eval_loss": 0.24597088992595673, |
|
"eval_runtime": 0.1862, |
|
"eval_samples_per_second": 26.854, |
|
"eval_steps_per_second": 5.371, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.6807822088916077, |
|
"grad_norm": 0.21129541099071503, |
|
"learning_rate": 0.00017029921725681492, |
|
"loss": 0.3045, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.6807822088916077, |
|
"eval_loss": 0.23454919457435608, |
|
"eval_runtime": 0.1867, |
|
"eval_samples_per_second": 26.786, |
|
"eval_steps_per_second": 5.357, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.6906486177161237, |
|
"grad_norm": 0.19822391867637634, |
|
"learning_rate": 0.00016903797212418015, |
|
"loss": 0.3049, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.6906486177161237, |
|
"eval_loss": 0.23835961520671844, |
|
"eval_runtime": 0.1872, |
|
"eval_samples_per_second": 26.708, |
|
"eval_steps_per_second": 5.342, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.7005150265406397, |
|
"grad_norm": 0.2132054716348648, |
|
"learning_rate": 0.00016775538550357463, |
|
"loss": 0.3058, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.7005150265406397, |
|
"eval_loss": 0.2351282387971878, |
|
"eval_runtime": 0.1862, |
|
"eval_samples_per_second": 26.857, |
|
"eval_steps_per_second": 5.371, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.7103814353651557, |
|
"grad_norm": 0.23732738196849823, |
|
"learning_rate": 0.0001664518538769067, |
|
"loss": 0.3021, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.7103814353651557, |
|
"eval_loss": 0.23832817375659943, |
|
"eval_runtime": 0.1857, |
|
"eval_samples_per_second": 26.93, |
|
"eval_steps_per_second": 5.386, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.7202478441896718, |
|
"grad_norm": 0.3427148759365082, |
|
"learning_rate": 0.00016512778020074753, |
|
"loss": 0.3044, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.7202478441896718, |
|
"eval_loss": 0.24248237907886505, |
|
"eval_runtime": 0.1861, |
|
"eval_samples_per_second": 26.863, |
|
"eval_steps_per_second": 5.373, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.7301142530141879, |
|
"grad_norm": 0.33537736535072327, |
|
"learning_rate": 0.00016378357378176654, |
|
"loss": 0.3004, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.7301142530141879, |
|
"eval_loss": 0.23529915511608124, |
|
"eval_runtime": 0.185, |
|
"eval_samples_per_second": 27.026, |
|
"eval_steps_per_second": 5.405, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.7399806618387039, |
|
"grad_norm": 0.236049085855484, |
|
"learning_rate": 0.00016241965015020363, |
|
"loss": 0.3047, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.7399806618387039, |
|
"eval_loss": 0.23639431595802307, |
|
"eval_runtime": 0.1858, |
|
"eval_samples_per_second": 26.914, |
|
"eval_steps_per_second": 5.383, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.74984707066322, |
|
"grad_norm": 0.23827500641345978, |
|
"learning_rate": 0.0001610364309314178, |
|
"loss": 0.2991, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.74984707066322, |
|
"eval_loss": 0.23711517453193665, |
|
"eval_runtime": 0.1845, |
|
"eval_samples_per_second": 27.098, |
|
"eval_steps_per_second": 5.42, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.759713479487736, |
|
"grad_norm": 0.24710151553153992, |
|
"learning_rate": 0.00015963434371555116, |
|
"loss": 0.2993, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.759713479487736, |
|
"eval_loss": 0.2302863895893097, |
|
"eval_runtime": 0.1857, |
|
"eval_samples_per_second": 26.929, |
|
"eval_steps_per_second": 5.386, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.7695798883122521, |
|
"grad_norm": 0.18992306292057037, |
|
"learning_rate": 0.00015821382192534968, |
|
"loss": 0.3022, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.7695798883122521, |
|
"eval_loss": 0.23323240876197815, |
|
"eval_runtime": 0.1847, |
|
"eval_samples_per_second": 27.071, |
|
"eval_steps_per_second": 5.414, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.7794462971367682, |
|
"grad_norm": 0.2224888652563095, |
|
"learning_rate": 0.00015677530468218045, |
|
"loss": 0.3028, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.7794462971367682, |
|
"eval_loss": 0.2351154088973999, |
|
"eval_runtime": 0.1846, |
|
"eval_samples_per_second": 27.091, |
|
"eval_steps_per_second": 5.418, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.7893127059612842, |
|
"grad_norm": 0.2259168177843094, |
|
"learning_rate": 0.0001553192366702874, |
|
"loss": 0.3019, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.7893127059612842, |
|
"eval_loss": 0.227354496717453, |
|
"eval_runtime": 0.1846, |
|
"eval_samples_per_second": 27.09, |
|
"eval_steps_per_second": 5.418, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.7991791147858003, |
|
"grad_norm": 0.291294127702713, |
|
"learning_rate": 0.0001538460679993277, |
|
"loss": 0.2989, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.7991791147858003, |
|
"eval_loss": 0.24032792448997498, |
|
"eval_runtime": 0.185, |
|
"eval_samples_per_second": 27.022, |
|
"eval_steps_per_second": 5.404, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.8090455236103163, |
|
"grad_norm": 0.23510493338108063, |
|
"learning_rate": 0.00015235625406523058, |
|
"loss": 0.3004, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.8090455236103163, |
|
"eval_loss": 0.2415897101163864, |
|
"eval_runtime": 0.1848, |
|
"eval_samples_per_second": 27.05, |
|
"eval_steps_per_second": 5.41, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.8189119324348324, |
|
"grad_norm": 0.21880796551704407, |
|
"learning_rate": 0.00015085025540942238, |
|
"loss": 0.3, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.8189119324348324, |
|
"eval_loss": 0.24045029282569885, |
|
"eval_runtime": 0.1849, |
|
"eval_samples_per_second": 27.045, |
|
"eval_steps_per_second": 5.409, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.8287783412593485, |
|
"grad_norm": 0.24291107058525085, |
|
"learning_rate": 0.0001493285375764608, |
|
"loss": 0.2976, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.8287783412593485, |
|
"eval_loss": 0.24324026703834534, |
|
"eval_runtime": 0.1847, |
|
"eval_samples_per_second": 27.067, |
|
"eval_steps_per_second": 5.413, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.8386447500838645, |
|
"grad_norm": 0.2263229787349701, |
|
"learning_rate": 0.0001477915709701226, |
|
"loss": 0.298, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.8386447500838645, |
|
"eval_loss": 0.2447391003370285, |
|
"eval_runtime": 0.1847, |
|
"eval_samples_per_second": 27.068, |
|
"eval_steps_per_second": 5.414, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.8485111589083806, |
|
"grad_norm": 0.20994020998477936, |
|
"learning_rate": 0.00014623983070798918, |
|
"loss": 0.297, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.8485111589083806, |
|
"eval_loss": 0.24018962681293488, |
|
"eval_runtime": 0.1843, |
|
"eval_samples_per_second": 27.127, |
|
"eval_steps_per_second": 5.425, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.8583775677328965, |
|
"grad_norm": 0.19051186740398407, |
|
"learning_rate": 0.000144673796474575, |
|
"loss": 0.2929, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.8583775677328965, |
|
"eval_loss": 0.23791976273059845, |
|
"eval_runtime": 0.1849, |
|
"eval_samples_per_second": 27.037, |
|
"eval_steps_per_second": 5.407, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.8682439765574126, |
|
"grad_norm": 0.23745004832744598, |
|
"learning_rate": 0.00014309395237304426, |
|
"loss": 0.2989, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.8682439765574126, |
|
"eval_loss": 0.23674488067626953, |
|
"eval_runtime": 0.1858, |
|
"eval_samples_per_second": 26.904, |
|
"eval_steps_per_second": 5.381, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.8781103853819286, |
|
"grad_norm": 0.19539549946784973, |
|
"learning_rate": 0.0001415007867755616, |
|
"loss": 0.2972, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.8781103853819286, |
|
"eval_loss": 0.23721548914909363, |
|
"eval_runtime": 0.1843, |
|
"eval_samples_per_second": 27.128, |
|
"eval_steps_per_second": 5.426, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.8879767942064447, |
|
"grad_norm": 0.26372581720352173, |
|
"learning_rate": 0.00013989479217232315, |
|
"loss": 0.2989, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.8879767942064447, |
|
"eval_loss": 0.2338230162858963, |
|
"eval_runtime": 0.1851, |
|
"eval_samples_per_second": 27.014, |
|
"eval_steps_per_second": 5.403, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.8978432030309608, |
|
"grad_norm": 0.2618955373764038, |
|
"learning_rate": 0.00013827646501931472, |
|
"loss": 0.2896, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.8978432030309608, |
|
"eval_loss": 0.23268970847129822, |
|
"eval_runtime": 0.1842, |
|
"eval_samples_per_second": 27.148, |
|
"eval_steps_per_second": 5.43, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.9077096118554768, |
|
"grad_norm": 0.2664230167865753, |
|
"learning_rate": 0.00013664630558484379, |
|
"loss": 0.2889, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.9077096118554768, |
|
"eval_loss": 0.22873499989509583, |
|
"eval_runtime": 0.1845, |
|
"eval_samples_per_second": 27.094, |
|
"eval_steps_per_second": 5.419, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.9175760206799929, |
|
"grad_norm": 0.21087364852428436, |
|
"learning_rate": 0.00013500481779489326, |
|
"loss": 0.2958, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.9175760206799929, |
|
"eval_loss": 0.23380012810230255, |
|
"eval_runtime": 0.1848, |
|
"eval_samples_per_second": 27.061, |
|
"eval_steps_per_second": 5.412, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.927442429504509, |
|
"grad_norm": 0.21377532184123993, |
|
"learning_rate": 0.00013335250907734448, |
|
"loss": 0.296, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.927442429504509, |
|
"eval_loss": 0.23471400141716003, |
|
"eval_runtime": 0.1847, |
|
"eval_samples_per_second": 27.075, |
|
"eval_steps_per_second": 5.415, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.937308838329025, |
|
"grad_norm": 0.2032233327627182, |
|
"learning_rate": 0.0001316898902051175, |
|
"loss": 0.2927, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.937308838329025, |
|
"eval_loss": 0.23120686411857605, |
|
"eval_runtime": 0.185, |
|
"eval_samples_per_second": 27.026, |
|
"eval_steps_per_second": 5.405, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.9471752471535411, |
|
"grad_norm": 0.22421005368232727, |
|
"learning_rate": 0.00013001747513827764, |
|
"loss": 0.2956, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.9471752471535411, |
|
"eval_loss": 0.23076245188713074, |
|
"eval_runtime": 0.1846, |
|
"eval_samples_per_second": 27.079, |
|
"eval_steps_per_second": 5.416, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.9570416559780571, |
|
"grad_norm": 0.2466699182987213, |
|
"learning_rate": 0.0001283357808651566, |
|
"loss": 0.2907, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.9570416559780571, |
|
"eval_loss": 0.22873035073280334, |
|
"eval_runtime": 0.1843, |
|
"eval_samples_per_second": 27.128, |
|
"eval_steps_per_second": 5.426, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.9669080648025732, |
|
"grad_norm": 0.20787620544433594, |
|
"learning_rate": 0.00012664532724253745, |
|
"loss": 0.2954, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.9669080648025732, |
|
"eval_loss": 0.22716188430786133, |
|
"eval_runtime": 0.1847, |
|
"eval_samples_per_second": 27.076, |
|
"eval_steps_per_second": 5.415, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.9767744736270892, |
|
"grad_norm": 0.22663253545761108, |
|
"learning_rate": 0.00012494663683495304, |
|
"loss": 0.2897, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.9767744736270892, |
|
"eval_loss": 0.2276424914598465, |
|
"eval_runtime": 0.1848, |
|
"eval_samples_per_second": 27.05, |
|
"eval_steps_per_second": 5.41, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.9866408824516053, |
|
"grad_norm": 0.20295187830924988, |
|
"learning_rate": 0.00012324023475314725, |
|
"loss": 0.2958, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.9866408824516053, |
|
"eval_loss": 0.23117908835411072, |
|
"eval_runtime": 0.185, |
|
"eval_samples_per_second": 27.024, |
|
"eval_steps_per_second": 5.405, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.9965072912761214, |
|
"grad_norm": 0.2138393521308899, |
|
"learning_rate": 0.0001215266484917493, |
|
"loss": 0.2876, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.9965072912761214, |
|
"eval_loss": 0.22664275765419006, |
|
"eval_runtime": 0.1852, |
|
"eval_samples_per_second": 27.002, |
|
"eval_steps_per_second": 5.4, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 1.0063737001006374, |
|
"grad_norm": 0.1731952726840973, |
|
"learning_rate": 0.00011980640776621077, |
|
"loss": 0.2843, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.0063737001006374, |
|
"eval_loss": 0.22769823670387268, |
|
"eval_runtime": 0.185, |
|
"eval_samples_per_second": 27.027, |
|
"eval_steps_per_second": 5.405, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.0162401089251534, |
|
"grad_norm": 0.21639849245548248, |
|
"learning_rate": 0.0001180800443490566, |
|
"loss": 0.2744, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 1.0162401089251534, |
|
"eval_loss": 0.23013372719287872, |
|
"eval_runtime": 0.1844, |
|
"eval_samples_per_second": 27.117, |
|
"eval_steps_per_second": 5.423, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 1.0261065177496695, |
|
"grad_norm": 0.2546159625053406, |
|
"learning_rate": 0.0001163480919054998, |
|
"loss": 0.2745, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.0261065177496695, |
|
"eval_loss": 0.22475233674049377, |
|
"eval_runtime": 0.1858, |
|
"eval_samples_per_second": 26.911, |
|
"eval_steps_per_second": 5.382, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.0359729265741855, |
|
"grad_norm": 0.21476522088050842, |
|
"learning_rate": 0.00011461108582847126, |
|
"loss": 0.2734, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 1.0359729265741855, |
|
"eval_loss": 0.22389909625053406, |
|
"eval_runtime": 0.1847, |
|
"eval_samples_per_second": 27.07, |
|
"eval_steps_per_second": 5.414, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 1.0458393353987017, |
|
"grad_norm": 0.21301063895225525, |
|
"learning_rate": 0.00011286956307311555, |
|
"loss": 0.2703, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.0458393353987017, |
|
"eval_loss": 0.2267945110797882, |
|
"eval_runtime": 0.1843, |
|
"eval_samples_per_second": 27.135, |
|
"eval_steps_per_second": 5.427, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.0557057442232176, |
|
"grad_norm": 0.221356600522995, |
|
"learning_rate": 0.00011112406199080346, |
|
"loss": 0.2717, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 1.0557057442232176, |
|
"eval_loss": 0.2249325066804886, |
|
"eval_runtime": 0.1846, |
|
"eval_samples_per_second": 27.078, |
|
"eval_steps_per_second": 5.416, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 1.0655721530477338, |
|
"grad_norm": 0.22443543374538422, |
|
"learning_rate": 0.00010937512216271338, |
|
"loss": 0.2705, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.0655721530477338, |
|
"eval_loss": 0.2256511151790619, |
|
"eval_runtime": 0.1845, |
|
"eval_samples_per_second": 27.097, |
|
"eval_steps_per_second": 5.419, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.0754385618722497, |
|
"grad_norm": 0.20267891883850098, |
|
"learning_rate": 0.00010762328423303218, |
|
"loss": 0.2735, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 1.0754385618722497, |
|
"eval_loss": 0.22567923367023468, |
|
"eval_runtime": 0.1846, |
|
"eval_samples_per_second": 27.086, |
|
"eval_steps_per_second": 5.417, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 1.085304970696766, |
|
"grad_norm": 0.21884013712406158, |
|
"learning_rate": 0.00010586908974182767, |
|
"loss": 0.2756, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.085304970696766, |
|
"eval_loss": 0.22433491051197052, |
|
"eval_runtime": 0.1847, |
|
"eval_samples_per_second": 27.077, |
|
"eval_steps_per_second": 5.415, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.0951713795212819, |
|
"grad_norm": 0.21301575005054474, |
|
"learning_rate": 0.00010411308095764393, |
|
"loss": 0.2714, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 1.0951713795212819, |
|
"eval_loss": 0.22806711494922638, |
|
"eval_runtime": 0.1844, |
|
"eval_samples_per_second": 27.113, |
|
"eval_steps_per_second": 5.423, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 1.1050377883457978, |
|
"grad_norm": 0.3255505859851837, |
|
"learning_rate": 0.0001023558007098717, |
|
"loss": 0.2715, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.1050377883457978, |
|
"eval_loss": 0.22649523615837097, |
|
"eval_runtime": 0.1848, |
|
"eval_samples_per_second": 27.059, |
|
"eval_steps_per_second": 5.412, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.114904197170314, |
|
"grad_norm": 0.30551499128341675, |
|
"learning_rate": 0.00010059779222094512, |
|
"loss": 0.2737, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 1.114904197170314, |
|
"eval_loss": 0.2232808619737625, |
|
"eval_runtime": 0.1871, |
|
"eval_samples_per_second": 26.725, |
|
"eval_steps_per_second": 5.345, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 1.12477060599483, |
|
"grad_norm": 0.23381146788597107, |
|
"learning_rate": 9.88395989384173e-05, |
|
"loss": 0.2709, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.12477060599483, |
|
"eval_loss": 0.22571022808551788, |
|
"eval_runtime": 0.1849, |
|
"eval_samples_per_second": 27.039, |
|
"eval_steps_per_second": 5.408, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.134637014819346, |
|
"grad_norm": 0.20602265000343323, |
|
"learning_rate": 9.708176436696595e-05, |
|
"loss": 0.2721, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 1.134637014819346, |
|
"eval_loss": 0.2223341017961502, |
|
"eval_runtime": 0.1843, |
|
"eval_samples_per_second": 27.136, |
|
"eval_steps_per_second": 5.427, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 1.144503423643862, |
|
"grad_norm": 0.210631862282753, |
|
"learning_rate": 9.532483190038153e-05, |
|
"loss": 0.2679, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 1.144503423643862, |
|
"eval_loss": 0.2227615863084793, |
|
"eval_runtime": 0.185, |
|
"eval_samples_per_second": 27.02, |
|
"eval_steps_per_second": 5.404, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 1.1543698324683782, |
|
"grad_norm": 0.2317020148038864, |
|
"learning_rate": 9.356934465358979e-05, |
|
"loss": 0.272, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 1.1543698324683782, |
|
"eval_loss": 0.2202722728252411, |
|
"eval_runtime": 0.1846, |
|
"eval_samples_per_second": 27.087, |
|
"eval_steps_per_second": 5.417, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 1.1642362412928942, |
|
"grad_norm": 0.2153208702802658, |
|
"learning_rate": 9.181584529476025e-05, |
|
"loss": 0.2705, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 1.1642362412928942, |
|
"eval_loss": 0.2222643345594406, |
|
"eval_runtime": 0.1845, |
|
"eval_samples_per_second": 27.105, |
|
"eval_steps_per_second": 5.421, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 1.1741026501174103, |
|
"grad_norm": 0.26965272426605225, |
|
"learning_rate": 9.006487587755295e-05, |
|
"loss": 0.2699, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 1.1741026501174103, |
|
"eval_loss": 0.218417689204216, |
|
"eval_runtime": 0.1846, |
|
"eval_samples_per_second": 27.091, |
|
"eval_steps_per_second": 5.418, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 1.1839690589419263, |
|
"grad_norm": 0.21898072957992554, |
|
"learning_rate": 8.831697767355519e-05, |
|
"loss": 0.2698, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.1839690589419263, |
|
"eval_loss": 0.22147616744041443, |
|
"eval_runtime": 0.1855, |
|
"eval_samples_per_second": 26.955, |
|
"eval_steps_per_second": 5.391, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.1938354677664424, |
|
"grad_norm": 0.19538186490535736, |
|
"learning_rate": 8.65726910049599e-05, |
|
"loss": 0.2683, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 1.1938354677664424, |
|
"eval_loss": 0.21991392970085144, |
|
"eval_runtime": 0.1851, |
|
"eval_samples_per_second": 27.012, |
|
"eval_steps_per_second": 5.402, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 1.2037018765909584, |
|
"grad_norm": 0.21497651934623718, |
|
"learning_rate": 8.483255507753762e-05, |
|
"loss": 0.2696, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 1.2037018765909584, |
|
"eval_loss": 0.22209680080413818, |
|
"eval_runtime": 0.1849, |
|
"eval_samples_per_second": 27.037, |
|
"eval_steps_per_second": 5.407, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 1.2135682854154746, |
|
"grad_norm": 0.2190115749835968, |
|
"learning_rate": 8.309710781395356e-05, |
|
"loss": 0.2676, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 1.2135682854154746, |
|
"eval_loss": 0.22349147498607635, |
|
"eval_runtime": 0.1845, |
|
"eval_samples_per_second": 27.1, |
|
"eval_steps_per_second": 5.42, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 1.2234346942399905, |
|
"grad_norm": 0.18856999278068542, |
|
"learning_rate": 8.136688568748113e-05, |
|
"loss": 0.2642, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.2234346942399905, |
|
"eval_loss": 0.2215404212474823, |
|
"eval_runtime": 0.1844, |
|
"eval_samples_per_second": 27.115, |
|
"eval_steps_per_second": 5.423, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.2333011030645067, |
|
"grad_norm": 0.2113649994134903, |
|
"learning_rate": 7.964242355616385e-05, |
|
"loss": 0.2702, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 1.2333011030645067, |
|
"eval_loss": 0.22224430739879608, |
|
"eval_runtime": 0.1851, |
|
"eval_samples_per_second": 27.018, |
|
"eval_steps_per_second": 5.404, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 1.2433056416125658, |
|
"grad_norm": 0.23301884531974792, |
|
"learning_rate": 7.792425449747635e-05, |
|
"loss": 0.2701, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.2433056416125658, |
|
"eval_loss": 0.22047898173332214, |
|
"eval_runtime": 0.1979, |
|
"eval_samples_per_second": 25.262, |
|
"eval_steps_per_second": 5.052, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.2531720504370818, |
|
"grad_norm": 0.211432546377182, |
|
"learning_rate": 7.621290964353581e-05, |
|
"loss": 0.2696, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 1.2531720504370818, |
|
"eval_loss": 0.22401615977287292, |
|
"eval_runtime": 0.1846, |
|
"eval_samples_per_second": 27.079, |
|
"eval_steps_per_second": 5.416, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 1.263038459261598, |
|
"grad_norm": 0.2142704576253891, |
|
"learning_rate": 7.450891801691468e-05, |
|
"loss": 0.2671, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.263038459261598, |
|
"eval_loss": 0.2236458957195282, |
|
"eval_runtime": 0.1853, |
|
"eval_samples_per_second": 26.977, |
|
"eval_steps_per_second": 5.395, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.2729048680861141, |
|
"grad_norm": 0.1931128203868866, |
|
"learning_rate": 7.281280636710576e-05, |
|
"loss": 0.269, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 1.2729048680861141, |
|
"eval_loss": 0.22148045897483826, |
|
"eval_runtime": 0.1846, |
|
"eval_samples_per_second": 27.085, |
|
"eval_steps_per_second": 5.417, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 1.28277127691063, |
|
"grad_norm": 0.2120596170425415, |
|
"learning_rate": 7.112509900768989e-05, |
|
"loss": 0.2692, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.28277127691063, |
|
"eval_loss": 0.2202589511871338, |
|
"eval_runtime": 0.1852, |
|
"eval_samples_per_second": 27.003, |
|
"eval_steps_per_second": 5.401, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.292637685735146, |
|
"grad_norm": 0.20692607760429382, |
|
"learning_rate": 6.944631765425629e-05, |
|
"loss": 0.2647, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 1.292637685735146, |
|
"eval_loss": 0.2197343409061432, |
|
"eval_runtime": 0.185, |
|
"eval_samples_per_second": 27.031, |
|
"eval_steps_per_second": 5.406, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 1.3025040945596622, |
|
"grad_norm": 0.2301376610994339, |
|
"learning_rate": 6.777698126312647e-05, |
|
"loss": 0.2647, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.3025040945596622, |
|
"eval_loss": 0.2176646739244461, |
|
"eval_runtime": 0.1857, |
|
"eval_samples_per_second": 26.923, |
|
"eval_steps_per_second": 5.385, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.3123705033841782, |
|
"grad_norm": 0.2805488705635071, |
|
"learning_rate": 6.611760587093094e-05, |
|
"loss": 0.2635, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 1.3123705033841782, |
|
"eval_loss": 0.21856221556663513, |
|
"eval_runtime": 0.185, |
|
"eval_samples_per_second": 27.033, |
|
"eval_steps_per_second": 5.407, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 1.3222369122086943, |
|
"grad_norm": 0.2176360934972763, |
|
"learning_rate": 6.446870443508839e-05, |
|
"loss": 0.2628, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 1.3222369122086943, |
|
"eval_loss": 0.2163912057876587, |
|
"eval_runtime": 0.1848, |
|
"eval_samples_per_second": 27.054, |
|
"eval_steps_per_second": 5.411, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 1.3321033210332103, |
|
"grad_norm": 0.20765335857868195, |
|
"learning_rate": 6.283078667523681e-05, |
|
"loss": 0.2678, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 1.3321033210332103, |
|
"eval_loss": 0.2164166420698166, |
|
"eval_runtime": 0.1851, |
|
"eval_samples_per_second": 27.017, |
|
"eval_steps_per_second": 5.403, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 1.3419697298577264, |
|
"grad_norm": 0.2131153792142868, |
|
"learning_rate": 6.120435891566542e-05, |
|
"loss": 0.266, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 1.3419697298577264, |
|
"eval_loss": 0.21605464816093445, |
|
"eval_runtime": 0.1847, |
|
"eval_samples_per_second": 27.077, |
|
"eval_steps_per_second": 5.415, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 1.3518361386822424, |
|
"grad_norm": 0.20908519625663757, |
|
"learning_rate": 5.9589923928796434e-05, |
|
"loss": 0.2646, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 1.3518361386822424, |
|
"eval_loss": 0.21923890709877014, |
|
"eval_runtime": 0.1847, |
|
"eval_samples_per_second": 27.068, |
|
"eval_steps_per_second": 5.414, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 1.3617025475067586, |
|
"grad_norm": 0.20536689460277557, |
|
"learning_rate": 5.7987980779764463e-05, |
|
"loss": 0.2631, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 1.3617025475067586, |
|
"eval_loss": 0.2201545685529709, |
|
"eval_runtime": 0.1848, |
|
"eval_samples_per_second": 27.055, |
|
"eval_steps_per_second": 5.411, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 1.3715689563312745, |
|
"grad_norm": 0.21380028128623962, |
|
"learning_rate": 5.639902467214212e-05, |
|
"loss": 0.2651, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 1.3715689563312745, |
|
"eval_loss": 0.2177874594926834, |
|
"eval_runtime": 0.1847, |
|
"eval_samples_per_second": 27.069, |
|
"eval_steps_per_second": 5.414, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 1.3814353651557907, |
|
"grad_norm": 0.2062164545059204, |
|
"learning_rate": 5.482354679485948e-05, |
|
"loss": 0.2715, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.3814353651557907, |
|
"eval_loss": 0.21779461205005646, |
|
"eval_runtime": 0.1846, |
|
"eval_samples_per_second": 27.086, |
|
"eval_steps_per_second": 5.417, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.3913017739803066, |
|
"grad_norm": 0.22111141681671143, |
|
"learning_rate": 5.326203417036413e-05, |
|
"loss": 0.2601, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 1.3913017739803066, |
|
"eval_loss": 0.2183108627796173, |
|
"eval_runtime": 0.1845, |
|
"eval_samples_per_second": 27.097, |
|
"eval_steps_per_second": 5.419, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 1.4011681828048226, |
|
"grad_norm": 0.2018529623746872, |
|
"learning_rate": 5.17149695040698e-05, |
|
"loss": 0.2658, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 1.4011681828048226, |
|
"eval_loss": 0.2196209877729416, |
|
"eval_runtime": 0.1848, |
|
"eval_samples_per_second": 27.049, |
|
"eval_steps_per_second": 5.41, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 1.4110345916293388, |
|
"grad_norm": 0.2138175666332245, |
|
"learning_rate": 5.01828310351389e-05, |
|
"loss": 0.2647, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 1.4110345916293388, |
|
"eval_loss": 0.21882668137550354, |
|
"eval_runtime": 0.1844, |
|
"eval_samples_per_second": 27.118, |
|
"eval_steps_per_second": 5.424, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 1.420901000453855, |
|
"grad_norm": 0.21395103633403778, |
|
"learning_rate": 4.866609238864609e-05, |
|
"loss": 0.2614, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 1.420901000453855, |
|
"eval_loss": 0.21599645912647247, |
|
"eval_runtime": 0.1865, |
|
"eval_samples_per_second": 26.812, |
|
"eval_steps_per_second": 5.362, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 1.4307674092783709, |
|
"grad_norm": 0.19431644678115845, |
|
"learning_rate": 4.7165222429168156e-05, |
|
"loss": 0.2631, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 1.4307674092783709, |
|
"eval_loss": 0.21445360779762268, |
|
"eval_runtime": 0.1853, |
|
"eval_samples_per_second": 26.984, |
|
"eval_steps_per_second": 5.397, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 1.4406338181028868, |
|
"grad_norm": 0.2003539353609085, |
|
"learning_rate": 4.568068511584529e-05, |
|
"loss": 0.2619, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 1.4406338181028868, |
|
"eval_loss": 0.21582575142383575, |
|
"eval_runtime": 0.1846, |
|
"eval_samples_per_second": 27.092, |
|
"eval_steps_per_second": 5.418, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 1.450500226927403, |
|
"grad_norm": 0.2108386605978012, |
|
"learning_rate": 4.421293935895882e-05, |
|
"loss": 0.2618, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 1.450500226927403, |
|
"eval_loss": 0.21553893387317657, |
|
"eval_runtime": 0.1856, |
|
"eval_samples_per_second": 26.942, |
|
"eval_steps_per_second": 5.388, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 1.460366635751919, |
|
"grad_norm": 0.2119232714176178, |
|
"learning_rate": 4.2762438878069955e-05, |
|
"loss": 0.2667, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 1.460366635751919, |
|
"eval_loss": 0.21502795815467834, |
|
"eval_runtime": 0.1851, |
|
"eval_samples_per_second": 27.012, |
|
"eval_steps_per_second": 5.402, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 1.470233044576435, |
|
"grad_norm": 0.2125096172094345, |
|
"learning_rate": 4.13296320617627e-05, |
|
"loss": 0.2625, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 1.470233044576435, |
|
"eval_loss": 0.2130458652973175, |
|
"eval_runtime": 0.1857, |
|
"eval_samples_per_second": 26.926, |
|
"eval_steps_per_second": 5.385, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 1.480099453400951, |
|
"grad_norm": 0.20715288817882538, |
|
"learning_rate": 3.991496182903498e-05, |
|
"loss": 0.2607, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.480099453400951, |
|
"eval_loss": 0.2125515192747116, |
|
"eval_runtime": 0.1855, |
|
"eval_samples_per_second": 26.953, |
|
"eval_steps_per_second": 5.391, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.4899658622254672, |
|
"grad_norm": 0.19759340584278107, |
|
"learning_rate": 3.851886549238062e-05, |
|
"loss": 0.2596, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 1.4899658622254672, |
|
"eval_loss": 0.2126692235469818, |
|
"eval_runtime": 0.1848, |
|
"eval_samples_per_second": 27.05, |
|
"eval_steps_per_second": 5.41, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 1.4998322710499832, |
|
"grad_norm": 0.1995980143547058, |
|
"learning_rate": 3.714177462260412e-05, |
|
"loss": 0.263, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 1.4998322710499832, |
|
"eval_loss": 0.2116161286830902, |
|
"eval_runtime": 0.1846, |
|
"eval_samples_per_second": 27.091, |
|
"eval_steps_per_second": 5.418, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 1.5096986798744991, |
|
"grad_norm": 0.23178283870220184, |
|
"learning_rate": 3.578411491541079e-05, |
|
"loss": 0.2623, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 1.5096986798744991, |
|
"eval_loss": 0.21194568276405334, |
|
"eval_runtime": 0.1847, |
|
"eval_samples_per_second": 27.075, |
|
"eval_steps_per_second": 5.415, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 1.5195650886990153, |
|
"grad_norm": 0.2182031124830246, |
|
"learning_rate": 3.444630605981256e-05, |
|
"loss": 0.2558, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 1.5195650886990153, |
|
"eval_loss": 0.21379950642585754, |
|
"eval_runtime": 0.1846, |
|
"eval_samples_per_second": 27.079, |
|
"eval_steps_per_second": 5.416, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 1.5294314975235315, |
|
"grad_norm": 0.2095683068037033, |
|
"learning_rate": 3.312876160839099e-05, |
|
"loss": 0.261, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 1.5294314975235315, |
|
"eval_loss": 0.21340413391590118, |
|
"eval_runtime": 0.1852, |
|
"eval_samples_per_second": 26.992, |
|
"eval_steps_per_second": 5.398, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 1.5392979063480474, |
|
"grad_norm": 0.2217652052640915, |
|
"learning_rate": 3.183188884945714e-05, |
|
"loss": 0.2615, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 1.5392979063480474, |
|
"eval_loss": 0.21244795620441437, |
|
"eval_runtime": 0.1841, |
|
"eval_samples_per_second": 27.163, |
|
"eval_steps_per_second": 5.433, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 1.5491643151725634, |
|
"grad_norm": 0.21375681459903717, |
|
"learning_rate": 3.055608868114761e-05, |
|
"loss": 0.2571, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 1.5491643151725634, |
|
"eval_loss": 0.21376018226146698, |
|
"eval_runtime": 0.1845, |
|
"eval_samples_per_second": 27.101, |
|
"eval_steps_per_second": 5.42, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 1.5590307239970795, |
|
"grad_norm": 0.22278694808483124, |
|
"learning_rate": 2.930175548749645e-05, |
|
"loss": 0.2614, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 1.5590307239970795, |
|
"eval_loss": 0.21383312344551086, |
|
"eval_runtime": 0.1851, |
|
"eval_samples_per_second": 27.013, |
|
"eval_steps_per_second": 5.403, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 1.5688971328215957, |
|
"grad_norm": 0.2054503709077835, |
|
"learning_rate": 2.806927701652029e-05, |
|
"loss": 0.2628, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 1.5688971328215957, |
|
"eval_loss": 0.21245412528514862, |
|
"eval_runtime": 0.1849, |
|
"eval_samples_per_second": 27.044, |
|
"eval_steps_per_second": 5.409, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 1.5787635416461117, |
|
"grad_norm": 0.20330142974853516, |
|
"learning_rate": 2.6859034260355042e-05, |
|
"loss": 0.2587, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.5787635416461117, |
|
"eval_loss": 0.21231678128242493, |
|
"eval_runtime": 0.1841, |
|
"eval_samples_per_second": 27.158, |
|
"eval_steps_per_second": 5.432, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.5886299504706276, |
|
"grad_norm": 0.20486824214458466, |
|
"learning_rate": 2.567140133748118e-05, |
|
"loss": 0.2584, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 1.5886299504706276, |
|
"eval_loss": 0.2128223478794098, |
|
"eval_runtime": 0.2002, |
|
"eval_samples_per_second": 24.971, |
|
"eval_steps_per_second": 4.994, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 1.5984963592951438, |
|
"grad_norm": 0.21578329801559448, |
|
"learning_rate": 2.4506745377073535e-05, |
|
"loss": 0.2581, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 1.5984963592951438, |
|
"eval_loss": 0.2118312120437622, |
|
"eval_runtime": 0.1883, |
|
"eval_samples_per_second": 26.558, |
|
"eval_steps_per_second": 5.312, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 1.60836276811966, |
|
"grad_norm": 0.21013592183589935, |
|
"learning_rate": 2.3365426405511802e-05, |
|
"loss": 0.2613, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 1.60836276811966, |
|
"eval_loss": 0.21296799182891846, |
|
"eval_runtime": 0.1872, |
|
"eval_samples_per_second": 26.709, |
|
"eval_steps_per_second": 5.342, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 1.618229176944176, |
|
"grad_norm": 0.22792735695838928, |
|
"learning_rate": 2.224779723508692e-05, |
|
"loss": 0.2592, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 1.618229176944176, |
|
"eval_loss": 0.2124718874692917, |
|
"eval_runtime": 0.1884, |
|
"eval_samples_per_second": 26.537, |
|
"eval_steps_per_second": 5.307, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 1.6280955857686918, |
|
"grad_norm": 0.2071530818939209, |
|
"learning_rate": 2.1154203354937074e-05, |
|
"loss": 0.2589, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 1.6280955857686918, |
|
"eval_loss": 0.21143141388893127, |
|
"eval_runtime": 0.1867, |
|
"eval_samples_per_second": 26.788, |
|
"eval_steps_per_second": 5.358, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 1.637961994593208, |
|
"grad_norm": 0.2163563370704651, |
|
"learning_rate": 2.0084982824248034e-05, |
|
"loss": 0.2591, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 1.637961994593208, |
|
"eval_loss": 0.2110225409269333, |
|
"eval_runtime": 0.188, |
|
"eval_samples_per_second": 26.59, |
|
"eval_steps_per_second": 5.318, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 1.647828403417724, |
|
"grad_norm": 0.1936330944299698, |
|
"learning_rate": 1.9040466167749727e-05, |
|
"loss": 0.2574, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 1.647828403417724, |
|
"eval_loss": 0.21078595519065857, |
|
"eval_runtime": 0.1871, |
|
"eval_samples_per_second": 26.723, |
|
"eval_steps_per_second": 5.345, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 1.65769481224224, |
|
"grad_norm": 0.20487622916698456, |
|
"learning_rate": 1.802097627354231e-05, |
|
"loss": 0.2602, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 1.65769481224224, |
|
"eval_loss": 0.21095280349254608, |
|
"eval_runtime": 0.1863, |
|
"eval_samples_per_second": 26.838, |
|
"eval_steps_per_second": 5.368, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 1.667561221066756, |
|
"grad_norm": 0.20645365118980408, |
|
"learning_rate": 1.7026828293282892e-05, |
|
"loss": 0.2607, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 1.667561221066756, |
|
"eval_loss": 0.2102348506450653, |
|
"eval_runtime": 0.187, |
|
"eval_samples_per_second": 26.732, |
|
"eval_steps_per_second": 5.346, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 1.6774276298912723, |
|
"grad_norm": 0.21055789291858673, |
|
"learning_rate": 1.605832954476346e-05, |
|
"loss": 0.2567, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.6774276298912723, |
|
"eval_loss": 0.2107050120830536, |
|
"eval_runtime": 0.1878, |
|
"eval_samples_per_second": 26.618, |
|
"eval_steps_per_second": 5.324, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.6872940387157882, |
|
"grad_norm": 0.1864519864320755, |
|
"learning_rate": 1.5115779416911014e-05, |
|
"loss": 0.2568, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 1.6872940387157882, |
|
"eval_loss": 0.21016505360603333, |
|
"eval_runtime": 0.1867, |
|
"eval_samples_per_second": 26.788, |
|
"eval_steps_per_second": 5.358, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 1.6971604475403042, |
|
"grad_norm": 0.2051512748003006, |
|
"learning_rate": 1.4199469277238143e-05, |
|
"loss": 0.2614, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 1.6971604475403042, |
|
"eval_loss": 0.21043357253074646, |
|
"eval_runtime": 0.1874, |
|
"eval_samples_per_second": 26.678, |
|
"eval_steps_per_second": 5.336, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 1.7070268563648203, |
|
"grad_norm": 0.21571016311645508, |
|
"learning_rate": 1.330968238177368e-05, |
|
"loss": 0.2563, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 1.7070268563648203, |
|
"eval_loss": 0.21016483008861542, |
|
"eval_runtime": 0.187, |
|
"eval_samples_per_second": 26.739, |
|
"eval_steps_per_second": 5.348, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 1.7168932651893365, |
|
"grad_norm": 0.2853296399116516, |
|
"learning_rate": 1.2446693787500697e-05, |
|
"loss": 0.2589, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 1.7168932651893365, |
|
"eval_loss": 0.21085257828235626, |
|
"eval_runtime": 0.1856, |
|
"eval_samples_per_second": 26.945, |
|
"eval_steps_per_second": 5.389, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 1.7267596740138524, |
|
"grad_norm": 0.19415363669395447, |
|
"learning_rate": 1.1610770267328852e-05, |
|
"loss": 0.2613, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 1.7267596740138524, |
|
"eval_loss": 0.21094462275505066, |
|
"eval_runtime": 0.1866, |
|
"eval_samples_per_second": 26.798, |
|
"eval_steps_per_second": 5.36, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 1.7366260828383684, |
|
"grad_norm": 0.20800048112869263, |
|
"learning_rate": 1.0802170227627873e-05, |
|
"loss": 0.2583, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 1.7366260828383684, |
|
"eval_loss": 0.21065060794353485, |
|
"eval_runtime": 0.187, |
|
"eval_samples_per_second": 26.739, |
|
"eval_steps_per_second": 5.348, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 1.7464924916628846, |
|
"grad_norm": 0.21770550310611725, |
|
"learning_rate": 1.0021143628347196e-05, |
|
"loss": 0.2634, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 1.7464924916628846, |
|
"eval_loss": 0.2107730358839035, |
|
"eval_runtime": 0.1872, |
|
"eval_samples_per_second": 26.713, |
|
"eval_steps_per_second": 5.343, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 1.7563589004874007, |
|
"grad_norm": 0.20711322128772736, |
|
"learning_rate": 9.26793190574664e-06, |
|
"loss": 0.2586, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 1.7563589004874007, |
|
"eval_loss": 0.21060581505298615, |
|
"eval_runtime": 0.1873, |
|
"eval_samples_per_second": 26.696, |
|
"eval_steps_per_second": 5.339, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 1.7662253093119167, |
|
"grad_norm": 0.1906907856464386, |
|
"learning_rate": 8.542767897761917e-06, |
|
"loss": 0.2579, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 1.7662253093119167, |
|
"eval_loss": 0.21017535030841827, |
|
"eval_runtime": 0.1865, |
|
"eval_samples_per_second": 26.808, |
|
"eval_steps_per_second": 5.362, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 1.7760917181364326, |
|
"grad_norm": 0.23783724009990692, |
|
"learning_rate": 7.845875772028289e-06, |
|
"loss": 0.257, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.7760917181364326, |
|
"eval_loss": 0.21030497550964355, |
|
"eval_runtime": 0.1866, |
|
"eval_samples_per_second": 26.796, |
|
"eval_steps_per_second": 5.359, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.7859581269609488, |
|
"grad_norm": 0.20180200040340424, |
|
"learning_rate": 7.177470956584253e-06, |
|
"loss": 0.2577, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 1.7859581269609488, |
|
"eval_loss": 0.21059055626392365, |
|
"eval_runtime": 3427.3667, |
|
"eval_samples_per_second": 0.001, |
|
"eval_steps_per_second": 0.0, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 1.795824535785465, |
|
"grad_norm": 0.19345524907112122, |
|
"learning_rate": 6.537760073277066e-06, |
|
"loss": 0.2569, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 1.795824535785465, |
|
"eval_loss": 0.21125975251197815, |
|
"eval_runtime": 0.187, |
|
"eval_samples_per_second": 26.737, |
|
"eval_steps_per_second": 5.347, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 1.8056909446099807, |
|
"grad_norm": 0.20590749382972717, |
|
"learning_rate": 5.926940873890263e-06, |
|
"loss": 0.2575, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 1.8056909446099807, |
|
"eval_loss": 0.21117892861366272, |
|
"eval_runtime": 0.1868, |
|
"eval_samples_per_second": 26.762, |
|
"eval_steps_per_second": 5.352, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 1.8155573534344969, |
|
"grad_norm": 0.22785334289073944, |
|
"learning_rate": 5.345202179013353e-06, |
|
"loss": 0.2593, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 1.8155573534344969, |
|
"eval_loss": 0.2109127938747406, |
|
"eval_runtime": 0.1867, |
|
"eval_samples_per_second": 26.785, |
|
"eval_steps_per_second": 5.357, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 1.825423762259013, |
|
"grad_norm": 0.20013581216335297, |
|
"learning_rate": 4.792723819672351e-06, |
|
"loss": 0.2564, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 1.825423762259013, |
|
"eval_loss": 0.21093535423278809, |
|
"eval_runtime": 0.1869, |
|
"eval_samples_per_second": 26.753, |
|
"eval_steps_per_second": 5.351, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 1.835290171083529, |
|
"grad_norm": 0.18782545626163483, |
|
"learning_rate": 4.269676581739079e-06, |
|
"loss": 0.2592, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 1.835290171083529, |
|
"eval_loss": 0.21116788685321808, |
|
"eval_runtime": 0.187, |
|
"eval_samples_per_second": 26.734, |
|
"eval_steps_per_second": 5.347, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 1.845156579908045, |
|
"grad_norm": 0.284829318523407, |
|
"learning_rate": 3.776222153136788e-06, |
|
"loss": 0.2616, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 1.845156579908045, |
|
"eval_loss": 0.2108537256717682, |
|
"eval_runtime": 0.187, |
|
"eval_samples_per_second": 26.739, |
|
"eval_steps_per_second": 5.348, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 1.855022988732561, |
|
"grad_norm": 0.21778038144111633, |
|
"learning_rate": 3.3125130738579922e-06, |
|
"loss": 0.2585, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 1.855022988732561, |
|
"eval_loss": 0.21062783896923065, |
|
"eval_runtime": 0.1869, |
|
"eval_samples_per_second": 26.748, |
|
"eval_steps_per_second": 5.35, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 1.8648893975570773, |
|
"grad_norm": 0.22620098292827606, |
|
"learning_rate": 2.878692688810314e-06, |
|
"loss": 0.2578, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 1.8648893975570773, |
|
"eval_loss": 0.21043607592582703, |
|
"eval_runtime": 0.187, |
|
"eval_samples_per_second": 26.734, |
|
"eval_steps_per_second": 5.347, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 1.8747558063815932, |
|
"grad_norm": 0.2136729210615158, |
|
"learning_rate": 2.4748951035047596e-06, |
|
"loss": 0.2605, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.8747558063815932, |
|
"eval_loss": 0.21055133640766144, |
|
"eval_runtime": 0.1877, |
|
"eval_samples_per_second": 26.643, |
|
"eval_steps_per_second": 5.329, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.8846222152061092, |
|
"grad_norm": 0.19579312205314636, |
|
"learning_rate": 2.101245142600039e-06, |
|
"loss": 0.2592, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 1.8846222152061092, |
|
"eval_loss": 0.21009139716625214, |
|
"eval_runtime": 0.1864, |
|
"eval_samples_per_second": 26.826, |
|
"eval_steps_per_second": 5.365, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 1.8944886240306253, |
|
"grad_norm": 0.2088346630334854, |
|
"learning_rate": 1.7578583113159962e-06, |
|
"loss": 0.2575, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 1.8944886240306253, |
|
"eval_loss": 0.21053215861320496, |
|
"eval_runtime": 0.1864, |
|
"eval_samples_per_second": 26.826, |
|
"eval_steps_per_second": 5.365, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 1.9043550328551415, |
|
"grad_norm": 0.23137839138507843, |
|
"learning_rate": 1.4448407597277392e-06, |
|
"loss": 0.2606, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 1.9043550328551415, |
|
"eval_loss": 0.21052584052085876, |
|
"eval_runtime": 0.1872, |
|
"eval_samples_per_second": 26.706, |
|
"eval_steps_per_second": 5.341, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 1.9142214416796575, |
|
"grad_norm": 0.21073339879512787, |
|
"learning_rate": 1.1622892499519421e-06, |
|
"loss": 0.2558, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 1.9142214416796575, |
|
"eval_loss": 0.21001854538917542, |
|
"eval_runtime": 0.1866, |
|
"eval_samples_per_second": 26.791, |
|
"eval_steps_per_second": 5.358, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 1.9240878505041734, |
|
"grad_norm": 0.20351086556911469, |
|
"learning_rate": 9.102911262349856e-07, |
|
"loss": 0.2534, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 1.9240878505041734, |
|
"eval_loss": 0.20996317267417908, |
|
"eval_runtime": 0.182, |
|
"eval_samples_per_second": 27.467, |
|
"eval_steps_per_second": 5.493, |
|
"step": 9750 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 10134, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 250, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.0792506223580856e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|