|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 20.0, |
|
"eval_steps": 500, |
|
"global_step": 371940, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 9.375e-06, |
|
"loss": 6.8593, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 1.875e-05, |
|
"loss": 5.3643, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 2.8125e-05, |
|
"loss": 5.0364, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 3.75e-05, |
|
"loss": 4.8218, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 4.6874999999999994e-05, |
|
"loss": 4.6436, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 5.625e-05, |
|
"loss": 4.5074, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 6.5625e-05, |
|
"loss": 4.3958, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 7.5e-05, |
|
"loss": 4.2967, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 8.437499999999999e-05, |
|
"loss": 4.2068, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 9.374999999999999e-05, |
|
"loss": 4.1332, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 0.000103115625, |
|
"loss": 4.0676, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 0.000112490625, |
|
"loss": 4.0093, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 0.000121865625, |
|
"loss": 3.9526, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 0.00013123125, |
|
"loss": 3.8934, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 0.00014060625, |
|
"loss": 3.8436, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"learning_rate": 0.000149971875, |
|
"loss": 3.8065, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"learning_rate": 0.000159346875, |
|
"loss": 3.7709, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 0.00016871249999999996, |
|
"loss": 3.7368, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.34518001634673895, |
|
"eval_loss": 3.9162497520446777, |
|
"eval_runtime": 152.9029, |
|
"eval_samples_per_second": 378.789, |
|
"eval_steps_per_second": 5.919, |
|
"step": 18597 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"learning_rate": 0.000178078125, |
|
"loss": 3.6974, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"learning_rate": 0.00018745312499999998, |
|
"loss": 3.6666, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"learning_rate": 0.00019682812499999998, |
|
"loss": 3.6461, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"learning_rate": 0.00020619374999999998, |
|
"loss": 3.6223, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"learning_rate": 0.00021556874999999998, |
|
"loss": 3.6097, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"learning_rate": 0.00022494374999999998, |
|
"loss": 3.5886, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"learning_rate": 0.00023430937499999997, |
|
"loss": 3.5771, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"learning_rate": 0.00024368437499999997, |
|
"loss": 3.5614, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"learning_rate": 0.00025305, |
|
"loss": 3.5469, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"learning_rate": 0.000262425, |
|
"loss": 3.5355, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"learning_rate": 0.0002718, |
|
"loss": 3.5275, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"learning_rate": 0.000281165625, |
|
"loss": 3.5145, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"learning_rate": 0.00029053124999999994, |
|
"loss": 3.5052, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"learning_rate": 0.00029990624999999993, |
|
"loss": 3.4957, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"learning_rate": 0.000299126316408778, |
|
"loss": 3.4811, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"learning_rate": 0.000298243807730776, |
|
"loss": 3.4709, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"learning_rate": 0.000297361299052774, |
|
"loss": 3.4514, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"learning_rate": 0.00029647967288344996, |
|
"loss": 3.4411, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"learning_rate": 0.000295597164205448, |
|
"loss": 3.4348, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.37482168201575206, |
|
"eval_loss": 3.6327266693115234, |
|
"eval_runtime": 153.9354, |
|
"eval_samples_per_second": 376.249, |
|
"eval_steps_per_second": 5.879, |
|
"step": 37194 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"learning_rate": 0.00029471553803612404, |
|
"loss": 3.3847, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"learning_rate": 0.000293833029358122, |
|
"loss": 3.3766, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"learning_rate": 0.000292951403188798, |
|
"loss": 3.3701, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"learning_rate": 0.000292069777019474, |
|
"loss": 3.3642, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"learning_rate": 0.000291187268341472, |
|
"loss": 3.3595, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"learning_rate": 0.00029030475966347, |
|
"loss": 3.3568, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"learning_rate": 0.000289422250985468, |
|
"loss": 3.3475, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"learning_rate": 0.00028854062481614397, |
|
"loss": 3.3403, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"learning_rate": 0.000287658116138142, |
|
"loss": 3.3384, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"learning_rate": 0.00028677560746014, |
|
"loss": 3.3311, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"learning_rate": 0.000285893981290816, |
|
"loss": 3.3236, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"learning_rate": 0.00028501235512149196, |
|
"loss": 3.3185, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"learning_rate": 0.00028412984644349, |
|
"loss": 3.3144, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"learning_rate": 0.00028324822027416604, |
|
"loss": 3.3129, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"learning_rate": 0.000282365711596164, |
|
"loss": 3.3052, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"learning_rate": 0.000281483202918162, |
|
"loss": 3.2975, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"learning_rate": 0.000280601576748838, |
|
"loss": 3.2951, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"learning_rate": 0.000279719068070836, |
|
"loss": 3.2919, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.3899740583191344, |
|
"eval_loss": 3.508404493331909, |
|
"eval_runtime": 154.0998, |
|
"eval_samples_per_second": 375.847, |
|
"eval_steps_per_second": 5.873, |
|
"step": 55791 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"learning_rate": 0.000278836559392834, |
|
"loss": 3.2736, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"learning_rate": 0.000277954050714832, |
|
"loss": 3.2226, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"learning_rate": 0.00027707154203683, |
|
"loss": 3.2278, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"learning_rate": 0.000276189915867506, |
|
"loss": 3.2315, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"learning_rate": 0.000275307407189504, |
|
"loss": 3.2244, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"learning_rate": 0.00027442578102017997, |
|
"loss": 3.229, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"learning_rate": 0.000273543272342178, |
|
"loss": 3.231, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"learning_rate": 0.00027266076366417604, |
|
"loss": 3.224, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"learning_rate": 0.000271779137494852, |
|
"loss": 3.2256, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"learning_rate": 0.00027089662881685, |
|
"loss": 3.2241, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"learning_rate": 0.000270015002647526, |
|
"loss": 3.2223, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"learning_rate": 0.00026913249396952403, |
|
"loss": 3.2191, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"learning_rate": 0.000268249985291522, |
|
"loss": 3.2179, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"learning_rate": 0.000267368359122198, |
|
"loss": 3.2133, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"learning_rate": 0.00026648585044419603, |
|
"loss": 3.2172, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"learning_rate": 0.000265603341766194, |
|
"loss": 3.2109, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"learning_rate": 0.000264720833088192, |
|
"loss": 3.2077, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"learning_rate": 0.00026383832441018997, |
|
"loss": 3.2076, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"learning_rate": 0.000262956698240866, |
|
"loss": 3.2086, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.3956238680662571, |
|
"eval_loss": 3.4501850605010986, |
|
"eval_runtime": 154.25, |
|
"eval_samples_per_second": 375.481, |
|
"eval_steps_per_second": 5.867, |
|
"step": 74388 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"learning_rate": 0.000262075072071542, |
|
"loss": 3.1628, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"learning_rate": 0.00026119256339354003, |
|
"loss": 3.1437, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"learning_rate": 0.000260310054715538, |
|
"loss": 3.1446, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"learning_rate": 0.000259428428546214, |
|
"loss": 3.1476, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"learning_rate": 0.00025854591986821204, |
|
"loss": 3.1475, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"learning_rate": 0.00025766341119021, |
|
"loss": 3.1509, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"learning_rate": 0.000256780902512208, |
|
"loss": 3.1536, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 4.41, |
|
"learning_rate": 0.00025589839383420603, |
|
"loss": 3.1514, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"learning_rate": 0.000255015885156204, |
|
"loss": 3.149, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"learning_rate": 0.00025413425898688, |
|
"loss": 3.1531, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 4.57, |
|
"learning_rate": 0.000253251750308878, |
|
"loss": 3.1487, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"learning_rate": 0.000252370124139554, |
|
"loss": 3.1467, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"learning_rate": 0.000251487615461552, |
|
"loss": 3.1496, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"learning_rate": 0.00025060510678355004, |
|
"loss": 3.1494, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"learning_rate": 0.000249723480614226, |
|
"loss": 3.148, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"learning_rate": 0.000248840971936224, |
|
"loss": 3.1496, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 4.89, |
|
"learning_rate": 0.00024795846325822204, |
|
"loss": 3.1477, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"learning_rate": 0.00024707595458022, |
|
"loss": 3.1474, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.3994938234828981, |
|
"eval_loss": 3.423523426055908, |
|
"eval_runtime": 154.6648, |
|
"eval_samples_per_second": 374.474, |
|
"eval_steps_per_second": 5.851, |
|
"step": 92985 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"learning_rate": 0.000246194328410896, |
|
"loss": 3.1438, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 5.05, |
|
"learning_rate": 0.000245311819732894, |
|
"loss": 3.0764, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 5.11, |
|
"learning_rate": 0.000244429311054892, |
|
"loss": 3.0802, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 5.16, |
|
"learning_rate": 0.000243547684885568, |
|
"loss": 3.0875, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 5.22, |
|
"learning_rate": 0.00024266605871624405, |
|
"loss": 3.0881, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 5.27, |
|
"learning_rate": 0.00024178355003824203, |
|
"loss": 3.0951, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 5.32, |
|
"learning_rate": 0.00024090192386891804, |
|
"loss": 3.0977, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 5.38, |
|
"learning_rate": 0.00024001941519091602, |
|
"loss": 3.093, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 5.43, |
|
"learning_rate": 0.00023913778902159204, |
|
"loss": 3.097, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 5.48, |
|
"learning_rate": 0.00023825528034359002, |
|
"loss": 3.0942, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 5.54, |
|
"learning_rate": 0.00023737277166558803, |
|
"loss": 3.0994, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 5.59, |
|
"learning_rate": 0.000236491145496264, |
|
"loss": 3.1007, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 5.65, |
|
"learning_rate": 0.00023560863681826202, |
|
"loss": 3.1006, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 5.7, |
|
"learning_rate": 0.00023472612814026003, |
|
"loss": 3.1013, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"learning_rate": 0.00023384450197093602, |
|
"loss": 3.103, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 5.81, |
|
"learning_rate": 0.00023296199329293402, |
|
"loss": 3.1001, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 5.86, |
|
"learning_rate": 0.00023208036712361004, |
|
"loss": 3.0997, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 5.91, |
|
"learning_rate": 0.00023119785844560804, |
|
"loss": 3.0997, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 5.97, |
|
"learning_rate": 0.00023031623227628403, |
|
"loss": 3.1012, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.4019542992689383, |
|
"eval_loss": 3.4030864238739014, |
|
"eval_runtime": 154.526, |
|
"eval_samples_per_second": 374.811, |
|
"eval_steps_per_second": 5.857, |
|
"step": 111582 |
|
}, |
|
{ |
|
"epoch": 6.02, |
|
"learning_rate": 0.00022943372359828204, |
|
"loss": 3.0671, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 6.08, |
|
"learning_rate": 0.00022855121492028005, |
|
"loss": 3.0316, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 6.13, |
|
"learning_rate": 0.00022766870624227803, |
|
"loss": 3.0381, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 6.18, |
|
"learning_rate": 0.00022678619756427604, |
|
"loss": 3.0465, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 6.24, |
|
"learning_rate": 0.00022590368888627402, |
|
"loss": 3.0474, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 6.29, |
|
"learning_rate": 0.00022502294522562804, |
|
"loss": 3.0522, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 6.35, |
|
"learning_rate": 0.00022414043654762602, |
|
"loss": 3.0489, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"learning_rate": 0.00022325881037830203, |
|
"loss": 3.0538, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 6.45, |
|
"learning_rate": 0.0002223763017003, |
|
"loss": 3.0525, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 6.51, |
|
"learning_rate": 0.00022149379302229805, |
|
"loss": 3.0535, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 6.56, |
|
"learning_rate": 0.00022061128434429605, |
|
"loss": 3.0572, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 6.61, |
|
"learning_rate": 0.00021972877566629403, |
|
"loss": 3.06, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 6.67, |
|
"learning_rate": 0.00021884714949697005, |
|
"loss": 3.0585, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 6.72, |
|
"learning_rate": 0.00021796552332764603, |
|
"loss": 3.0623, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 6.78, |
|
"learning_rate": 0.00021708301464964404, |
|
"loss": 3.0628, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 6.83, |
|
"learning_rate": 0.00021620138848032003, |
|
"loss": 3.0624, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 6.88, |
|
"learning_rate": 0.00021531887980231804, |
|
"loss": 3.0592, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 6.94, |
|
"learning_rate": 0.00021443637112431604, |
|
"loss": 3.0606, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 6.99, |
|
"learning_rate": 0.00021355386244631402, |
|
"loss": 3.0638, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.4030296153303013, |
|
"eval_loss": 3.4128024578094482, |
|
"eval_runtime": 154.0025, |
|
"eval_samples_per_second": 376.085, |
|
"eval_steps_per_second": 5.877, |
|
"step": 130179 |
|
}, |
|
{ |
|
"epoch": 7.04, |
|
"learning_rate": 0.00021267311878566802, |
|
"loss": 3.0018, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 7.1, |
|
"learning_rate": 0.00021179061010766602, |
|
"loss": 2.9983, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 7.15, |
|
"learning_rate": 0.00021090810142966406, |
|
"loss": 3.0046, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 7.21, |
|
"learning_rate": 0.00021002559275166204, |
|
"loss": 3.0095, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 7.26, |
|
"learning_rate": 0.00020914396658233805, |
|
"loss": 3.0113, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 7.31, |
|
"learning_rate": 0.00020826145790433606, |
|
"loss": 3.0139, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 7.37, |
|
"learning_rate": 0.00020737894922633404, |
|
"loss": 3.0141, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 7.42, |
|
"learning_rate": 0.00020649732305701006, |
|
"loss": 3.0165, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 7.47, |
|
"learning_rate": 0.00020561481437900804, |
|
"loss": 3.0232, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 7.53, |
|
"learning_rate": 0.00020473230570100604, |
|
"loss": 3.0236, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 7.58, |
|
"learning_rate": 0.00020384979702300403, |
|
"loss": 3.0217, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"learning_rate": 0.00020296817085368004, |
|
"loss": 3.0193, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 7.69, |
|
"learning_rate": 0.00020208654468435603, |
|
"loss": 3.0279, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 7.74, |
|
"learning_rate": 0.00020120403600635403, |
|
"loss": 3.0243, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 7.8, |
|
"learning_rate": 0.00020032152732835207, |
|
"loss": 3.0241, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 7.85, |
|
"learning_rate": 0.00019943901865035005, |
|
"loss": 3.0299, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 7.9, |
|
"learning_rate": 0.00019855650997234806, |
|
"loss": 3.0313, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 7.96, |
|
"learning_rate": 0.00019767400129434606, |
|
"loss": 3.0262, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.4046377523139853, |
|
"eval_loss": 3.3997907638549805, |
|
"eval_runtime": 154.087, |
|
"eval_samples_per_second": 375.879, |
|
"eval_steps_per_second": 5.873, |
|
"step": 148776 |
|
}, |
|
{ |
|
"epoch": 8.01, |
|
"learning_rate": 0.00019679237512502205, |
|
"loss": 3.0108, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 8.07, |
|
"learning_rate": 0.00019590986644702006, |
|
"loss": 2.9607, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 8.12, |
|
"learning_rate": 0.00019502735776901804, |
|
"loss": 2.9699, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 8.17, |
|
"learning_rate": 0.00019414484909101605, |
|
"loss": 2.9736, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 8.23, |
|
"learning_rate": 0.00019326322292169203, |
|
"loss": 2.9743, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"learning_rate": 0.00019238159675236805, |
|
"loss": 2.9832, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 8.33, |
|
"learning_rate": 0.00019149908807436603, |
|
"loss": 2.982, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 8.39, |
|
"learning_rate": 0.00019061657939636404, |
|
"loss": 2.9869, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 8.44, |
|
"learning_rate": 0.00018973495322704008, |
|
"loss": 2.9853, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"learning_rate": 0.00018885244454903806, |
|
"loss": 2.9898, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 8.55, |
|
"learning_rate": 0.00018796993587103607, |
|
"loss": 2.9966, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"learning_rate": 0.00018708742719303405, |
|
"loss": 2.9916, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 8.66, |
|
"learning_rate": 0.00018620580102371006, |
|
"loss": 2.9973, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 8.71, |
|
"learning_rate": 0.00018532329234570804, |
|
"loss": 2.9947, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 8.76, |
|
"learning_rate": 0.00018444166617638405, |
|
"loss": 2.9926, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 8.82, |
|
"learning_rate": 0.00018356004000706004, |
|
"loss": 2.9954, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 8.87, |
|
"learning_rate": 0.00018267753132905805, |
|
"loss": 2.9968, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 8.93, |
|
"learning_rate": 0.00018179502265105606, |
|
"loss": 2.9997, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 8.98, |
|
"learning_rate": 0.00018091251397305404, |
|
"loss": 3.0016, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.40699604418590996, |
|
"eval_loss": 3.373080015182495, |
|
"eval_runtime": 153.8917, |
|
"eval_samples_per_second": 376.355, |
|
"eval_steps_per_second": 5.881, |
|
"step": 167373 |
|
}, |
|
{ |
|
"epoch": 9.03, |
|
"learning_rate": 0.00018003088780373005, |
|
"loss": 2.9567, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 9.09, |
|
"learning_rate": 0.00017914926163440606, |
|
"loss": 2.9362, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 9.14, |
|
"learning_rate": 0.00017826675295640407, |
|
"loss": 2.9438, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 9.2, |
|
"learning_rate": 0.00017738424427840205, |
|
"loss": 2.9495, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"learning_rate": 0.00017650261810907807, |
|
"loss": 2.9516, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 9.3, |
|
"learning_rate": 0.00017562010943107607, |
|
"loss": 2.9533, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 9.36, |
|
"learning_rate": 0.00017473760075307405, |
|
"loss": 2.9566, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 9.41, |
|
"learning_rate": 0.00017385509207507206, |
|
"loss": 2.9633, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 9.46, |
|
"learning_rate": 0.00017297258339707004, |
|
"loss": 2.9596, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 9.52, |
|
"learning_rate": 0.00017209095722774606, |
|
"loss": 2.9628, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 9.57, |
|
"learning_rate": 0.00017120844854974404, |
|
"loss": 2.9617, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 9.63, |
|
"learning_rate": 0.00017032770488909806, |
|
"loss": 2.9647, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 9.68, |
|
"learning_rate": 0.00016944519621109604, |
|
"loss": 2.964, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 9.73, |
|
"learning_rate": 0.00016856268753309407, |
|
"loss": 2.9676, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 9.79, |
|
"learning_rate": 0.00016768017885509208, |
|
"loss": 2.9731, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 9.84, |
|
"learning_rate": 0.00016679767017709006, |
|
"loss": 2.9734, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 9.89, |
|
"learning_rate": 0.00016591604400776607, |
|
"loss": 2.9745, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 9.95, |
|
"learning_rate": 0.00016503353532976406, |
|
"loss": 2.9715, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.40619785412436715, |
|
"eval_loss": 3.4057776927948, |
|
"eval_runtime": 153.2918, |
|
"eval_samples_per_second": 377.828, |
|
"eval_steps_per_second": 5.904, |
|
"step": 185970 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"learning_rate": 0.00016415190916044007, |
|
"loss": 2.97, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 10.06, |
|
"learning_rate": 0.00016326940048243805, |
|
"loss": 2.9101, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 10.11, |
|
"learning_rate": 0.00016238777431311406, |
|
"loss": 2.9121, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 10.16, |
|
"learning_rate": 0.00016150703065246806, |
|
"loss": 2.9209, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 10.22, |
|
"learning_rate": 0.00016062452197446606, |
|
"loss": 2.922, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 10.27, |
|
"learning_rate": 0.00015974201329646404, |
|
"loss": 2.9243, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 10.32, |
|
"learning_rate": 0.00015885950461846205, |
|
"loss": 2.9304, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 10.38, |
|
"learning_rate": 0.0001579769959404601, |
|
"loss": 2.9305, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 10.43, |
|
"learning_rate": 0.00015709448726245807, |
|
"loss": 2.9363, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 10.49, |
|
"learning_rate": 0.00015621286109313408, |
|
"loss": 2.9379, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 10.54, |
|
"learning_rate": 0.0001553303524151321, |
|
"loss": 2.941, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 10.59, |
|
"learning_rate": 0.00015444872624580808, |
|
"loss": 2.9366, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 10.65, |
|
"learning_rate": 0.00015356621756780608, |
|
"loss": 2.9413, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 10.7, |
|
"learning_rate": 0.00015268370888980406, |
|
"loss": 2.9473, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 10.75, |
|
"learning_rate": 0.00015180120021180207, |
|
"loss": 2.9472, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 10.81, |
|
"learning_rate": 0.00015091957404247806, |
|
"loss": 2.9504, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 10.86, |
|
"learning_rate": 0.00015003706536447607, |
|
"loss": 2.9458, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 10.92, |
|
"learning_rate": 0.00014915455668647407, |
|
"loss": 2.9471, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 10.97, |
|
"learning_rate": 0.00014827293051715006, |
|
"loss": 2.9481, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.4068837157806495, |
|
"eval_loss": 3.38750958442688, |
|
"eval_runtime": 153.5658, |
|
"eval_samples_per_second": 377.154, |
|
"eval_steps_per_second": 5.893, |
|
"step": 204567 |
|
}, |
|
{ |
|
"epoch": 11.02, |
|
"learning_rate": 0.00014739042183914807, |
|
"loss": 2.9156, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 11.08, |
|
"learning_rate": 0.00014650879566982408, |
|
"loss": 2.8891, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 11.13, |
|
"learning_rate": 0.00014562628699182206, |
|
"loss": 2.8942, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 11.18, |
|
"learning_rate": 0.00014474554333117608, |
|
"loss": 2.9003, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 11.24, |
|
"learning_rate": 0.0001438630346531741, |
|
"loss": 2.8988, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 11.29, |
|
"learning_rate": 0.00014298052597517207, |
|
"loss": 2.9105, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 11.35, |
|
"learning_rate": 0.00014209889980584808, |
|
"loss": 2.9077, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 11.4, |
|
"learning_rate": 0.00014121639112784606, |
|
"loss": 2.9168, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 11.45, |
|
"learning_rate": 0.00014033388244984407, |
|
"loss": 2.9148, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 11.51, |
|
"learning_rate": 0.00013945137377184208, |
|
"loss": 2.9142, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 11.56, |
|
"learning_rate": 0.0001385697476025181, |
|
"loss": 2.9159, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 11.61, |
|
"learning_rate": 0.00013768723892451607, |
|
"loss": 2.9195, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 11.67, |
|
"learning_rate": 0.00013680473024651408, |
|
"loss": 2.9189, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 11.72, |
|
"learning_rate": 0.00013592398658586807, |
|
"loss": 2.9207, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 11.78, |
|
"learning_rate": 0.00013504147790786608, |
|
"loss": 2.9228, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 11.83, |
|
"learning_rate": 0.0001341589692298641, |
|
"loss": 2.9244, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 11.88, |
|
"learning_rate": 0.00013327734306054008, |
|
"loss": 2.9194, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 11.94, |
|
"learning_rate": 0.00013239483438253808, |
|
"loss": 2.9257, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 11.99, |
|
"learning_rate": 0.0001315123257045361, |
|
"loss": 2.9243, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.407012167880445, |
|
"eval_loss": 3.4070396423339844, |
|
"eval_runtime": 153.2437, |
|
"eval_samples_per_second": 377.947, |
|
"eval_steps_per_second": 5.906, |
|
"step": 223164 |
|
}, |
|
{ |
|
"epoch": 12.04, |
|
"learning_rate": 0.00013063069953521208, |
|
"loss": 2.874, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 12.1, |
|
"learning_rate": 0.0001297481908572101, |
|
"loss": 2.8707, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 12.15, |
|
"learning_rate": 0.0001288656821792081, |
|
"loss": 2.8776, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 12.21, |
|
"learning_rate": 0.00012798317350120608, |
|
"loss": 2.876, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 12.26, |
|
"learning_rate": 0.0001271015473318821, |
|
"loss": 2.8805, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 12.31, |
|
"learning_rate": 0.0001262190386538801, |
|
"loss": 2.8861, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 12.37, |
|
"learning_rate": 0.00012533741248455608, |
|
"loss": 2.8871, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 12.42, |
|
"learning_rate": 0.0001244549038065541, |
|
"loss": 2.8902, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 12.48, |
|
"learning_rate": 0.0001235723951285521, |
|
"loss": 2.8965, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 12.53, |
|
"learning_rate": 0.00012269076895922809, |
|
"loss": 2.8889, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 12.58, |
|
"learning_rate": 0.0001218082602812261, |
|
"loss": 2.9003, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 12.64, |
|
"learning_rate": 0.00012092575160322409, |
|
"loss": 2.896, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 12.69, |
|
"learning_rate": 0.00012004324292522208, |
|
"loss": 2.8995, |
|
"step": 236000 |
|
}, |
|
{ |
|
"epoch": 12.74, |
|
"learning_rate": 0.00011916161675589808, |
|
"loss": 2.8987, |
|
"step": 237000 |
|
}, |
|
{ |
|
"epoch": 12.8, |
|
"learning_rate": 0.00011827910807789609, |
|
"loss": 2.8998, |
|
"step": 238000 |
|
}, |
|
{ |
|
"epoch": 12.85, |
|
"learning_rate": 0.0001173965993998941, |
|
"loss": 2.9073, |
|
"step": 239000 |
|
}, |
|
{ |
|
"epoch": 12.91, |
|
"learning_rate": 0.0001165149732305701, |
|
"loss": 2.9013, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 12.96, |
|
"learning_rate": 0.00011563246455256809, |
|
"loss": 2.9047, |
|
"step": 241000 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.4079228207113693, |
|
"eval_loss": 3.4015119075775146, |
|
"eval_runtime": 153.5554, |
|
"eval_samples_per_second": 377.18, |
|
"eval_steps_per_second": 5.894, |
|
"step": 241761 |
|
}, |
|
{ |
|
"epoch": 13.01, |
|
"learning_rate": 0.00011475083838324409, |
|
"loss": 2.8895, |
|
"step": 242000 |
|
}, |
|
{ |
|
"epoch": 13.07, |
|
"learning_rate": 0.00011386832970524209, |
|
"loss": 2.8458, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 13.12, |
|
"learning_rate": 0.0001129867035359181, |
|
"loss": 2.8516, |
|
"step": 244000 |
|
}, |
|
{ |
|
"epoch": 13.17, |
|
"learning_rate": 0.0001121041948579161, |
|
"loss": 2.8585, |
|
"step": 245000 |
|
}, |
|
{ |
|
"epoch": 13.23, |
|
"learning_rate": 0.0001112225686885921, |
|
"loss": 2.8576, |
|
"step": 246000 |
|
}, |
|
{ |
|
"epoch": 13.28, |
|
"learning_rate": 0.00011034006001059009, |
|
"loss": 2.8645, |
|
"step": 247000 |
|
}, |
|
{ |
|
"epoch": 13.34, |
|
"learning_rate": 0.00010945843384126609, |
|
"loss": 2.87, |
|
"step": 248000 |
|
}, |
|
{ |
|
"epoch": 13.39, |
|
"learning_rate": 0.00010857592516326408, |
|
"loss": 2.8666, |
|
"step": 249000 |
|
}, |
|
{ |
|
"epoch": 13.44, |
|
"learning_rate": 0.0001076934164852621, |
|
"loss": 2.872, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 13.5, |
|
"learning_rate": 0.0001068109078072601, |
|
"loss": 2.8725, |
|
"step": 251000 |
|
}, |
|
{ |
|
"epoch": 13.55, |
|
"learning_rate": 0.0001059292816379361, |
|
"loss": 2.8739, |
|
"step": 252000 |
|
}, |
|
{ |
|
"epoch": 13.6, |
|
"learning_rate": 0.00010504677295993409, |
|
"loss": 2.878, |
|
"step": 253000 |
|
}, |
|
{ |
|
"epoch": 13.66, |
|
"learning_rate": 0.00010416514679061009, |
|
"loss": 2.8767, |
|
"step": 254000 |
|
}, |
|
{ |
|
"epoch": 13.71, |
|
"learning_rate": 0.00010328263811260809, |
|
"loss": 2.877, |
|
"step": 255000 |
|
}, |
|
{ |
|
"epoch": 13.77, |
|
"learning_rate": 0.00010240012943460611, |
|
"loss": 2.882, |
|
"step": 256000 |
|
}, |
|
{ |
|
"epoch": 13.82, |
|
"learning_rate": 0.00010151850326528211, |
|
"loss": 2.8804, |
|
"step": 257000 |
|
}, |
|
{ |
|
"epoch": 13.87, |
|
"learning_rate": 0.0001006368770959581, |
|
"loss": 2.8858, |
|
"step": 258000 |
|
}, |
|
{ |
|
"epoch": 13.93, |
|
"learning_rate": 9.97543684179561e-05, |
|
"loss": 2.882, |
|
"step": 259000 |
|
}, |
|
{ |
|
"epoch": 13.98, |
|
"learning_rate": 9.88718597399541e-05, |
|
"loss": 2.8797, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.40766477441674887, |
|
"eval_loss": 3.411367177963257, |
|
"eval_runtime": 153.8972, |
|
"eval_samples_per_second": 376.342, |
|
"eval_steps_per_second": 5.881, |
|
"step": 260358 |
|
}, |
|
{ |
|
"epoch": 14.03, |
|
"learning_rate": 9.79902335706301e-05, |
|
"loss": 2.8484, |
|
"step": 261000 |
|
}, |
|
{ |
|
"epoch": 14.09, |
|
"learning_rate": 9.71077248926281e-05, |
|
"loss": 2.8351, |
|
"step": 262000 |
|
}, |
|
{ |
|
"epoch": 14.14, |
|
"learning_rate": 9.62252162146261e-05, |
|
"loss": 2.8377, |
|
"step": 263000 |
|
}, |
|
{ |
|
"epoch": 14.2, |
|
"learning_rate": 9.534270753662411e-05, |
|
"loss": 2.8434, |
|
"step": 264000 |
|
}, |
|
{ |
|
"epoch": 14.25, |
|
"learning_rate": 9.446108136730011e-05, |
|
"loss": 2.841, |
|
"step": 265000 |
|
}, |
|
{ |
|
"epoch": 14.3, |
|
"learning_rate": 9.35785726892981e-05, |
|
"loss": 2.844, |
|
"step": 266000 |
|
}, |
|
{ |
|
"epoch": 14.36, |
|
"learning_rate": 9.26960640112961e-05, |
|
"loss": 2.8465, |
|
"step": 267000 |
|
}, |
|
{ |
|
"epoch": 14.41, |
|
"learning_rate": 9.18135553332941e-05, |
|
"loss": 2.8546, |
|
"step": 268000 |
|
}, |
|
{ |
|
"epoch": 14.46, |
|
"learning_rate": 9.09319291639701e-05, |
|
"loss": 2.8524, |
|
"step": 269000 |
|
}, |
|
{ |
|
"epoch": 14.52, |
|
"learning_rate": 9.00503029946461e-05, |
|
"loss": 2.8515, |
|
"step": 270000 |
|
}, |
|
{ |
|
"epoch": 14.57, |
|
"learning_rate": 8.91677943166441e-05, |
|
"loss": 2.8541, |
|
"step": 271000 |
|
}, |
|
{ |
|
"epoch": 14.63, |
|
"learning_rate": 8.82861681473201e-05, |
|
"loss": 2.8572, |
|
"step": 272000 |
|
}, |
|
{ |
|
"epoch": 14.68, |
|
"learning_rate": 8.74036594693181e-05, |
|
"loss": 2.8568, |
|
"step": 273000 |
|
}, |
|
{ |
|
"epoch": 14.73, |
|
"learning_rate": 8.652115079131611e-05, |
|
"loss": 2.8618, |
|
"step": 274000 |
|
}, |
|
{ |
|
"epoch": 14.79, |
|
"learning_rate": 8.563952462199211e-05, |
|
"loss": 2.8609, |
|
"step": 275000 |
|
}, |
|
{ |
|
"epoch": 14.84, |
|
"learning_rate": 8.475701594399011e-05, |
|
"loss": 2.8582, |
|
"step": 276000 |
|
}, |
|
{ |
|
"epoch": 14.89, |
|
"learning_rate": 8.38745072659881e-05, |
|
"loss": 2.8602, |
|
"step": 277000 |
|
}, |
|
{ |
|
"epoch": 14.95, |
|
"learning_rate": 8.29919985879861e-05, |
|
"loss": 2.8651, |
|
"step": 278000 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.40826464303551124, |
|
"eval_loss": 3.4071929454803467, |
|
"eval_runtime": 153.9494, |
|
"eval_samples_per_second": 376.215, |
|
"eval_steps_per_second": 5.879, |
|
"step": 278955 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"learning_rate": 8.21103724186621e-05, |
|
"loss": 2.8635, |
|
"step": 279000 |
|
}, |
|
{ |
|
"epoch": 15.06, |
|
"learning_rate": 8.122786374066012e-05, |
|
"loss": 2.8126, |
|
"step": 280000 |
|
}, |
|
{ |
|
"epoch": 15.11, |
|
"learning_rate": 8.034535506265811e-05, |
|
"loss": 2.8181, |
|
"step": 281000 |
|
}, |
|
{ |
|
"epoch": 15.16, |
|
"learning_rate": 7.946372889333411e-05, |
|
"loss": 2.82, |
|
"step": 282000 |
|
}, |
|
{ |
|
"epoch": 15.22, |
|
"learning_rate": 7.85812202153321e-05, |
|
"loss": 2.826, |
|
"step": 283000 |
|
}, |
|
{ |
|
"epoch": 15.27, |
|
"learning_rate": 7.76995940460081e-05, |
|
"loss": 2.8282, |
|
"step": 284000 |
|
}, |
|
{ |
|
"epoch": 15.33, |
|
"learning_rate": 7.681796787668412e-05, |
|
"loss": 2.8297, |
|
"step": 285000 |
|
}, |
|
{ |
|
"epoch": 15.38, |
|
"learning_rate": 7.593545919868211e-05, |
|
"loss": 2.8314, |
|
"step": 286000 |
|
}, |
|
{ |
|
"epoch": 15.43, |
|
"learning_rate": 7.505383302935811e-05, |
|
"loss": 2.8329, |
|
"step": 287000 |
|
}, |
|
{ |
|
"epoch": 15.49, |
|
"learning_rate": 7.417132435135611e-05, |
|
"loss": 2.8348, |
|
"step": 288000 |
|
}, |
|
{ |
|
"epoch": 15.54, |
|
"learning_rate": 7.328881567335412e-05, |
|
"loss": 2.8374, |
|
"step": 289000 |
|
}, |
|
{ |
|
"epoch": 15.59, |
|
"learning_rate": 7.240630699535211e-05, |
|
"loss": 2.8359, |
|
"step": 290000 |
|
}, |
|
{ |
|
"epoch": 15.65, |
|
"learning_rate": 7.152468082602811e-05, |
|
"loss": 2.8397, |
|
"step": 291000 |
|
}, |
|
{ |
|
"epoch": 15.7, |
|
"learning_rate": 7.064217214802612e-05, |
|
"loss": 2.8381, |
|
"step": 292000 |
|
}, |
|
{ |
|
"epoch": 15.76, |
|
"learning_rate": 6.975966347002411e-05, |
|
"loss": 2.8392, |
|
"step": 293000 |
|
}, |
|
{ |
|
"epoch": 15.81, |
|
"learning_rate": 6.887803730070011e-05, |
|
"loss": 2.8425, |
|
"step": 294000 |
|
}, |
|
{ |
|
"epoch": 15.86, |
|
"learning_rate": 6.799552862269812e-05, |
|
"loss": 2.842, |
|
"step": 295000 |
|
}, |
|
{ |
|
"epoch": 15.92, |
|
"learning_rate": 6.711301994469612e-05, |
|
"loss": 2.8431, |
|
"step": 296000 |
|
}, |
|
{ |
|
"epoch": 15.97, |
|
"learning_rate": 6.623051126669412e-05, |
|
"loss": 2.8434, |
|
"step": 297000 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.40750031273249193, |
|
"eval_loss": 3.424048900604248, |
|
"eval_runtime": 153.7747, |
|
"eval_samples_per_second": 376.642, |
|
"eval_steps_per_second": 5.885, |
|
"step": 297552 |
|
}, |
|
{ |
|
"epoch": 16.02, |
|
"learning_rate": 6.534888509737011e-05, |
|
"loss": 2.8248, |
|
"step": 298000 |
|
}, |
|
{ |
|
"epoch": 16.08, |
|
"learning_rate": 6.446637641936812e-05, |
|
"loss": 2.7992, |
|
"step": 299000 |
|
}, |
|
{ |
|
"epoch": 16.13, |
|
"learning_rate": 6.358386774136611e-05, |
|
"loss": 2.808, |
|
"step": 300000 |
|
}, |
|
{ |
|
"epoch": 16.19, |
|
"learning_rate": 6.270135906336412e-05, |
|
"loss": 2.807, |
|
"step": 301000 |
|
}, |
|
{ |
|
"epoch": 16.24, |
|
"learning_rate": 6.181973289404012e-05, |
|
"loss": 2.8109, |
|
"step": 302000 |
|
}, |
|
{ |
|
"epoch": 16.29, |
|
"learning_rate": 6.093722421603812e-05, |
|
"loss": 2.8098, |
|
"step": 303000 |
|
}, |
|
{ |
|
"epoch": 16.35, |
|
"learning_rate": 6.005648055539212e-05, |
|
"loss": 2.811, |
|
"step": 304000 |
|
}, |
|
{ |
|
"epoch": 16.4, |
|
"learning_rate": 5.917397187739013e-05, |
|
"loss": 2.8134, |
|
"step": 305000 |
|
}, |
|
{ |
|
"epoch": 16.45, |
|
"learning_rate": 5.829146319938812e-05, |
|
"loss": 2.8165, |
|
"step": 306000 |
|
}, |
|
{ |
|
"epoch": 16.51, |
|
"learning_rate": 5.740983703006412e-05, |
|
"loss": 2.8168, |
|
"step": 307000 |
|
}, |
|
{ |
|
"epoch": 16.56, |
|
"learning_rate": 5.652732835206213e-05, |
|
"loss": 2.8195, |
|
"step": 308000 |
|
}, |
|
{ |
|
"epoch": 16.62, |
|
"learning_rate": 5.5644819674060124e-05, |
|
"loss": 2.8171, |
|
"step": 309000 |
|
}, |
|
{ |
|
"epoch": 16.67, |
|
"learning_rate": 5.4763193504736124e-05, |
|
"loss": 2.8199, |
|
"step": 310000 |
|
}, |
|
{ |
|
"epoch": 16.72, |
|
"learning_rate": 5.3880684826734125e-05, |
|
"loss": 2.8202, |
|
"step": 311000 |
|
}, |
|
{ |
|
"epoch": 16.78, |
|
"learning_rate": 5.2998176148732126e-05, |
|
"loss": 2.8227, |
|
"step": 312000 |
|
}, |
|
{ |
|
"epoch": 16.83, |
|
"learning_rate": 5.211566747073012e-05, |
|
"loss": 2.826, |
|
"step": 313000 |
|
}, |
|
{ |
|
"epoch": 16.88, |
|
"learning_rate": 5.123404130140613e-05, |
|
"loss": 2.8246, |
|
"step": 314000 |
|
}, |
|
{ |
|
"epoch": 16.94, |
|
"learning_rate": 5.035153262340413e-05, |
|
"loss": 2.824, |
|
"step": 315000 |
|
}, |
|
{ |
|
"epoch": 16.99, |
|
"learning_rate": 4.946902394540212e-05, |
|
"loss": 2.8255, |
|
"step": 316000 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.40828197600713634, |
|
"eval_loss": 3.417881727218628, |
|
"eval_runtime": 153.4343, |
|
"eval_samples_per_second": 377.478, |
|
"eval_steps_per_second": 5.898, |
|
"step": 316149 |
|
}, |
|
{ |
|
"epoch": 17.05, |
|
"learning_rate": 4.858739777607813e-05, |
|
"loss": 2.7937, |
|
"step": 317000 |
|
}, |
|
{ |
|
"epoch": 17.1, |
|
"learning_rate": 4.770488909807612e-05, |
|
"loss": 2.7869, |
|
"step": 318000 |
|
}, |
|
{ |
|
"epoch": 17.15, |
|
"learning_rate": 4.682326292875212e-05, |
|
"loss": 2.792, |
|
"step": 319000 |
|
}, |
|
{ |
|
"epoch": 17.21, |
|
"learning_rate": 4.594163675942813e-05, |
|
"loss": 2.7937, |
|
"step": 320000 |
|
}, |
|
{ |
|
"epoch": 17.26, |
|
"learning_rate": 4.505912808142613e-05, |
|
"loss": 2.7897, |
|
"step": 321000 |
|
}, |
|
{ |
|
"epoch": 17.31, |
|
"learning_rate": 4.4176619403424126e-05, |
|
"loss": 2.7967, |
|
"step": 322000 |
|
}, |
|
{ |
|
"epoch": 17.37, |
|
"learning_rate": 4.329411072542213e-05, |
|
"loss": 2.7942, |
|
"step": 323000 |
|
}, |
|
{ |
|
"epoch": 17.42, |
|
"learning_rate": 4.241160204742013e-05, |
|
"loss": 2.8025, |
|
"step": 324000 |
|
}, |
|
{ |
|
"epoch": 17.48, |
|
"learning_rate": 4.152909336941813e-05, |
|
"loss": 2.8014, |
|
"step": 325000 |
|
}, |
|
{ |
|
"epoch": 17.53, |
|
"learning_rate": 4.0648349708772134e-05, |
|
"loss": 2.8022, |
|
"step": 326000 |
|
}, |
|
{ |
|
"epoch": 17.58, |
|
"learning_rate": 3.976584103077013e-05, |
|
"loss": 2.8024, |
|
"step": 327000 |
|
}, |
|
{ |
|
"epoch": 17.64, |
|
"learning_rate": 3.888333235276813e-05, |
|
"loss": 2.8028, |
|
"step": 328000 |
|
}, |
|
{ |
|
"epoch": 17.69, |
|
"learning_rate": 3.800082367476614e-05, |
|
"loss": 2.8021, |
|
"step": 329000 |
|
}, |
|
{ |
|
"epoch": 17.74, |
|
"learning_rate": 3.711831499676413e-05, |
|
"loss": 2.8048, |
|
"step": 330000 |
|
}, |
|
{ |
|
"epoch": 17.8, |
|
"learning_rate": 3.623668882744013e-05, |
|
"loss": 2.8058, |
|
"step": 331000 |
|
}, |
|
{ |
|
"epoch": 17.85, |
|
"learning_rate": 3.535506265811613e-05, |
|
"loss": 2.8078, |
|
"step": 332000 |
|
}, |
|
{ |
|
"epoch": 17.91, |
|
"learning_rate": 3.447255398011413e-05, |
|
"loss": 2.8043, |
|
"step": 333000 |
|
}, |
|
{ |
|
"epoch": 17.96, |
|
"learning_rate": 3.3590045302112134e-05, |
|
"loss": 2.8036, |
|
"step": 334000 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.4081968563364037, |
|
"eval_loss": 3.4256491661071777, |
|
"eval_runtime": 153.5909, |
|
"eval_samples_per_second": 377.093, |
|
"eval_steps_per_second": 5.892, |
|
"step": 334746 |
|
}, |
|
{ |
|
"epoch": 18.01, |
|
"learning_rate": 3.2708419132788134e-05, |
|
"loss": 2.7978, |
|
"step": 335000 |
|
}, |
|
{ |
|
"epoch": 18.07, |
|
"learning_rate": 3.1825910454786135e-05, |
|
"loss": 2.7744, |
|
"step": 336000 |
|
}, |
|
{ |
|
"epoch": 18.12, |
|
"learning_rate": 3.0943401776784136e-05, |
|
"loss": 2.7766, |
|
"step": 337000 |
|
}, |
|
{ |
|
"epoch": 18.17, |
|
"learning_rate": 3.0060893098782134e-05, |
|
"loss": 2.7851, |
|
"step": 338000 |
|
}, |
|
{ |
|
"epoch": 18.23, |
|
"learning_rate": 2.9179266929458137e-05, |
|
"loss": 2.7799, |
|
"step": 339000 |
|
}, |
|
{ |
|
"epoch": 18.28, |
|
"learning_rate": 2.829675825145614e-05, |
|
"loss": 2.7785, |
|
"step": 340000 |
|
}, |
|
{ |
|
"epoch": 18.34, |
|
"learning_rate": 2.7415132082132135e-05, |
|
"loss": 2.7824, |
|
"step": 341000 |
|
}, |
|
{ |
|
"epoch": 18.39, |
|
"learning_rate": 2.6532623404130136e-05, |
|
"loss": 2.7789, |
|
"step": 342000 |
|
}, |
|
{ |
|
"epoch": 18.44, |
|
"learning_rate": 2.565011472612814e-05, |
|
"loss": 2.7847, |
|
"step": 343000 |
|
}, |
|
{ |
|
"epoch": 18.5, |
|
"learning_rate": 2.4767606048126135e-05, |
|
"loss": 2.7849, |
|
"step": 344000 |
|
}, |
|
{ |
|
"epoch": 18.55, |
|
"learning_rate": 2.3885979878802138e-05, |
|
"loss": 2.7844, |
|
"step": 345000 |
|
}, |
|
{ |
|
"epoch": 18.61, |
|
"learning_rate": 2.300435370947814e-05, |
|
"loss": 2.7845, |
|
"step": 346000 |
|
}, |
|
{ |
|
"epoch": 18.66, |
|
"learning_rate": 2.212184503147614e-05, |
|
"loss": 2.7858, |
|
"step": 347000 |
|
}, |
|
{ |
|
"epoch": 18.71, |
|
"learning_rate": 2.123933635347414e-05, |
|
"loss": 2.788, |
|
"step": 348000 |
|
}, |
|
{ |
|
"epoch": 18.77, |
|
"learning_rate": 2.035682767547214e-05, |
|
"loss": 2.7841, |
|
"step": 349000 |
|
}, |
|
{ |
|
"epoch": 18.82, |
|
"learning_rate": 1.947431899747014e-05, |
|
"loss": 2.7931, |
|
"step": 350000 |
|
}, |
|
{ |
|
"epoch": 18.87, |
|
"learning_rate": 1.859357533682414e-05, |
|
"loss": 2.7847, |
|
"step": 351000 |
|
}, |
|
{ |
|
"epoch": 18.93, |
|
"learning_rate": 1.7711066658822143e-05, |
|
"loss": 2.7911, |
|
"step": 352000 |
|
}, |
|
{ |
|
"epoch": 18.98, |
|
"learning_rate": 1.6828557980820144e-05, |
|
"loss": 2.7888, |
|
"step": 353000 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.4083161044939021, |
|
"eval_loss": 3.4363324642181396, |
|
"eval_runtime": 153.9158, |
|
"eval_samples_per_second": 376.297, |
|
"eval_steps_per_second": 5.88, |
|
"step": 353343 |
|
}, |
|
{ |
|
"epoch": 19.04, |
|
"learning_rate": 1.5946931811496144e-05, |
|
"loss": 2.7724, |
|
"step": 354000 |
|
}, |
|
{ |
|
"epoch": 19.09, |
|
"learning_rate": 1.5064423133494145e-05, |
|
"loss": 2.7653, |
|
"step": 355000 |
|
}, |
|
{ |
|
"epoch": 19.14, |
|
"learning_rate": 1.4181914455492144e-05, |
|
"loss": 2.7712, |
|
"step": 356000 |
|
}, |
|
{ |
|
"epoch": 19.2, |
|
"learning_rate": 1.3299405777490143e-05, |
|
"loss": 2.7696, |
|
"step": 357000 |
|
}, |
|
{ |
|
"epoch": 19.25, |
|
"learning_rate": 1.2416897099488144e-05, |
|
"loss": 2.7658, |
|
"step": 358000 |
|
}, |
|
{ |
|
"epoch": 19.3, |
|
"learning_rate": 1.1535270930164146e-05, |
|
"loss": 2.7637, |
|
"step": 359000 |
|
}, |
|
{ |
|
"epoch": 19.36, |
|
"learning_rate": 1.0653644760840146e-05, |
|
"loss": 2.769, |
|
"step": 360000 |
|
}, |
|
{ |
|
"epoch": 19.41, |
|
"learning_rate": 9.771136082838147e-06, |
|
"loss": 2.7734, |
|
"step": 361000 |
|
}, |
|
{ |
|
"epoch": 19.47, |
|
"learning_rate": 8.888627404836147e-06, |
|
"loss": 2.7685, |
|
"step": 362000 |
|
}, |
|
{ |
|
"epoch": 19.52, |
|
"learning_rate": 8.006118726834148e-06, |
|
"loss": 2.7733, |
|
"step": 363000 |
|
}, |
|
{ |
|
"epoch": 19.57, |
|
"learning_rate": 7.125375066188151e-06, |
|
"loss": 2.7755, |
|
"step": 364000 |
|
}, |
|
{ |
|
"epoch": 19.63, |
|
"learning_rate": 6.2428663881861495e-06, |
|
"loss": 2.7693, |
|
"step": 365000 |
|
}, |
|
{ |
|
"epoch": 19.68, |
|
"learning_rate": 5.36035771018415e-06, |
|
"loss": 2.7692, |
|
"step": 366000 |
|
}, |
|
{ |
|
"epoch": 19.73, |
|
"learning_rate": 4.47784903218215e-06, |
|
"loss": 2.7709, |
|
"step": 367000 |
|
}, |
|
{ |
|
"epoch": 19.79, |
|
"learning_rate": 3.5953403541801493e-06, |
|
"loss": 2.7716, |
|
"step": 368000 |
|
}, |
|
{ |
|
"epoch": 19.84, |
|
"learning_rate": 2.7128316761781486e-06, |
|
"loss": 2.7701, |
|
"step": 369000 |
|
}, |
|
{ |
|
"epoch": 19.9, |
|
"learning_rate": 1.8312055068541506e-06, |
|
"loss": 2.7714, |
|
"step": 370000 |
|
}, |
|
{ |
|
"epoch": 19.95, |
|
"learning_rate": 9.486968288521502e-07, |
|
"loss": 2.7701, |
|
"step": 371000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.40813531756892846, |
|
"eval_loss": 3.4419291019439697, |
|
"eval_runtime": 153.8158, |
|
"eval_samples_per_second": 376.541, |
|
"eval_steps_per_second": 5.884, |
|
"step": 371940 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"step": 371940, |
|
"total_flos": 1.56702845389824e+18, |
|
"train_loss": 3.060115090571354, |
|
"train_runtime": 80942.7407, |
|
"train_samples_per_second": 147.036, |
|
"train_steps_per_second": 4.595 |
|
} |
|
], |
|
"logging_steps": 1000, |
|
"max_steps": 371940, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 5000, |
|
"total_flos": 1.56702845389824e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|