|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 7.999927855133107, |
|
"global_step": 55440, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004954906204906205, |
|
"loss": 0.5606, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.000490981240981241, |
|
"loss": 0.4359, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 0.00048647186147186144, |
|
"loss": 0.3852, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 0.000481962481962482, |
|
"loss": 0.3778, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 0.00047745310245310245, |
|
"loss": 0.3523, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 0.0004729437229437229, |
|
"loss": 0.3319, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 0.00046843434343434346, |
|
"loss": 0.3153, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"learning_rate": 0.00046392496392496394, |
|
"loss": 0.3027, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 0.0004594155844155844, |
|
"loss": 0.3063, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"learning_rate": 0.00045490620490620494, |
|
"loss": 0.292, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"learning_rate": 0.0004503968253968254, |
|
"loss": 0.2802, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 0.0004458874458874459, |
|
"loss": 0.2705, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"learning_rate": 0.0004413780663780664, |
|
"loss": 0.2669, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"learning_rate": 0.00043686868686868685, |
|
"loss": 0.2616, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"learning_rate": 0.00043235930735930733, |
|
"loss": 0.2486, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"learning_rate": 0.00042784992784992786, |
|
"loss": 0.2432, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"learning_rate": 0.00042334054834054834, |
|
"loss": 0.2413, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"learning_rate": 0.0004188311688311688, |
|
"loss": 0.2479, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"learning_rate": 0.00041432178932178935, |
|
"loss": 0.2344, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"learning_rate": 0.0004098124098124098, |
|
"loss": 0.2363, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"learning_rate": 0.0004053030303030303, |
|
"loss": 0.2328, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"learning_rate": 0.00040079365079365083, |
|
"loss": 0.237, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"learning_rate": 0.0003962842712842713, |
|
"loss": 0.2284, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"learning_rate": 0.0003917748917748918, |
|
"loss": 0.2286, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"learning_rate": 0.00038726551226551226, |
|
"loss": 0.222, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"learning_rate": 0.00038275613275613274, |
|
"loss": 0.2144, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"learning_rate": 0.0003782467532467532, |
|
"loss": 0.2272, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"learning_rate": 0.00037373737373737375, |
|
"loss": 0.2107, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"learning_rate": 0.0003692279942279942, |
|
"loss": 0.2054, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"learning_rate": 0.0003647186147186147, |
|
"loss": 0.1953, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"learning_rate": 0.00036020923520923523, |
|
"loss": 0.1955, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"learning_rate": 0.0003556998556998557, |
|
"loss": 0.2012, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"learning_rate": 0.0003511904761904762, |
|
"loss": 0.1938, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"learning_rate": 0.0003466810966810967, |
|
"loss": 0.1932, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"learning_rate": 0.0003421717171717172, |
|
"loss": 0.1944, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"learning_rate": 0.00033766233766233767, |
|
"loss": 0.1967, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"learning_rate": 0.00033315295815295815, |
|
"loss": 0.1955, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"learning_rate": 0.0003286435786435786, |
|
"loss": 0.1906, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"learning_rate": 0.0003241341991341991, |
|
"loss": 0.1894, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"learning_rate": 0.00031962481962481964, |
|
"loss": 0.1908, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"learning_rate": 0.0003151154401154401, |
|
"loss": 0.1835, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"learning_rate": 0.0003106060606060606, |
|
"loss": 0.1755, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"learning_rate": 0.0003060966810966811, |
|
"loss": 0.1725, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"learning_rate": 0.0003015873015873016, |
|
"loss": 0.172, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"learning_rate": 0.0002970779220779221, |
|
"loss": 0.172, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"learning_rate": 0.0002925685425685426, |
|
"loss": 0.1649, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"learning_rate": 0.0002880591630591631, |
|
"loss": 0.1683, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"learning_rate": 0.00028354978354978356, |
|
"loss": 0.1696, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"learning_rate": 0.00027904040404040404, |
|
"loss": 0.1695, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"learning_rate": 0.0002745310245310245, |
|
"loss": 0.1705, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"learning_rate": 0.000270021645021645, |
|
"loss": 0.1659, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"learning_rate": 0.0002655122655122655, |
|
"loss": 0.1694, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"learning_rate": 0.000261002886002886, |
|
"loss": 0.1599, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"learning_rate": 0.0002564935064935065, |
|
"loss": 0.1638, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"learning_rate": 0.000251984126984127, |
|
"loss": 0.1662, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"learning_rate": 0.0002474747474747475, |
|
"loss": 0.1587, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 4.11, |
|
"learning_rate": 0.00024296536796536796, |
|
"loss": 0.1453, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 4.18, |
|
"learning_rate": 0.00023845598845598847, |
|
"loss": 0.1505, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"learning_rate": 0.00023394660894660897, |
|
"loss": 0.1498, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"learning_rate": 0.00022943722943722945, |
|
"loss": 0.145, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"learning_rate": 0.00022492784992784992, |
|
"loss": 0.1446, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 4.47, |
|
"learning_rate": 0.00022041847041847043, |
|
"loss": 0.1425, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"learning_rate": 0.0002159090909090909, |
|
"loss": 0.1465, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"learning_rate": 0.0002113997113997114, |
|
"loss": 0.1501, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 4.69, |
|
"learning_rate": 0.00020689033189033191, |
|
"loss": 0.1488, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"learning_rate": 0.0002023809523809524, |
|
"loss": 0.1506, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"learning_rate": 0.00019787157287157287, |
|
"loss": 0.1457, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 4.91, |
|
"learning_rate": 0.00019336219336219337, |
|
"loss": 0.1436, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 4.98, |
|
"learning_rate": 0.00018885281385281385, |
|
"loss": 0.1447, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 5.05, |
|
"learning_rate": 0.00018434343434343435, |
|
"loss": 0.1381, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 5.12, |
|
"learning_rate": 0.00017983405483405486, |
|
"loss": 0.1297, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 5.19, |
|
"learning_rate": 0.00017532467532467534, |
|
"loss": 0.1284, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 5.27, |
|
"learning_rate": 0.0001708152958152958, |
|
"loss": 0.1375, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 5.34, |
|
"learning_rate": 0.00016630591630591632, |
|
"loss": 0.1326, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 5.41, |
|
"learning_rate": 0.0001617965367965368, |
|
"loss": 0.1351, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 5.48, |
|
"learning_rate": 0.00015728715728715727, |
|
"loss": 0.1338, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 5.56, |
|
"learning_rate": 0.0001527777777777778, |
|
"loss": 0.1303, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 5.63, |
|
"learning_rate": 0.00014826839826839828, |
|
"loss": 0.1311, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 5.7, |
|
"learning_rate": 0.00014375901875901876, |
|
"loss": 0.1332, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 5.77, |
|
"learning_rate": 0.00013924963924963926, |
|
"loss": 0.132, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 5.84, |
|
"learning_rate": 0.00013474025974025974, |
|
"loss": 0.1299, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 5.92, |
|
"learning_rate": 0.00013023088023088021, |
|
"loss": 0.128, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 5.99, |
|
"learning_rate": 0.00012572150072150075, |
|
"loss": 0.131, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 6.06, |
|
"learning_rate": 0.00012121212121212122, |
|
"loss": 0.1173, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 6.13, |
|
"learning_rate": 0.0001167027417027417, |
|
"loss": 0.1195, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 6.2, |
|
"learning_rate": 0.00011219336219336219, |
|
"loss": 0.1184, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 6.28, |
|
"learning_rate": 0.0001076839826839827, |
|
"loss": 0.1226, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 6.35, |
|
"learning_rate": 0.00010317460317460317, |
|
"loss": 0.1182, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 6.42, |
|
"learning_rate": 9.866522366522366e-05, |
|
"loss": 0.1249, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 6.49, |
|
"learning_rate": 9.415584415584417e-05, |
|
"loss": 0.1187, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 6.57, |
|
"learning_rate": 8.964646464646464e-05, |
|
"loss": 0.1214, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 6.64, |
|
"learning_rate": 8.513708513708513e-05, |
|
"loss": 0.1194, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 6.71, |
|
"learning_rate": 8.062770562770564e-05, |
|
"loss": 0.1187, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 6.78, |
|
"learning_rate": 7.611832611832612e-05, |
|
"loss": 0.1168, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 6.85, |
|
"learning_rate": 7.16089466089466e-05, |
|
"loss": 0.12, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 6.93, |
|
"learning_rate": 6.709956709956711e-05, |
|
"loss": 0.1198, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"learning_rate": 6.259018759018759e-05, |
|
"loss": 0.1189, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 7.07, |
|
"learning_rate": 5.808080808080808e-05, |
|
"loss": 0.1116, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 7.14, |
|
"learning_rate": 5.357142857142857e-05, |
|
"loss": 0.1114, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 7.21, |
|
"learning_rate": 4.9062049062049066e-05, |
|
"loss": 0.1108, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 7.29, |
|
"learning_rate": 4.455266955266955e-05, |
|
"loss": 0.1101, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 7.36, |
|
"learning_rate": 4.004329004329004e-05, |
|
"loss": 0.1062, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 7.43, |
|
"learning_rate": 3.553391053391054e-05, |
|
"loss": 0.1141, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"learning_rate": 3.102453102453102e-05, |
|
"loss": 0.1106, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 7.58, |
|
"learning_rate": 2.6515151515151516e-05, |
|
"loss": 0.1105, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 7.65, |
|
"learning_rate": 2.2005772005772003e-05, |
|
"loss": 0.11, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 7.72, |
|
"learning_rate": 1.7496392496392497e-05, |
|
"loss": 0.1117, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 7.79, |
|
"learning_rate": 1.2987012987012988e-05, |
|
"loss": 0.111, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 7.86, |
|
"learning_rate": 8.477633477633478e-06, |
|
"loss": 0.1144, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 7.94, |
|
"learning_rate": 3.968253968253968e-06, |
|
"loss": 0.1126, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"step": 55440, |
|
"total_flos": 9.377895890180506e+17, |
|
"train_loss": 0.18044453823205198, |
|
"train_runtime": 140707.6037, |
|
"train_samples_per_second": 12.609, |
|
"train_steps_per_second": 0.394 |
|
} |
|
], |
|
"max_steps": 55440, |
|
"num_train_epochs": 8, |
|
"total_flos": 9.377895890180506e+17, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|