|
{ |
|
"best_metric": 1.0159491300582886, |
|
"best_model_checkpoint": "./output/checkpoints/2024-06-11_17-52-37/checkpoint-290", |
|
"epoch": 3.0, |
|
"eval_steps": 1, |
|
"global_step": 291, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.010309278350515464, |
|
"grad_norm": 2.9251132011413574, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 5.1241, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.010309278350515464, |
|
"eval_loss": 5.098568439483643, |
|
"eval_runtime": 10.4217, |
|
"eval_samples_per_second": 11.227, |
|
"eval_steps_per_second": 0.768, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.020618556701030927, |
|
"grad_norm": 3.0249693393707275, |
|
"learning_rate": 2.6666666666666667e-05, |
|
"loss": 5.1113, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.020618556701030927, |
|
"eval_loss": 5.070484638214111, |
|
"eval_runtime": 10.4084, |
|
"eval_samples_per_second": 11.241, |
|
"eval_steps_per_second": 0.769, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.030927835051546393, |
|
"grad_norm": 2.92755389213562, |
|
"learning_rate": 4e-05, |
|
"loss": 5.0735, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.030927835051546393, |
|
"eval_loss": 4.946872234344482, |
|
"eval_runtime": 10.4874, |
|
"eval_samples_per_second": 11.156, |
|
"eval_steps_per_second": 0.763, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.041237113402061855, |
|
"grad_norm": 2.851395845413208, |
|
"learning_rate": 5.333333333333333e-05, |
|
"loss": 4.7837, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.041237113402061855, |
|
"eval_loss": 4.642054557800293, |
|
"eval_runtime": 10.4934, |
|
"eval_samples_per_second": 11.15, |
|
"eval_steps_per_second": 0.762, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.05154639175257732, |
|
"grad_norm": 3.1197285652160645, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 4.6176, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.05154639175257732, |
|
"eval_loss": 4.12791633605957, |
|
"eval_runtime": 10.4684, |
|
"eval_samples_per_second": 11.177, |
|
"eval_steps_per_second": 0.764, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.061855670103092786, |
|
"grad_norm": 3.1395130157470703, |
|
"learning_rate": 8e-05, |
|
"loss": 4.0594, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.061855670103092786, |
|
"eval_loss": 3.4790046215057373, |
|
"eval_runtime": 10.5794, |
|
"eval_samples_per_second": 11.059, |
|
"eval_steps_per_second": 0.756, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.07216494845360824, |
|
"grad_norm": 3.2333834171295166, |
|
"learning_rate": 9.333333333333334e-05, |
|
"loss": 3.404, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.07216494845360824, |
|
"eval_loss": 2.782741069793701, |
|
"eval_runtime": 10.5173, |
|
"eval_samples_per_second": 11.125, |
|
"eval_steps_per_second": 0.761, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.08247422680412371, |
|
"grad_norm": 3.2030985355377197, |
|
"learning_rate": 0.00010666666666666667, |
|
"loss": 2.9029, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.08247422680412371, |
|
"eval_loss": 2.28562068939209, |
|
"eval_runtime": 10.4516, |
|
"eval_samples_per_second": 11.194, |
|
"eval_steps_per_second": 0.765, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.09278350515463918, |
|
"grad_norm": 2.1603894233703613, |
|
"learning_rate": 0.00012, |
|
"loss": 2.4447, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.09278350515463918, |
|
"eval_loss": 1.9469707012176514, |
|
"eval_runtime": 10.465, |
|
"eval_samples_per_second": 11.18, |
|
"eval_steps_per_second": 0.764, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.10309278350515463, |
|
"grad_norm": 1.9389182329177856, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 2.0382, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.10309278350515463, |
|
"eval_loss": 1.6869398355484009, |
|
"eval_runtime": 10.4967, |
|
"eval_samples_per_second": 11.146, |
|
"eval_steps_per_second": 0.762, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.1134020618556701, |
|
"grad_norm": 1.2211577892303467, |
|
"learning_rate": 0.00014666666666666666, |
|
"loss": 1.6799, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.1134020618556701, |
|
"eval_loss": 1.535280466079712, |
|
"eval_runtime": 10.4853, |
|
"eval_samples_per_second": 11.159, |
|
"eval_steps_per_second": 0.763, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.12371134020618557, |
|
"grad_norm": 0.9792207479476929, |
|
"learning_rate": 0.00016, |
|
"loss": 1.5287, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.12371134020618557, |
|
"eval_loss": 1.4445769786834717, |
|
"eval_runtime": 10.5693, |
|
"eval_samples_per_second": 11.07, |
|
"eval_steps_per_second": 0.757, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.13402061855670103, |
|
"grad_norm": 0.9309337139129639, |
|
"learning_rate": 0.00017333333333333334, |
|
"loss": 1.5008, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.13402061855670103, |
|
"eval_loss": 1.3715447187423706, |
|
"eval_runtime": 10.4623, |
|
"eval_samples_per_second": 11.183, |
|
"eval_steps_per_second": 0.765, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.14432989690721648, |
|
"grad_norm": 0.747438907623291, |
|
"learning_rate": 0.0001866666666666667, |
|
"loss": 1.3214, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.14432989690721648, |
|
"eval_loss": 1.3052138090133667, |
|
"eval_runtime": 10.5122, |
|
"eval_samples_per_second": 11.13, |
|
"eval_steps_per_second": 0.761, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.15463917525773196, |
|
"grad_norm": 0.26463842391967773, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3528, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.15463917525773196, |
|
"eval_loss": 1.2763464450836182, |
|
"eval_runtime": 10.4896, |
|
"eval_samples_per_second": 11.154, |
|
"eval_steps_per_second": 0.763, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.16494845360824742, |
|
"grad_norm": 0.17043112218379974, |
|
"learning_rate": 0.00021333333333333333, |
|
"loss": 1.2342, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.16494845360824742, |
|
"eval_loss": 1.2569950819015503, |
|
"eval_runtime": 10.4912, |
|
"eval_samples_per_second": 11.152, |
|
"eval_steps_per_second": 0.763, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.17525773195876287, |
|
"grad_norm": 0.22135448455810547, |
|
"learning_rate": 0.00022666666666666668, |
|
"loss": 1.2609, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.17525773195876287, |
|
"eval_loss": 1.2406532764434814, |
|
"eval_runtime": 10.4356, |
|
"eval_samples_per_second": 11.212, |
|
"eval_steps_per_second": 0.767, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.18556701030927836, |
|
"grad_norm": 0.23974722623825073, |
|
"learning_rate": 0.00024, |
|
"loss": 1.2168, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.18556701030927836, |
|
"eval_loss": 1.2262840270996094, |
|
"eval_runtime": 10.5096, |
|
"eval_samples_per_second": 11.133, |
|
"eval_steps_per_second": 0.761, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.1958762886597938, |
|
"grad_norm": 0.2552329897880554, |
|
"learning_rate": 0.00025333333333333333, |
|
"loss": 1.1393, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.1958762886597938, |
|
"eval_loss": 1.2250046730041504, |
|
"eval_runtime": 10.4642, |
|
"eval_samples_per_second": 11.181, |
|
"eval_steps_per_second": 0.765, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.20618556701030927, |
|
"grad_norm": 0.20864956080913544, |
|
"learning_rate": 0.0002666666666666667, |
|
"loss": 1.1785, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.20618556701030927, |
|
"eval_loss": 1.2701600790023804, |
|
"eval_runtime": 10.4873, |
|
"eval_samples_per_second": 11.156, |
|
"eval_steps_per_second": 0.763, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.21649484536082475, |
|
"grad_norm": 0.294758141040802, |
|
"learning_rate": 0.00028, |
|
"loss": 1.1298, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.21649484536082475, |
|
"eval_loss": 1.2798519134521484, |
|
"eval_runtime": 10.471, |
|
"eval_samples_per_second": 11.174, |
|
"eval_steps_per_second": 0.764, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.2268041237113402, |
|
"grad_norm": 0.2878085970878601, |
|
"learning_rate": 0.0002933333333333333, |
|
"loss": 1.1256, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.2268041237113402, |
|
"eval_loss": 1.2421809434890747, |
|
"eval_runtime": 10.4389, |
|
"eval_samples_per_second": 11.208, |
|
"eval_steps_per_second": 0.766, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.23711340206185566, |
|
"grad_norm": 0.3116024136543274, |
|
"learning_rate": 0.0003066666666666667, |
|
"loss": 1.0976, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.23711340206185566, |
|
"eval_loss": 1.1896957159042358, |
|
"eval_runtime": 10.4699, |
|
"eval_samples_per_second": 11.175, |
|
"eval_steps_per_second": 0.764, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.24742268041237114, |
|
"grad_norm": 0.16644561290740967, |
|
"learning_rate": 0.00032, |
|
"loss": 1.1107, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.24742268041237114, |
|
"eval_loss": 1.1653326749801636, |
|
"eval_runtime": 10.4442, |
|
"eval_samples_per_second": 11.202, |
|
"eval_steps_per_second": 0.766, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.25773195876288657, |
|
"grad_norm": 0.1626550853252411, |
|
"learning_rate": 0.0003333333333333334, |
|
"loss": 1.0437, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.25773195876288657, |
|
"eval_loss": 1.1528897285461426, |
|
"eval_runtime": 10.5335, |
|
"eval_samples_per_second": 11.107, |
|
"eval_steps_per_second": 0.759, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.26804123711340205, |
|
"grad_norm": 0.12697236239910126, |
|
"learning_rate": 0.00034666666666666667, |
|
"loss": 1.0751, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.26804123711340205, |
|
"eval_loss": 1.142869234085083, |
|
"eval_runtime": 10.5783, |
|
"eval_samples_per_second": 11.06, |
|
"eval_steps_per_second": 0.756, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.27835051546391754, |
|
"grad_norm": 0.1465471386909485, |
|
"learning_rate": 0.00036, |
|
"loss": 1.0272, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.27835051546391754, |
|
"eval_loss": 1.1384135484695435, |
|
"eval_runtime": 10.4237, |
|
"eval_samples_per_second": 11.224, |
|
"eval_steps_per_second": 0.767, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.28865979381443296, |
|
"grad_norm": 0.0951693132519722, |
|
"learning_rate": 0.0003733333333333334, |
|
"loss": 1.0647, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.28865979381443296, |
|
"eval_loss": 1.139052391052246, |
|
"eval_runtime": 10.4739, |
|
"eval_samples_per_second": 11.171, |
|
"eval_steps_per_second": 0.764, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.29896907216494845, |
|
"grad_norm": 0.08118291944265366, |
|
"learning_rate": 0.00038666666666666667, |
|
"loss": 0.9765, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.29896907216494845, |
|
"eval_loss": 1.143639326095581, |
|
"eval_runtime": 10.4846, |
|
"eval_samples_per_second": 11.159, |
|
"eval_steps_per_second": 0.763, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.30927835051546393, |
|
"grad_norm": 0.13394220173358917, |
|
"learning_rate": 0.0004, |
|
"loss": 0.9907, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.30927835051546393, |
|
"eval_loss": 1.1391726732254028, |
|
"eval_runtime": 10.4731, |
|
"eval_samples_per_second": 11.172, |
|
"eval_steps_per_second": 0.764, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.31958762886597936, |
|
"grad_norm": 0.08684772998094559, |
|
"learning_rate": 0.0003984674329501916, |
|
"loss": 1.0127, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.31958762886597936, |
|
"eval_loss": 1.1274006366729736, |
|
"eval_runtime": 10.5272, |
|
"eval_samples_per_second": 11.114, |
|
"eval_steps_per_second": 0.76, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.32989690721649484, |
|
"grad_norm": 0.0940256267786026, |
|
"learning_rate": 0.0003969348659003832, |
|
"loss": 1.0585, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.32989690721649484, |
|
"eval_loss": 1.1143975257873535, |
|
"eval_runtime": 10.4934, |
|
"eval_samples_per_second": 11.15, |
|
"eval_steps_per_second": 0.762, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.3402061855670103, |
|
"grad_norm": 0.11045433580875397, |
|
"learning_rate": 0.00039540229885057476, |
|
"loss": 0.9927, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.3402061855670103, |
|
"eval_loss": 1.108750820159912, |
|
"eval_runtime": 10.5002, |
|
"eval_samples_per_second": 11.143, |
|
"eval_steps_per_second": 0.762, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.35051546391752575, |
|
"grad_norm": 0.1321028620004654, |
|
"learning_rate": 0.0003938697318007663, |
|
"loss": 0.9783, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.35051546391752575, |
|
"eval_loss": 1.1098194122314453, |
|
"eval_runtime": 10.4742, |
|
"eval_samples_per_second": 11.17, |
|
"eval_steps_per_second": 0.764, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.36082474226804123, |
|
"grad_norm": 0.08724867552518845, |
|
"learning_rate": 0.00039233716475095787, |
|
"loss": 0.9711, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.36082474226804123, |
|
"eval_loss": 1.1187121868133545, |
|
"eval_runtime": 10.4392, |
|
"eval_samples_per_second": 11.208, |
|
"eval_steps_per_second": 0.766, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.3711340206185567, |
|
"grad_norm": 0.07306953519582748, |
|
"learning_rate": 0.00039080459770114945, |
|
"loss": 0.9849, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.3711340206185567, |
|
"eval_loss": 1.1284778118133545, |
|
"eval_runtime": 10.4898, |
|
"eval_samples_per_second": 11.154, |
|
"eval_steps_per_second": 0.763, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.38144329896907214, |
|
"grad_norm": 0.13695846498012543, |
|
"learning_rate": 0.000389272030651341, |
|
"loss": 0.995, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.38144329896907214, |
|
"eval_loss": 1.1155890226364136, |
|
"eval_runtime": 10.4661, |
|
"eval_samples_per_second": 11.179, |
|
"eval_steps_per_second": 0.764, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.3917525773195876, |
|
"grad_norm": 0.1091228649020195, |
|
"learning_rate": 0.00038773946360153255, |
|
"loss": 0.9668, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.3917525773195876, |
|
"eval_loss": 1.0994709730148315, |
|
"eval_runtime": 10.4468, |
|
"eval_samples_per_second": 11.2, |
|
"eval_steps_per_second": 0.766, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.4020618556701031, |
|
"grad_norm": 0.11535066366195679, |
|
"learning_rate": 0.0003862068965517242, |
|
"loss": 0.972, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.4020618556701031, |
|
"eval_loss": 1.0879756212234497, |
|
"eval_runtime": 10.4551, |
|
"eval_samples_per_second": 11.191, |
|
"eval_steps_per_second": 0.765, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.41237113402061853, |
|
"grad_norm": 0.09540507197380066, |
|
"learning_rate": 0.0003846743295019157, |
|
"loss": 0.9808, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.41237113402061853, |
|
"eval_loss": 1.0829530954360962, |
|
"eval_runtime": 10.5393, |
|
"eval_samples_per_second": 11.101, |
|
"eval_steps_per_second": 0.759, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.422680412371134, |
|
"grad_norm": 0.06799206882715225, |
|
"learning_rate": 0.0003831417624521073, |
|
"loss": 1.014, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.422680412371134, |
|
"eval_loss": 1.079045295715332, |
|
"eval_runtime": 10.6082, |
|
"eval_samples_per_second": 11.029, |
|
"eval_steps_per_second": 0.754, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.4329896907216495, |
|
"grad_norm": 0.13459570705890656, |
|
"learning_rate": 0.00038160919540229887, |
|
"loss": 0.9361, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.4329896907216495, |
|
"eval_loss": 1.0783544778823853, |
|
"eval_runtime": 10.4942, |
|
"eval_samples_per_second": 11.149, |
|
"eval_steps_per_second": 0.762, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.44329896907216493, |
|
"grad_norm": 0.05785546079277992, |
|
"learning_rate": 0.00038007662835249045, |
|
"loss": 1.0128, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.44329896907216493, |
|
"eval_loss": 1.079077124595642, |
|
"eval_runtime": 10.4891, |
|
"eval_samples_per_second": 11.154, |
|
"eval_steps_per_second": 0.763, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.4536082474226804, |
|
"grad_norm": 0.08753732591867447, |
|
"learning_rate": 0.00037854406130268203, |
|
"loss": 0.9054, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.4536082474226804, |
|
"eval_loss": 1.085062861442566, |
|
"eval_runtime": 10.5537, |
|
"eval_samples_per_second": 11.086, |
|
"eval_steps_per_second": 0.758, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.4639175257731959, |
|
"grad_norm": 0.09352148324251175, |
|
"learning_rate": 0.00037701149425287356, |
|
"loss": 0.9352, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.4639175257731959, |
|
"eval_loss": 1.0897772312164307, |
|
"eval_runtime": 10.5611, |
|
"eval_samples_per_second": 11.078, |
|
"eval_steps_per_second": 0.757, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.4742268041237113, |
|
"grad_norm": 0.11168442666530609, |
|
"learning_rate": 0.0003754789272030652, |
|
"loss": 0.9017, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.4742268041237113, |
|
"eval_loss": 1.086748719215393, |
|
"eval_runtime": 10.4662, |
|
"eval_samples_per_second": 11.179, |
|
"eval_steps_per_second": 0.764, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.4845360824742268, |
|
"grad_norm": 0.11507149785757065, |
|
"learning_rate": 0.0003739463601532567, |
|
"loss": 0.9337, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.4845360824742268, |
|
"eval_loss": 1.0779129266738892, |
|
"eval_runtime": 10.5162, |
|
"eval_samples_per_second": 11.126, |
|
"eval_steps_per_second": 0.761, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.4948453608247423, |
|
"grad_norm": 0.0573166199028492, |
|
"learning_rate": 0.0003724137931034483, |
|
"loss": 0.9118, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.4948453608247423, |
|
"eval_loss": 1.0729608535766602, |
|
"eval_runtime": 10.5062, |
|
"eval_samples_per_second": 11.136, |
|
"eval_steps_per_second": 0.761, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.5051546391752577, |
|
"grad_norm": 0.07281557470560074, |
|
"learning_rate": 0.0003708812260536399, |
|
"loss": 0.8977, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.5051546391752577, |
|
"eval_loss": 1.0709476470947266, |
|
"eval_runtime": 10.5051, |
|
"eval_samples_per_second": 11.137, |
|
"eval_steps_per_second": 0.762, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.5154639175257731, |
|
"grad_norm": 0.06198257952928543, |
|
"learning_rate": 0.00036934865900383146, |
|
"loss": 0.9764, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.5154639175257731, |
|
"eval_loss": 1.0682926177978516, |
|
"eval_runtime": 10.4668, |
|
"eval_samples_per_second": 11.178, |
|
"eval_steps_per_second": 0.764, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.5257731958762887, |
|
"grad_norm": 0.09327750653028488, |
|
"learning_rate": 0.000367816091954023, |
|
"loss": 0.9107, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.5257731958762887, |
|
"eval_loss": 1.0677086114883423, |
|
"eval_runtime": 10.5704, |
|
"eval_samples_per_second": 11.069, |
|
"eval_steps_per_second": 0.757, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.5360824742268041, |
|
"grad_norm": 0.06776881217956543, |
|
"learning_rate": 0.00036628352490421457, |
|
"loss": 0.9727, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.5360824742268041, |
|
"eval_loss": 1.0676703453063965, |
|
"eval_runtime": 10.4148, |
|
"eval_samples_per_second": 11.234, |
|
"eval_steps_per_second": 0.768, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.5463917525773195, |
|
"grad_norm": 0.053733568638563156, |
|
"learning_rate": 0.00036475095785440615, |
|
"loss": 0.9463, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.5463917525773195, |
|
"eval_loss": 1.0680439472198486, |
|
"eval_runtime": 10.5221, |
|
"eval_samples_per_second": 11.119, |
|
"eval_steps_per_second": 0.76, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.5567010309278351, |
|
"grad_norm": 0.05755281820893288, |
|
"learning_rate": 0.0003632183908045977, |
|
"loss": 0.9448, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.5567010309278351, |
|
"eval_loss": 1.0675647258758545, |
|
"eval_runtime": 10.4908, |
|
"eval_samples_per_second": 11.153, |
|
"eval_steps_per_second": 0.763, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.5670103092783505, |
|
"grad_norm": 0.055574752390384674, |
|
"learning_rate": 0.00036168582375478925, |
|
"loss": 0.9258, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.5670103092783505, |
|
"eval_loss": 1.0677154064178467, |
|
"eval_runtime": 10.4872, |
|
"eval_samples_per_second": 11.156, |
|
"eval_steps_per_second": 0.763, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.5773195876288659, |
|
"grad_norm": 0.0719585120677948, |
|
"learning_rate": 0.0003601532567049809, |
|
"loss": 0.9722, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.5773195876288659, |
|
"eval_loss": 1.0657141208648682, |
|
"eval_runtime": 10.5334, |
|
"eval_samples_per_second": 11.107, |
|
"eval_steps_per_second": 0.759, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.5876288659793815, |
|
"grad_norm": 0.07973869144916534, |
|
"learning_rate": 0.0003586206896551724, |
|
"loss": 0.869, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.5876288659793815, |
|
"eval_loss": 1.0622156858444214, |
|
"eval_runtime": 10.5573, |
|
"eval_samples_per_second": 11.082, |
|
"eval_steps_per_second": 0.758, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.5979381443298969, |
|
"grad_norm": 0.061950717121362686, |
|
"learning_rate": 0.000357088122605364, |
|
"loss": 0.8958, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.5979381443298969, |
|
"eval_loss": 1.0592986345291138, |
|
"eval_runtime": 10.5297, |
|
"eval_samples_per_second": 11.111, |
|
"eval_steps_per_second": 0.76, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.6082474226804123, |
|
"grad_norm": 0.05361785739660263, |
|
"learning_rate": 0.00035555555555555557, |
|
"loss": 1.0036, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.6082474226804123, |
|
"eval_loss": 1.0570234060287476, |
|
"eval_runtime": 10.5182, |
|
"eval_samples_per_second": 11.124, |
|
"eval_steps_per_second": 0.761, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.6185567010309279, |
|
"grad_norm": 0.06277985125780106, |
|
"learning_rate": 0.00035402298850574715, |
|
"loss": 0.9306, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.6185567010309279, |
|
"eval_loss": 1.055842638015747, |
|
"eval_runtime": 10.5203, |
|
"eval_samples_per_second": 11.121, |
|
"eval_steps_per_second": 0.76, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.6288659793814433, |
|
"grad_norm": 0.0692889615893364, |
|
"learning_rate": 0.00035249042145593873, |
|
"loss": 0.8784, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.6288659793814433, |
|
"eval_loss": 1.0559808015823364, |
|
"eval_runtime": 10.431, |
|
"eval_samples_per_second": 11.217, |
|
"eval_steps_per_second": 0.767, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.6391752577319587, |
|
"grad_norm": 0.05878995358943939, |
|
"learning_rate": 0.00035095785440613026, |
|
"loss": 0.9039, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.6391752577319587, |
|
"eval_loss": 1.056486964225769, |
|
"eval_runtime": 10.4928, |
|
"eval_samples_per_second": 11.151, |
|
"eval_steps_per_second": 0.762, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.6494845360824743, |
|
"grad_norm": 0.05501972511410713, |
|
"learning_rate": 0.0003494252873563219, |
|
"loss": 0.9706, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.6494845360824743, |
|
"eval_loss": 1.0573745965957642, |
|
"eval_runtime": 10.5265, |
|
"eval_samples_per_second": 11.115, |
|
"eval_steps_per_second": 0.76, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.6597938144329897, |
|
"grad_norm": 0.06880860775709152, |
|
"learning_rate": 0.0003478927203065134, |
|
"loss": 0.8881, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.6597938144329897, |
|
"eval_loss": 1.0568649768829346, |
|
"eval_runtime": 10.4978, |
|
"eval_samples_per_second": 11.145, |
|
"eval_steps_per_second": 0.762, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.6701030927835051, |
|
"grad_norm": 0.07319581508636475, |
|
"learning_rate": 0.000346360153256705, |
|
"loss": 0.9685, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.6701030927835051, |
|
"eval_loss": 1.0544594526290894, |
|
"eval_runtime": 10.4691, |
|
"eval_samples_per_second": 11.176, |
|
"eval_steps_per_second": 0.764, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.6804123711340206, |
|
"grad_norm": 0.05939271301031113, |
|
"learning_rate": 0.0003448275862068965, |
|
"loss": 0.9455, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.6804123711340206, |
|
"eval_loss": 1.0524115562438965, |
|
"eval_runtime": 10.4814, |
|
"eval_samples_per_second": 11.163, |
|
"eval_steps_per_second": 0.763, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.6907216494845361, |
|
"grad_norm": 0.05821354687213898, |
|
"learning_rate": 0.00034329501915708816, |
|
"loss": 0.9367, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.6907216494845361, |
|
"eval_loss": 1.050156831741333, |
|
"eval_runtime": 10.4911, |
|
"eval_samples_per_second": 11.152, |
|
"eval_steps_per_second": 0.763, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.7010309278350515, |
|
"grad_norm": 0.05590186268091202, |
|
"learning_rate": 0.0003417624521072797, |
|
"loss": 0.9517, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.7010309278350515, |
|
"eval_loss": 1.0487135648727417, |
|
"eval_runtime": 10.496, |
|
"eval_samples_per_second": 11.147, |
|
"eval_steps_per_second": 0.762, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.711340206185567, |
|
"grad_norm": 0.07057742774486542, |
|
"learning_rate": 0.00034022988505747127, |
|
"loss": 0.9506, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.711340206185567, |
|
"eval_loss": 1.0481261014938354, |
|
"eval_runtime": 10.3816, |
|
"eval_samples_per_second": 11.27, |
|
"eval_steps_per_second": 0.771, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.7216494845360825, |
|
"grad_norm": 0.06246848404407501, |
|
"learning_rate": 0.00033869731800766285, |
|
"loss": 0.921, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.7216494845360825, |
|
"eval_loss": 1.0485072135925293, |
|
"eval_runtime": 10.4515, |
|
"eval_samples_per_second": 11.195, |
|
"eval_steps_per_second": 0.765, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.7319587628865979, |
|
"grad_norm": 0.07238463312387466, |
|
"learning_rate": 0.0003371647509578544, |
|
"loss": 0.9263, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.7319587628865979, |
|
"eval_loss": 1.0510802268981934, |
|
"eval_runtime": 10.4869, |
|
"eval_samples_per_second": 11.157, |
|
"eval_steps_per_second": 0.763, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.7422680412371134, |
|
"grad_norm": 0.06434188038110733, |
|
"learning_rate": 0.000335632183908046, |
|
"loss": 0.8957, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.7422680412371134, |
|
"eval_loss": 1.053918719291687, |
|
"eval_runtime": 10.5261, |
|
"eval_samples_per_second": 11.115, |
|
"eval_steps_per_second": 0.76, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.7525773195876289, |
|
"grad_norm": 0.07609722763299942, |
|
"learning_rate": 0.0003340996168582376, |
|
"loss": 0.9347, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.7525773195876289, |
|
"eval_loss": 1.0527987480163574, |
|
"eval_runtime": 10.4919, |
|
"eval_samples_per_second": 11.151, |
|
"eval_steps_per_second": 0.762, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.7628865979381443, |
|
"grad_norm": 0.09455892443656921, |
|
"learning_rate": 0.00033256704980842917, |
|
"loss": 0.9126, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.7628865979381443, |
|
"eval_loss": 1.0476549863815308, |
|
"eval_runtime": 10.5068, |
|
"eval_samples_per_second": 11.136, |
|
"eval_steps_per_second": 0.761, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.7731958762886598, |
|
"grad_norm": 0.059503089636564255, |
|
"learning_rate": 0.0003310344827586207, |
|
"loss": 0.9136, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.7731958762886598, |
|
"eval_loss": 1.045257568359375, |
|
"eval_runtime": 10.511, |
|
"eval_samples_per_second": 11.131, |
|
"eval_steps_per_second": 0.761, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.7835051546391752, |
|
"grad_norm": 0.054332371801137924, |
|
"learning_rate": 0.00032950191570881227, |
|
"loss": 0.9197, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.7835051546391752, |
|
"eval_loss": 1.0440593957901, |
|
"eval_runtime": 10.4201, |
|
"eval_samples_per_second": 11.228, |
|
"eval_steps_per_second": 0.768, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.7938144329896907, |
|
"grad_norm": 0.0553855374455452, |
|
"learning_rate": 0.00032796934865900385, |
|
"loss": 0.9388, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.7938144329896907, |
|
"eval_loss": 1.0428669452667236, |
|
"eval_runtime": 10.4593, |
|
"eval_samples_per_second": 11.186, |
|
"eval_steps_per_second": 0.765, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.8041237113402062, |
|
"grad_norm": 0.0818842276930809, |
|
"learning_rate": 0.00032643678160919543, |
|
"loss": 0.9308, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.8041237113402062, |
|
"eval_loss": 1.0421578884124756, |
|
"eval_runtime": 10.5367, |
|
"eval_samples_per_second": 11.104, |
|
"eval_steps_per_second": 0.759, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.8144329896907216, |
|
"grad_norm": 0.06866848468780518, |
|
"learning_rate": 0.00032490421455938696, |
|
"loss": 0.849, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.8144329896907216, |
|
"eval_loss": 1.0420634746551514, |
|
"eval_runtime": 10.5688, |
|
"eval_samples_per_second": 11.07, |
|
"eval_steps_per_second": 0.757, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.8247422680412371, |
|
"grad_norm": 0.05988822132349014, |
|
"learning_rate": 0.0003233716475095786, |
|
"loss": 0.9201, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.8247422680412371, |
|
"eval_loss": 1.0427372455596924, |
|
"eval_runtime": 10.5382, |
|
"eval_samples_per_second": 11.102, |
|
"eval_steps_per_second": 0.759, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.8350515463917526, |
|
"grad_norm": 0.05520083010196686, |
|
"learning_rate": 0.0003218390804597701, |
|
"loss": 0.8814, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.8350515463917526, |
|
"eval_loss": 1.0440542697906494, |
|
"eval_runtime": 10.5235, |
|
"eval_samples_per_second": 11.118, |
|
"eval_steps_per_second": 0.76, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.845360824742268, |
|
"grad_norm": 0.07461666315793991, |
|
"learning_rate": 0.0003203065134099617, |
|
"loss": 0.9074, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.845360824742268, |
|
"eval_loss": 1.043821096420288, |
|
"eval_runtime": 10.4636, |
|
"eval_samples_per_second": 11.182, |
|
"eval_steps_per_second": 0.765, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.8556701030927835, |
|
"grad_norm": 0.07602047920227051, |
|
"learning_rate": 0.0003187739463601533, |
|
"loss": 0.9119, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.8556701030927835, |
|
"eval_loss": 1.0416185855865479, |
|
"eval_runtime": 10.5099, |
|
"eval_samples_per_second": 11.132, |
|
"eval_steps_per_second": 0.761, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.865979381443299, |
|
"grad_norm": 0.060698170214891434, |
|
"learning_rate": 0.00031724137931034486, |
|
"loss": 0.9724, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.865979381443299, |
|
"eval_loss": 1.0396265983581543, |
|
"eval_runtime": 10.5008, |
|
"eval_samples_per_second": 11.142, |
|
"eval_steps_per_second": 0.762, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.8762886597938144, |
|
"grad_norm": 0.05891481041908264, |
|
"learning_rate": 0.00031570881226053644, |
|
"loss": 0.8571, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.8762886597938144, |
|
"eval_loss": 1.0386606454849243, |
|
"eval_runtime": 10.4816, |
|
"eval_samples_per_second": 11.162, |
|
"eval_steps_per_second": 0.763, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.8865979381443299, |
|
"grad_norm": 0.08538112789392471, |
|
"learning_rate": 0.00031417624521072797, |
|
"loss": 0.8994, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.8865979381443299, |
|
"eval_loss": 1.0381946563720703, |
|
"eval_runtime": 10.4589, |
|
"eval_samples_per_second": 11.187, |
|
"eval_steps_per_second": 0.765, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.8969072164948454, |
|
"grad_norm": 0.06084613874554634, |
|
"learning_rate": 0.0003126436781609196, |
|
"loss": 0.881, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.8969072164948454, |
|
"eval_loss": 1.038084626197815, |
|
"eval_runtime": 10.456, |
|
"eval_samples_per_second": 11.19, |
|
"eval_steps_per_second": 0.765, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.9072164948453608, |
|
"grad_norm": 0.06337710469961166, |
|
"learning_rate": 0.0003111111111111111, |
|
"loss": 0.8843, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.9072164948453608, |
|
"eval_loss": 1.0385206937789917, |
|
"eval_runtime": 10.4563, |
|
"eval_samples_per_second": 11.189, |
|
"eval_steps_per_second": 0.765, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.9175257731958762, |
|
"grad_norm": 0.06351147592067719, |
|
"learning_rate": 0.0003095785440613027, |
|
"loss": 0.9098, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.9175257731958762, |
|
"eval_loss": 1.038271427154541, |
|
"eval_runtime": 10.5167, |
|
"eval_samples_per_second": 11.125, |
|
"eval_steps_per_second": 0.761, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.9278350515463918, |
|
"grad_norm": 0.062183577567338943, |
|
"learning_rate": 0.00030804597701149423, |
|
"loss": 0.9144, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.9278350515463918, |
|
"eval_loss": 1.037765383720398, |
|
"eval_runtime": 10.5379, |
|
"eval_samples_per_second": 11.103, |
|
"eval_steps_per_second": 0.759, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.9381443298969072, |
|
"grad_norm": 0.062193334102630615, |
|
"learning_rate": 0.00030651340996168587, |
|
"loss": 0.9341, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.9381443298969072, |
|
"eval_loss": 1.0370410680770874, |
|
"eval_runtime": 10.4755, |
|
"eval_samples_per_second": 11.169, |
|
"eval_steps_per_second": 0.764, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.9484536082474226, |
|
"grad_norm": 0.06498893350362778, |
|
"learning_rate": 0.0003049808429118774, |
|
"loss": 0.8414, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.9484536082474226, |
|
"eval_loss": 1.0365617275238037, |
|
"eval_runtime": 10.5155, |
|
"eval_samples_per_second": 11.126, |
|
"eval_steps_per_second": 0.761, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.9587628865979382, |
|
"grad_norm": 0.06492085754871368, |
|
"learning_rate": 0.00030344827586206897, |
|
"loss": 0.9023, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.9587628865979382, |
|
"eval_loss": 1.0356723070144653, |
|
"eval_runtime": 10.5274, |
|
"eval_samples_per_second": 11.114, |
|
"eval_steps_per_second": 0.76, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.9690721649484536, |
|
"grad_norm": 0.06767349690198898, |
|
"learning_rate": 0.00030191570881226055, |
|
"loss": 0.949, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.9690721649484536, |
|
"eval_loss": 1.0351808071136475, |
|
"eval_runtime": 10.516, |
|
"eval_samples_per_second": 11.126, |
|
"eval_steps_per_second": 0.761, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.979381443298969, |
|
"grad_norm": 0.0619075745344162, |
|
"learning_rate": 0.00030038314176245213, |
|
"loss": 0.8979, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.979381443298969, |
|
"eval_loss": 1.0349719524383545, |
|
"eval_runtime": 10.5134, |
|
"eval_samples_per_second": 11.129, |
|
"eval_steps_per_second": 0.761, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.9896907216494846, |
|
"grad_norm": 0.06217743828892708, |
|
"learning_rate": 0.00029885057471264366, |
|
"loss": 0.9299, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.9896907216494846, |
|
"eval_loss": 1.034669280052185, |
|
"eval_runtime": 10.469, |
|
"eval_samples_per_second": 11.176, |
|
"eval_steps_per_second": 0.764, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.10211849957704544, |
|
"learning_rate": 0.0002973180076628353, |
|
"loss": 0.8179, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.0358762741088867, |
|
"eval_runtime": 10.4811, |
|
"eval_samples_per_second": 11.163, |
|
"eval_steps_per_second": 0.763, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 1.0103092783505154, |
|
"grad_norm": 0.06400294601917267, |
|
"learning_rate": 0.0002957854406130268, |
|
"loss": 0.9005, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.0103092783505154, |
|
"eval_loss": 1.0370227098464966, |
|
"eval_runtime": 10.5474, |
|
"eval_samples_per_second": 11.093, |
|
"eval_steps_per_second": 0.758, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.0206185567010309, |
|
"grad_norm": 0.06539759784936905, |
|
"learning_rate": 0.0002942528735632184, |
|
"loss": 0.8716, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.0206185567010309, |
|
"eval_loss": 1.0371029376983643, |
|
"eval_runtime": 10.546, |
|
"eval_samples_per_second": 11.094, |
|
"eval_steps_per_second": 0.759, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.0309278350515463, |
|
"grad_norm": 0.06311202794313431, |
|
"learning_rate": 0.00029272030651341, |
|
"loss": 0.8755, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.0309278350515463, |
|
"eval_loss": 1.0364102125167847, |
|
"eval_runtime": 10.5579, |
|
"eval_samples_per_second": 11.082, |
|
"eval_steps_per_second": 0.758, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.041237113402062, |
|
"grad_norm": 0.065621517598629, |
|
"learning_rate": 0.00029118773946360156, |
|
"loss": 0.8771, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 1.041237113402062, |
|
"eval_loss": 1.0352747440338135, |
|
"eval_runtime": 10.4611, |
|
"eval_samples_per_second": 11.184, |
|
"eval_steps_per_second": 0.765, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 1.0515463917525774, |
|
"grad_norm": 0.05740601569414139, |
|
"learning_rate": 0.00028965517241379314, |
|
"loss": 0.8737, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.0515463917525774, |
|
"eval_loss": 1.034127950668335, |
|
"eval_runtime": 10.5261, |
|
"eval_samples_per_second": 11.115, |
|
"eval_steps_per_second": 0.76, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.0618556701030928, |
|
"grad_norm": 0.06578749418258667, |
|
"learning_rate": 0.00028812260536398467, |
|
"loss": 0.9053, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 1.0618556701030928, |
|
"eval_loss": 1.033166527748108, |
|
"eval_runtime": 10.4272, |
|
"eval_samples_per_second": 11.221, |
|
"eval_steps_per_second": 0.767, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 1.0721649484536082, |
|
"grad_norm": 0.06333158165216446, |
|
"learning_rate": 0.0002865900383141763, |
|
"loss": 0.898, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.0721649484536082, |
|
"eval_loss": 1.0324461460113525, |
|
"eval_runtime": 10.459, |
|
"eval_samples_per_second": 11.187, |
|
"eval_steps_per_second": 0.765, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.0824742268041236, |
|
"grad_norm": 0.06329383701086044, |
|
"learning_rate": 0.0002850574712643678, |
|
"loss": 0.8933, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.0824742268041236, |
|
"eval_loss": 1.0319868326187134, |
|
"eval_runtime": 10.501, |
|
"eval_samples_per_second": 11.142, |
|
"eval_steps_per_second": 0.762, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.0927835051546393, |
|
"grad_norm": 0.06855228543281555, |
|
"learning_rate": 0.0002835249042145594, |
|
"loss": 0.8557, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.0927835051546393, |
|
"eval_loss": 1.0321075916290283, |
|
"eval_runtime": 10.4503, |
|
"eval_samples_per_second": 11.196, |
|
"eval_steps_per_second": 0.766, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.1030927835051547, |
|
"grad_norm": 0.06301665306091309, |
|
"learning_rate": 0.00028199233716475093, |
|
"loss": 0.8766, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.1030927835051547, |
|
"eval_loss": 1.0322858095169067, |
|
"eval_runtime": 10.4921, |
|
"eval_samples_per_second": 11.151, |
|
"eval_steps_per_second": 0.762, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.1134020618556701, |
|
"grad_norm": 0.07938443869352341, |
|
"learning_rate": 0.00028045977011494257, |
|
"loss": 0.8659, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.1134020618556701, |
|
"eval_loss": 1.03269362449646, |
|
"eval_runtime": 10.4695, |
|
"eval_samples_per_second": 11.175, |
|
"eval_steps_per_second": 0.764, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.1237113402061856, |
|
"grad_norm": 0.06885319948196411, |
|
"learning_rate": 0.0002789272030651341, |
|
"loss": 0.854, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.1237113402061856, |
|
"eval_loss": 1.0330955982208252, |
|
"eval_runtime": 10.538, |
|
"eval_samples_per_second": 11.103, |
|
"eval_steps_per_second": 0.759, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.134020618556701, |
|
"grad_norm": 0.06268304586410522, |
|
"learning_rate": 0.00027739463601532567, |
|
"loss": 0.9008, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.134020618556701, |
|
"eval_loss": 1.0327825546264648, |
|
"eval_runtime": 10.4863, |
|
"eval_samples_per_second": 11.157, |
|
"eval_steps_per_second": 0.763, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.1443298969072164, |
|
"grad_norm": 0.07182294875383377, |
|
"learning_rate": 0.00027586206896551725, |
|
"loss": 0.8676, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.1443298969072164, |
|
"eval_loss": 1.03208327293396, |
|
"eval_runtime": 10.5334, |
|
"eval_samples_per_second": 11.108, |
|
"eval_steps_per_second": 0.759, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.1546391752577319, |
|
"grad_norm": 0.07009400427341461, |
|
"learning_rate": 0.00027432950191570883, |
|
"loss": 0.8566, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.1546391752577319, |
|
"eval_loss": 1.0315619707107544, |
|
"eval_runtime": 10.5125, |
|
"eval_samples_per_second": 11.13, |
|
"eval_steps_per_second": 0.761, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.1649484536082475, |
|
"grad_norm": 0.06975871324539185, |
|
"learning_rate": 0.0002727969348659004, |
|
"loss": 0.8413, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.1649484536082475, |
|
"eval_loss": 1.0310561656951904, |
|
"eval_runtime": 10.4956, |
|
"eval_samples_per_second": 11.148, |
|
"eval_steps_per_second": 0.762, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.175257731958763, |
|
"grad_norm": 0.06924448907375336, |
|
"learning_rate": 0.00027126436781609194, |
|
"loss": 0.912, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.175257731958763, |
|
"eval_loss": 1.0305217504501343, |
|
"eval_runtime": 10.5285, |
|
"eval_samples_per_second": 11.113, |
|
"eval_steps_per_second": 0.76, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.1855670103092784, |
|
"grad_norm": 0.0842827558517456, |
|
"learning_rate": 0.0002697318007662836, |
|
"loss": 0.8783, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.1855670103092784, |
|
"eval_loss": 1.0306683778762817, |
|
"eval_runtime": 10.5854, |
|
"eval_samples_per_second": 11.053, |
|
"eval_steps_per_second": 0.756, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.1958762886597938, |
|
"grad_norm": 0.0670529305934906, |
|
"learning_rate": 0.0002681992337164751, |
|
"loss": 0.8553, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.1958762886597938, |
|
"eval_loss": 1.0308029651641846, |
|
"eval_runtime": 10.5086, |
|
"eval_samples_per_second": 11.134, |
|
"eval_steps_per_second": 0.761, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.2061855670103092, |
|
"grad_norm": 0.07547353953123093, |
|
"learning_rate": 0.0002666666666666667, |
|
"loss": 0.85, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.2061855670103092, |
|
"eval_loss": 1.0305339097976685, |
|
"eval_runtime": 10.4633, |
|
"eval_samples_per_second": 11.182, |
|
"eval_steps_per_second": 0.765, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.2164948453608249, |
|
"grad_norm": 0.0778818428516388, |
|
"learning_rate": 0.00026513409961685826, |
|
"loss": 0.9108, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.2164948453608249, |
|
"eval_loss": 1.0304737091064453, |
|
"eval_runtime": 10.4445, |
|
"eval_samples_per_second": 11.202, |
|
"eval_steps_per_second": 0.766, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.2268041237113403, |
|
"grad_norm": 0.07250794023275375, |
|
"learning_rate": 0.00026360153256704984, |
|
"loss": 0.9108, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.2268041237113403, |
|
"eval_loss": 1.029858946800232, |
|
"eval_runtime": 10.539, |
|
"eval_samples_per_second": 11.102, |
|
"eval_steps_per_second": 0.759, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.2371134020618557, |
|
"grad_norm": 0.07320624589920044, |
|
"learning_rate": 0.00026206896551724137, |
|
"loss": 0.862, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.2371134020618557, |
|
"eval_loss": 1.029157280921936, |
|
"eval_runtime": 10.5256, |
|
"eval_samples_per_second": 11.116, |
|
"eval_steps_per_second": 0.76, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.2474226804123711, |
|
"grad_norm": 0.08917060494422913, |
|
"learning_rate": 0.00026053639846743295, |
|
"loss": 0.8968, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.2474226804123711, |
|
"eval_loss": 1.028914451599121, |
|
"eval_runtime": 10.486, |
|
"eval_samples_per_second": 11.158, |
|
"eval_steps_per_second": 0.763, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.2577319587628866, |
|
"grad_norm": 0.08569680899381638, |
|
"learning_rate": 0.0002590038314176245, |
|
"loss": 0.8094, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.2577319587628866, |
|
"eval_loss": 1.028579831123352, |
|
"eval_runtime": 10.5652, |
|
"eval_samples_per_second": 11.074, |
|
"eval_steps_per_second": 0.757, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.268041237113402, |
|
"grad_norm": 0.08444651961326599, |
|
"learning_rate": 0.0002574712643678161, |
|
"loss": 0.935, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.268041237113402, |
|
"eval_loss": 1.0277479887008667, |
|
"eval_runtime": 10.4997, |
|
"eval_samples_per_second": 11.143, |
|
"eval_steps_per_second": 0.762, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.2783505154639174, |
|
"grad_norm": 0.07106074690818787, |
|
"learning_rate": 0.00025593869731800763, |
|
"loss": 0.8813, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.2783505154639174, |
|
"eval_loss": 1.027204155921936, |
|
"eval_runtime": 10.503, |
|
"eval_samples_per_second": 11.14, |
|
"eval_steps_per_second": 0.762, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.2886597938144329, |
|
"grad_norm": 0.06486649811267853, |
|
"learning_rate": 0.00025440613026819927, |
|
"loss": 0.8593, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.2886597938144329, |
|
"eval_loss": 1.0267640352249146, |
|
"eval_runtime": 10.5616, |
|
"eval_samples_per_second": 11.078, |
|
"eval_steps_per_second": 0.757, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.2989690721649485, |
|
"grad_norm": 0.0726565569639206, |
|
"learning_rate": 0.0002528735632183908, |
|
"loss": 0.846, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.2989690721649485, |
|
"eval_loss": 1.026572585105896, |
|
"eval_runtime": 10.5811, |
|
"eval_samples_per_second": 11.057, |
|
"eval_steps_per_second": 0.756, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.309278350515464, |
|
"grad_norm": 0.0767112746834755, |
|
"learning_rate": 0.00025134099616858237, |
|
"loss": 0.8773, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.309278350515464, |
|
"eval_loss": 1.0265377759933472, |
|
"eval_runtime": 10.4421, |
|
"eval_samples_per_second": 11.205, |
|
"eval_steps_per_second": 0.766, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.3195876288659794, |
|
"grad_norm": 0.07738649845123291, |
|
"learning_rate": 0.00024980842911877395, |
|
"loss": 0.8903, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.3195876288659794, |
|
"eval_loss": 1.0261329412460327, |
|
"eval_runtime": 10.5383, |
|
"eval_samples_per_second": 11.102, |
|
"eval_steps_per_second": 0.759, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.3298969072164948, |
|
"grad_norm": 0.07571049779653549, |
|
"learning_rate": 0.00024827586206896553, |
|
"loss": 0.83, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.3298969072164948, |
|
"eval_loss": 1.0259356498718262, |
|
"eval_runtime": 10.478, |
|
"eval_samples_per_second": 11.166, |
|
"eval_steps_per_second": 0.764, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.3402061855670104, |
|
"grad_norm": 0.07176312804222107, |
|
"learning_rate": 0.0002467432950191571, |
|
"loss": 0.8803, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.3402061855670104, |
|
"eval_loss": 1.026039958000183, |
|
"eval_runtime": 10.4697, |
|
"eval_samples_per_second": 11.175, |
|
"eval_steps_per_second": 0.764, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.3505154639175259, |
|
"grad_norm": 0.08636844158172607, |
|
"learning_rate": 0.00024521072796934864, |
|
"loss": 0.8575, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.3505154639175259, |
|
"eval_loss": 1.0265427827835083, |
|
"eval_runtime": 10.5172, |
|
"eval_samples_per_second": 11.125, |
|
"eval_steps_per_second": 0.761, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.3608247422680413, |
|
"grad_norm": 0.09010273963212967, |
|
"learning_rate": 0.00024367816091954025, |
|
"loss": 0.8508, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.3608247422680413, |
|
"eval_loss": 1.0265592336654663, |
|
"eval_runtime": 10.4674, |
|
"eval_samples_per_second": 11.178, |
|
"eval_steps_per_second": 0.764, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.3711340206185567, |
|
"grad_norm": 0.07448781281709671, |
|
"learning_rate": 0.0002421455938697318, |
|
"loss": 0.9214, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.3711340206185567, |
|
"eval_loss": 1.0265872478485107, |
|
"eval_runtime": 10.5238, |
|
"eval_samples_per_second": 11.118, |
|
"eval_steps_per_second": 0.76, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.3814432989690721, |
|
"grad_norm": 0.07683249562978745, |
|
"learning_rate": 0.00024061302681992338, |
|
"loss": 0.9312, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.3814432989690721, |
|
"eval_loss": 1.0259978771209717, |
|
"eval_runtime": 10.5509, |
|
"eval_samples_per_second": 11.089, |
|
"eval_steps_per_second": 0.758, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.3917525773195876, |
|
"grad_norm": 0.07399426400661469, |
|
"learning_rate": 0.00023908045977011496, |
|
"loss": 0.8931, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.3917525773195876, |
|
"eval_loss": 1.025506615638733, |
|
"eval_runtime": 10.4312, |
|
"eval_samples_per_second": 11.216, |
|
"eval_steps_per_second": 0.767, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.402061855670103, |
|
"grad_norm": 0.0796341821551323, |
|
"learning_rate": 0.00023754789272030654, |
|
"loss": 0.8355, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.402061855670103, |
|
"eval_loss": 1.0256001949310303, |
|
"eval_runtime": 10.4944, |
|
"eval_samples_per_second": 11.149, |
|
"eval_steps_per_second": 0.762, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.4123711340206184, |
|
"grad_norm": 0.08176424354314804, |
|
"learning_rate": 0.0002360153256704981, |
|
"loss": 0.8115, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.4123711340206184, |
|
"eval_loss": 1.0265989303588867, |
|
"eval_runtime": 10.5817, |
|
"eval_samples_per_second": 11.057, |
|
"eval_steps_per_second": 0.756, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.422680412371134, |
|
"grad_norm": 0.08126474916934967, |
|
"learning_rate": 0.00023448275862068965, |
|
"loss": 0.8775, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.422680412371134, |
|
"eval_loss": 1.0275684595108032, |
|
"eval_runtime": 10.4427, |
|
"eval_samples_per_second": 11.204, |
|
"eval_steps_per_second": 0.766, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.4329896907216495, |
|
"grad_norm": 0.07753143459558487, |
|
"learning_rate": 0.00023295019157088125, |
|
"loss": 0.8226, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.4329896907216495, |
|
"eval_loss": 1.0283759832382202, |
|
"eval_runtime": 10.4973, |
|
"eval_samples_per_second": 11.146, |
|
"eval_steps_per_second": 0.762, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.443298969072165, |
|
"grad_norm": 0.08406826108694077, |
|
"learning_rate": 0.0002314176245210728, |
|
"loss": 0.8984, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.443298969072165, |
|
"eval_loss": 1.0285860300064087, |
|
"eval_runtime": 10.5756, |
|
"eval_samples_per_second": 11.063, |
|
"eval_steps_per_second": 0.756, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.4536082474226804, |
|
"grad_norm": 0.0795009657740593, |
|
"learning_rate": 0.00022988505747126436, |
|
"loss": 0.8695, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.4536082474226804, |
|
"eval_loss": 1.0274338722229004, |
|
"eval_runtime": 10.5603, |
|
"eval_samples_per_second": 11.079, |
|
"eval_steps_per_second": 0.758, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.463917525773196, |
|
"grad_norm": 0.08259198069572449, |
|
"learning_rate": 0.00022835249042145597, |
|
"loss": 0.7941, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.463917525773196, |
|
"eval_loss": 1.026221752166748, |
|
"eval_runtime": 10.5319, |
|
"eval_samples_per_second": 11.109, |
|
"eval_steps_per_second": 0.76, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.4742268041237114, |
|
"grad_norm": 0.07429313659667969, |
|
"learning_rate": 0.00022681992337164752, |
|
"loss": 0.8434, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.4742268041237114, |
|
"eval_loss": 1.0248271226882935, |
|
"eval_runtime": 10.4616, |
|
"eval_samples_per_second": 11.184, |
|
"eval_steps_per_second": 0.765, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.4845360824742269, |
|
"grad_norm": 0.07662954181432724, |
|
"learning_rate": 0.00022528735632183907, |
|
"loss": 0.9194, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.4845360824742269, |
|
"eval_loss": 1.0240087509155273, |
|
"eval_runtime": 10.4446, |
|
"eval_samples_per_second": 11.202, |
|
"eval_steps_per_second": 0.766, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.4948453608247423, |
|
"grad_norm": 0.08005265891551971, |
|
"learning_rate": 0.00022375478927203065, |
|
"loss": 0.8534, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.4948453608247423, |
|
"eval_loss": 1.023663878440857, |
|
"eval_runtime": 10.4767, |
|
"eval_samples_per_second": 11.168, |
|
"eval_steps_per_second": 0.764, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.5051546391752577, |
|
"grad_norm": 0.071219302713871, |
|
"learning_rate": 0.00022222222222222223, |
|
"loss": 0.8064, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.5051546391752577, |
|
"eval_loss": 1.0234168767929077, |
|
"eval_runtime": 10.5009, |
|
"eval_samples_per_second": 11.142, |
|
"eval_steps_per_second": 0.762, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.5154639175257731, |
|
"grad_norm": 0.08719812333583832, |
|
"learning_rate": 0.0002206896551724138, |
|
"loss": 0.8327, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.5154639175257731, |
|
"eval_loss": 1.023542046546936, |
|
"eval_runtime": 10.5145, |
|
"eval_samples_per_second": 11.128, |
|
"eval_steps_per_second": 0.761, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.5257731958762886, |
|
"grad_norm": 0.08257601410150528, |
|
"learning_rate": 0.00021915708812260537, |
|
"loss": 0.8003, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.5257731958762886, |
|
"eval_loss": 1.023552417755127, |
|
"eval_runtime": 10.4934, |
|
"eval_samples_per_second": 11.15, |
|
"eval_steps_per_second": 0.762, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.536082474226804, |
|
"grad_norm": 0.08228150010108948, |
|
"learning_rate": 0.00021762452107279697, |
|
"loss": 0.7964, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.536082474226804, |
|
"eval_loss": 1.0236713886260986, |
|
"eval_runtime": 10.4883, |
|
"eval_samples_per_second": 11.155, |
|
"eval_steps_per_second": 0.763, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.5463917525773194, |
|
"grad_norm": 0.07739076763391495, |
|
"learning_rate": 0.00021609195402298853, |
|
"loss": 0.8605, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.5463917525773194, |
|
"eval_loss": 1.0236490964889526, |
|
"eval_runtime": 10.5398, |
|
"eval_samples_per_second": 11.101, |
|
"eval_steps_per_second": 0.759, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.556701030927835, |
|
"grad_norm": 0.08219290524721146, |
|
"learning_rate": 0.00021455938697318008, |
|
"loss": 0.8106, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.556701030927835, |
|
"eval_loss": 1.023714542388916, |
|
"eval_runtime": 10.5173, |
|
"eval_samples_per_second": 11.124, |
|
"eval_steps_per_second": 0.761, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.5670103092783505, |
|
"grad_norm": 0.08360739797353745, |
|
"learning_rate": 0.0002130268199233717, |
|
"loss": 0.8464, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.5670103092783505, |
|
"eval_loss": 1.023202657699585, |
|
"eval_runtime": 10.5109, |
|
"eval_samples_per_second": 11.131, |
|
"eval_steps_per_second": 0.761, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.577319587628866, |
|
"grad_norm": 0.07956969738006592, |
|
"learning_rate": 0.00021149425287356324, |
|
"loss": 0.841, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.577319587628866, |
|
"eval_loss": 1.022517204284668, |
|
"eval_runtime": 10.5317, |
|
"eval_samples_per_second": 11.109, |
|
"eval_steps_per_second": 0.76, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.5876288659793816, |
|
"grad_norm": 0.07877441495656967, |
|
"learning_rate": 0.0002099616858237548, |
|
"loss": 0.8296, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.5876288659793816, |
|
"eval_loss": 1.0217140913009644, |
|
"eval_runtime": 10.4592, |
|
"eval_samples_per_second": 11.186, |
|
"eval_steps_per_second": 0.765, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.597938144329897, |
|
"grad_norm": 0.07251220941543579, |
|
"learning_rate": 0.00020842911877394635, |
|
"loss": 0.8209, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.597938144329897, |
|
"eval_loss": 1.0212225914001465, |
|
"eval_runtime": 10.4313, |
|
"eval_samples_per_second": 11.216, |
|
"eval_steps_per_second": 0.767, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.6082474226804124, |
|
"grad_norm": 0.0771518275141716, |
|
"learning_rate": 0.00020689655172413795, |
|
"loss": 0.8744, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.6082474226804124, |
|
"eval_loss": 1.0210092067718506, |
|
"eval_runtime": 10.4307, |
|
"eval_samples_per_second": 11.217, |
|
"eval_steps_per_second": 0.767, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.6185567010309279, |
|
"grad_norm": 0.08237840235233307, |
|
"learning_rate": 0.0002053639846743295, |
|
"loss": 0.8352, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.6185567010309279, |
|
"eval_loss": 1.0209413766860962, |
|
"eval_runtime": 10.517, |
|
"eval_samples_per_second": 11.125, |
|
"eval_steps_per_second": 0.761, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.6288659793814433, |
|
"grad_norm": 0.07783687859773636, |
|
"learning_rate": 0.00020383141762452106, |
|
"loss": 0.8587, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.6288659793814433, |
|
"eval_loss": 1.0206629037857056, |
|
"eval_runtime": 10.4983, |
|
"eval_samples_per_second": 11.145, |
|
"eval_steps_per_second": 0.762, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.6391752577319587, |
|
"grad_norm": 0.0869293212890625, |
|
"learning_rate": 0.00020229885057471267, |
|
"loss": 0.8434, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.6391752577319587, |
|
"eval_loss": 1.0204321146011353, |
|
"eval_runtime": 10.4141, |
|
"eval_samples_per_second": 11.235, |
|
"eval_steps_per_second": 0.768, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.6494845360824741, |
|
"grad_norm": 0.07775431871414185, |
|
"learning_rate": 0.00020076628352490422, |
|
"loss": 0.8314, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.6494845360824741, |
|
"eval_loss": 1.020195484161377, |
|
"eval_runtime": 10.5726, |
|
"eval_samples_per_second": 11.066, |
|
"eval_steps_per_second": 0.757, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.6597938144329896, |
|
"grad_norm": 0.10192904621362686, |
|
"learning_rate": 0.0001992337164750958, |
|
"loss": 0.8234, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.6597938144329896, |
|
"eval_loss": 1.0198768377304077, |
|
"eval_runtime": 10.4383, |
|
"eval_samples_per_second": 11.209, |
|
"eval_steps_per_second": 0.766, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.670103092783505, |
|
"grad_norm": 0.0781380832195282, |
|
"learning_rate": 0.00019770114942528738, |
|
"loss": 0.8513, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.670103092783505, |
|
"eval_loss": 1.0197436809539795, |
|
"eval_runtime": 10.4615, |
|
"eval_samples_per_second": 11.184, |
|
"eval_steps_per_second": 0.765, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.6804123711340206, |
|
"grad_norm": 0.0860995426774025, |
|
"learning_rate": 0.00019616858237547893, |
|
"loss": 0.8451, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.6804123711340206, |
|
"eval_loss": 1.0204094648361206, |
|
"eval_runtime": 10.4828, |
|
"eval_samples_per_second": 11.161, |
|
"eval_steps_per_second": 0.763, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.690721649484536, |
|
"grad_norm": 0.06817477196455002, |
|
"learning_rate": 0.0001946360153256705, |
|
"loss": 0.8414, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.690721649484536, |
|
"eval_loss": 1.0212379693984985, |
|
"eval_runtime": 10.4766, |
|
"eval_samples_per_second": 11.168, |
|
"eval_steps_per_second": 0.764, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.7010309278350515, |
|
"grad_norm": 0.07095003128051758, |
|
"learning_rate": 0.0001931034482758621, |
|
"loss": 0.8086, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.7010309278350515, |
|
"eval_loss": 1.0223617553710938, |
|
"eval_runtime": 10.5911, |
|
"eval_samples_per_second": 11.047, |
|
"eval_steps_per_second": 0.755, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.7113402061855671, |
|
"grad_norm": 0.0841984674334526, |
|
"learning_rate": 0.00019157088122605365, |
|
"loss": 0.8711, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.7113402061855671, |
|
"eval_loss": 1.0229341983795166, |
|
"eval_runtime": 10.4475, |
|
"eval_samples_per_second": 11.199, |
|
"eval_steps_per_second": 0.766, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.7216494845360826, |
|
"grad_norm": 0.08508795499801636, |
|
"learning_rate": 0.00019003831417624523, |
|
"loss": 0.8881, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.7216494845360826, |
|
"eval_loss": 1.0217504501342773, |
|
"eval_runtime": 10.5074, |
|
"eval_samples_per_second": 11.135, |
|
"eval_steps_per_second": 0.761, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.731958762886598, |
|
"grad_norm": 0.07691670209169388, |
|
"learning_rate": 0.00018850574712643678, |
|
"loss": 0.8287, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.731958762886598, |
|
"eval_loss": 1.0207098722457886, |
|
"eval_runtime": 10.4614, |
|
"eval_samples_per_second": 11.184, |
|
"eval_steps_per_second": 0.765, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.7422680412371134, |
|
"grad_norm": 0.08198527246713638, |
|
"learning_rate": 0.00018697318007662836, |
|
"loss": 0.8492, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.7422680412371134, |
|
"eval_loss": 1.0198174715042114, |
|
"eval_runtime": 10.4728, |
|
"eval_samples_per_second": 11.172, |
|
"eval_steps_per_second": 0.764, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.7525773195876289, |
|
"grad_norm": 0.08822525292634964, |
|
"learning_rate": 0.00018544061302681994, |
|
"loss": 0.9003, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.7525773195876289, |
|
"eval_loss": 1.0188552141189575, |
|
"eval_runtime": 10.5394, |
|
"eval_samples_per_second": 11.101, |
|
"eval_steps_per_second": 0.759, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.7628865979381443, |
|
"grad_norm": 0.08640007674694061, |
|
"learning_rate": 0.0001839080459770115, |
|
"loss": 0.7914, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.7628865979381443, |
|
"eval_loss": 1.0183426141738892, |
|
"eval_runtime": 10.4029, |
|
"eval_samples_per_second": 11.247, |
|
"eval_steps_per_second": 0.769, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.7731958762886597, |
|
"grad_norm": 0.07736896723508835, |
|
"learning_rate": 0.00018237547892720307, |
|
"loss": 0.8571, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.7731958762886597, |
|
"eval_loss": 1.0179463624954224, |
|
"eval_runtime": 10.5066, |
|
"eval_samples_per_second": 11.136, |
|
"eval_steps_per_second": 0.761, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.7835051546391751, |
|
"grad_norm": 0.07718931138515472, |
|
"learning_rate": 0.00018084291187739463, |
|
"loss": 0.8601, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.7835051546391751, |
|
"eval_loss": 1.0176079273223877, |
|
"eval_runtime": 10.548, |
|
"eval_samples_per_second": 11.092, |
|
"eval_steps_per_second": 0.758, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.7938144329896906, |
|
"grad_norm": 0.07540721446275711, |
|
"learning_rate": 0.0001793103448275862, |
|
"loss": 0.8815, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.7938144329896906, |
|
"eval_loss": 1.0175734758377075, |
|
"eval_runtime": 10.4877, |
|
"eval_samples_per_second": 11.156, |
|
"eval_steps_per_second": 0.763, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.8041237113402062, |
|
"grad_norm": 0.0823512077331543, |
|
"learning_rate": 0.00017777777777777779, |
|
"loss": 0.8395, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.8041237113402062, |
|
"eval_loss": 1.0178968906402588, |
|
"eval_runtime": 10.5806, |
|
"eval_samples_per_second": 11.058, |
|
"eval_steps_per_second": 0.756, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.8144329896907216, |
|
"grad_norm": 0.08408434689044952, |
|
"learning_rate": 0.00017624521072796937, |
|
"loss": 0.847, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.8144329896907216, |
|
"eval_loss": 1.0186214447021484, |
|
"eval_runtime": 10.5083, |
|
"eval_samples_per_second": 11.134, |
|
"eval_steps_per_second": 0.761, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.824742268041237, |
|
"grad_norm": 0.08428458124399185, |
|
"learning_rate": 0.00017471264367816095, |
|
"loss": 0.8796, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.824742268041237, |
|
"eval_loss": 1.0196959972381592, |
|
"eval_runtime": 10.6262, |
|
"eval_samples_per_second": 11.01, |
|
"eval_steps_per_second": 0.753, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.8350515463917527, |
|
"grad_norm": 0.08260627090930939, |
|
"learning_rate": 0.0001731800766283525, |
|
"loss": 0.8258, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.8350515463917527, |
|
"eval_loss": 1.0202312469482422, |
|
"eval_runtime": 10.4706, |
|
"eval_samples_per_second": 11.174, |
|
"eval_steps_per_second": 0.764, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.8453608247422681, |
|
"grad_norm": 0.0770183727145195, |
|
"learning_rate": 0.00017164750957854408, |
|
"loss": 0.8404, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.8453608247422681, |
|
"eval_loss": 1.020593285560608, |
|
"eval_runtime": 10.4834, |
|
"eval_samples_per_second": 11.161, |
|
"eval_steps_per_second": 0.763, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.8556701030927836, |
|
"grad_norm": 0.09574954211711884, |
|
"learning_rate": 0.00017011494252873563, |
|
"loss": 0.8294, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.8556701030927836, |
|
"eval_loss": 1.0196754932403564, |
|
"eval_runtime": 10.5369, |
|
"eval_samples_per_second": 11.104, |
|
"eval_steps_per_second": 0.759, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.865979381443299, |
|
"grad_norm": 0.08152729272842407, |
|
"learning_rate": 0.0001685823754789272, |
|
"loss": 0.835, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.865979381443299, |
|
"eval_loss": 1.01894211769104, |
|
"eval_runtime": 10.4761, |
|
"eval_samples_per_second": 11.168, |
|
"eval_steps_per_second": 0.764, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.8762886597938144, |
|
"grad_norm": 0.08013647794723511, |
|
"learning_rate": 0.0001670498084291188, |
|
"loss": 0.8793, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.8762886597938144, |
|
"eval_loss": 1.018194556236267, |
|
"eval_runtime": 10.5238, |
|
"eval_samples_per_second": 11.118, |
|
"eval_steps_per_second": 0.76, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.8865979381443299, |
|
"grad_norm": 0.0803917795419693, |
|
"learning_rate": 0.00016551724137931035, |
|
"loss": 0.8507, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.8865979381443299, |
|
"eval_loss": 1.0174652338027954, |
|
"eval_runtime": 10.4694, |
|
"eval_samples_per_second": 11.175, |
|
"eval_steps_per_second": 0.764, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.8969072164948453, |
|
"grad_norm": 0.07526881247758865, |
|
"learning_rate": 0.00016398467432950193, |
|
"loss": 0.8808, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.8969072164948453, |
|
"eval_loss": 1.016908049583435, |
|
"eval_runtime": 10.5685, |
|
"eval_samples_per_second": 11.071, |
|
"eval_steps_per_second": 0.757, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.9072164948453607, |
|
"grad_norm": 0.08013435453176498, |
|
"learning_rate": 0.00016245210727969348, |
|
"loss": 0.8529, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.9072164948453607, |
|
"eval_loss": 1.0165438652038574, |
|
"eval_runtime": 10.5233, |
|
"eval_samples_per_second": 11.118, |
|
"eval_steps_per_second": 0.76, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.9175257731958761, |
|
"grad_norm": 0.08760453760623932, |
|
"learning_rate": 0.00016091954022988506, |
|
"loss": 0.83, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.9175257731958761, |
|
"eval_loss": 1.016537070274353, |
|
"eval_runtime": 10.4563, |
|
"eval_samples_per_second": 11.189, |
|
"eval_steps_per_second": 0.765, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.9278350515463918, |
|
"grad_norm": 0.08273079246282578, |
|
"learning_rate": 0.00015938697318007664, |
|
"loss": 0.7931, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.9278350515463918, |
|
"eval_loss": 1.0166338682174683, |
|
"eval_runtime": 10.5013, |
|
"eval_samples_per_second": 11.141, |
|
"eval_steps_per_second": 0.762, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.9381443298969072, |
|
"grad_norm": 0.0865551233291626, |
|
"learning_rate": 0.00015785440613026822, |
|
"loss": 0.815, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.9381443298969072, |
|
"eval_loss": 1.0168683528900146, |
|
"eval_runtime": 10.4813, |
|
"eval_samples_per_second": 11.163, |
|
"eval_steps_per_second": 0.763, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.9484536082474226, |
|
"grad_norm": 0.08994690328836441, |
|
"learning_rate": 0.0001563218390804598, |
|
"loss": 0.8336, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.9484536082474226, |
|
"eval_loss": 1.0170217752456665, |
|
"eval_runtime": 10.5315, |
|
"eval_samples_per_second": 11.11, |
|
"eval_steps_per_second": 0.76, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.9587628865979383, |
|
"grad_norm": 0.09116367995738983, |
|
"learning_rate": 0.00015478927203065135, |
|
"loss": 0.7624, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.9587628865979383, |
|
"eval_loss": 1.0173101425170898, |
|
"eval_runtime": 10.4893, |
|
"eval_samples_per_second": 11.154, |
|
"eval_steps_per_second": 0.763, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.9690721649484537, |
|
"grad_norm": 0.07905647903680801, |
|
"learning_rate": 0.00015325670498084293, |
|
"loss": 0.8093, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.9690721649484537, |
|
"eval_loss": 1.0177738666534424, |
|
"eval_runtime": 10.5444, |
|
"eval_samples_per_second": 11.096, |
|
"eval_steps_per_second": 0.759, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.9793814432989691, |
|
"grad_norm": 0.08135747909545898, |
|
"learning_rate": 0.00015172413793103449, |
|
"loss": 0.8166, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.9793814432989691, |
|
"eval_loss": 1.018350601196289, |
|
"eval_runtime": 10.5429, |
|
"eval_samples_per_second": 11.097, |
|
"eval_steps_per_second": 0.759, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.9896907216494846, |
|
"grad_norm": 0.08733044564723969, |
|
"learning_rate": 0.00015019157088122607, |
|
"loss": 0.7852, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.9896907216494846, |
|
"eval_loss": 1.0188955068588257, |
|
"eval_runtime": 10.5234, |
|
"eval_samples_per_second": 11.118, |
|
"eval_steps_per_second": 0.76, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.14572270214557648, |
|
"learning_rate": 0.00014865900383141765, |
|
"loss": 0.9301, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.019044280052185, |
|
"eval_runtime": 10.459, |
|
"eval_samples_per_second": 11.187, |
|
"eval_steps_per_second": 0.765, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 2.0103092783505154, |
|
"grad_norm": 0.0840727686882019, |
|
"learning_rate": 0.0001471264367816092, |
|
"loss": 0.7462, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 2.0103092783505154, |
|
"eval_loss": 1.0192935466766357, |
|
"eval_runtime": 10.486, |
|
"eval_samples_per_second": 11.158, |
|
"eval_steps_per_second": 0.763, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 2.020618556701031, |
|
"grad_norm": 0.08838547766208649, |
|
"learning_rate": 0.00014559386973180078, |
|
"loss": 0.8009, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 2.020618556701031, |
|
"eval_loss": 1.0194281339645386, |
|
"eval_runtime": 10.4944, |
|
"eval_samples_per_second": 11.149, |
|
"eval_steps_per_second": 0.762, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 2.0309278350515463, |
|
"grad_norm": 0.07864759117364883, |
|
"learning_rate": 0.00014406130268199233, |
|
"loss": 0.7532, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 2.0309278350515463, |
|
"eval_loss": 1.0197179317474365, |
|
"eval_runtime": 10.496, |
|
"eval_samples_per_second": 11.147, |
|
"eval_steps_per_second": 0.762, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 2.0412371134020617, |
|
"grad_norm": 0.09053569287061691, |
|
"learning_rate": 0.0001425287356321839, |
|
"loss": 0.7911, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 2.0412371134020617, |
|
"eval_loss": 1.0197087526321411, |
|
"eval_runtime": 10.5162, |
|
"eval_samples_per_second": 11.126, |
|
"eval_steps_per_second": 0.761, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 2.051546391752577, |
|
"grad_norm": 0.11090534925460815, |
|
"learning_rate": 0.00014099616858237547, |
|
"loss": 0.7683, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 2.051546391752577, |
|
"eval_loss": 1.0192407369613647, |
|
"eval_runtime": 10.556, |
|
"eval_samples_per_second": 11.084, |
|
"eval_steps_per_second": 0.758, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 2.0618556701030926, |
|
"grad_norm": 0.08406960964202881, |
|
"learning_rate": 0.00013946360153256705, |
|
"loss": 0.7608, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.0618556701030926, |
|
"eval_loss": 1.0188153982162476, |
|
"eval_runtime": 10.559, |
|
"eval_samples_per_second": 11.081, |
|
"eval_steps_per_second": 0.758, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.0721649484536084, |
|
"grad_norm": 0.08882401883602142, |
|
"learning_rate": 0.00013793103448275863, |
|
"loss": 0.8186, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 2.0721649484536084, |
|
"eval_loss": 1.018549919128418, |
|
"eval_runtime": 10.4948, |
|
"eval_samples_per_second": 11.148, |
|
"eval_steps_per_second": 0.762, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 2.082474226804124, |
|
"grad_norm": 0.08813253790140152, |
|
"learning_rate": 0.0001363984674329502, |
|
"loss": 0.8012, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 2.082474226804124, |
|
"eval_loss": 1.0187878608703613, |
|
"eval_runtime": 10.5427, |
|
"eval_samples_per_second": 11.098, |
|
"eval_steps_per_second": 0.759, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 2.0927835051546393, |
|
"grad_norm": 0.0944976881146431, |
|
"learning_rate": 0.0001348659003831418, |
|
"loss": 0.7772, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 2.0927835051546393, |
|
"eval_loss": 1.0195982456207275, |
|
"eval_runtime": 10.5075, |
|
"eval_samples_per_second": 11.135, |
|
"eval_steps_per_second": 0.761, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 2.1030927835051547, |
|
"grad_norm": 0.09477493166923523, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 0.7637, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 2.1030927835051547, |
|
"eval_loss": 1.0205055475234985, |
|
"eval_runtime": 10.5562, |
|
"eval_samples_per_second": 11.084, |
|
"eval_steps_per_second": 0.758, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 2.11340206185567, |
|
"grad_norm": 0.09403283894062042, |
|
"learning_rate": 0.00013180076628352492, |
|
"loss": 0.772, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 2.11340206185567, |
|
"eval_loss": 1.0212863683700562, |
|
"eval_runtime": 10.5507, |
|
"eval_samples_per_second": 11.089, |
|
"eval_steps_per_second": 0.758, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 2.1237113402061856, |
|
"grad_norm": 0.09774558991193771, |
|
"learning_rate": 0.00013026819923371647, |
|
"loss": 0.7282, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 2.1237113402061856, |
|
"eval_loss": 1.022179126739502, |
|
"eval_runtime": 10.4895, |
|
"eval_samples_per_second": 11.154, |
|
"eval_steps_per_second": 0.763, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 2.134020618556701, |
|
"grad_norm": 0.09915328025817871, |
|
"learning_rate": 0.00012873563218390805, |
|
"loss": 0.8647, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 2.134020618556701, |
|
"eval_loss": 1.0221599340438843, |
|
"eval_runtime": 10.3742, |
|
"eval_samples_per_second": 11.278, |
|
"eval_steps_per_second": 0.771, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 2.1443298969072164, |
|
"grad_norm": 0.10348186641931534, |
|
"learning_rate": 0.00012720306513409963, |
|
"loss": 0.817, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 2.1443298969072164, |
|
"eval_loss": 1.021559715270996, |
|
"eval_runtime": 10.5439, |
|
"eval_samples_per_second": 11.097, |
|
"eval_steps_per_second": 0.759, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 2.154639175257732, |
|
"grad_norm": 0.0915810763835907, |
|
"learning_rate": 0.00012567049808429119, |
|
"loss": 0.7587, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 2.154639175257732, |
|
"eval_loss": 1.020798921585083, |
|
"eval_runtime": 10.4998, |
|
"eval_samples_per_second": 11.143, |
|
"eval_steps_per_second": 0.762, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 2.1649484536082473, |
|
"grad_norm": 0.09581980854272842, |
|
"learning_rate": 0.00012413793103448277, |
|
"loss": 0.7774, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.1649484536082473, |
|
"eval_loss": 1.0200207233428955, |
|
"eval_runtime": 10.4597, |
|
"eval_samples_per_second": 11.186, |
|
"eval_steps_per_second": 0.765, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.1752577319587627, |
|
"grad_norm": 0.08651889115571976, |
|
"learning_rate": 0.00012260536398467432, |
|
"loss": 0.8203, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 2.1752577319587627, |
|
"eval_loss": 1.0191607475280762, |
|
"eval_runtime": 10.5201, |
|
"eval_samples_per_second": 11.122, |
|
"eval_steps_per_second": 0.76, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 2.1855670103092786, |
|
"grad_norm": 0.11074655503034592, |
|
"learning_rate": 0.0001210727969348659, |
|
"loss": 0.7586, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 2.1855670103092786, |
|
"eval_loss": 1.0188895463943481, |
|
"eval_runtime": 10.4748, |
|
"eval_samples_per_second": 11.17, |
|
"eval_steps_per_second": 0.764, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 2.195876288659794, |
|
"grad_norm": 0.08488008379936218, |
|
"learning_rate": 0.00011954022988505748, |
|
"loss": 0.8541, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 2.195876288659794, |
|
"eval_loss": 1.0185297727584839, |
|
"eval_runtime": 10.4972, |
|
"eval_samples_per_second": 11.146, |
|
"eval_steps_per_second": 0.762, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 2.2061855670103094, |
|
"grad_norm": 0.0965062826871872, |
|
"learning_rate": 0.00011800766283524905, |
|
"loss": 0.7906, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 2.2061855670103094, |
|
"eval_loss": 1.018255352973938, |
|
"eval_runtime": 10.5601, |
|
"eval_samples_per_second": 11.079, |
|
"eval_steps_per_second": 0.758, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 2.216494845360825, |
|
"grad_norm": 0.08887593448162079, |
|
"learning_rate": 0.00011647509578544063, |
|
"loss": 0.8241, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 2.216494845360825, |
|
"eval_loss": 1.0182160139083862, |
|
"eval_runtime": 10.4248, |
|
"eval_samples_per_second": 11.223, |
|
"eval_steps_per_second": 0.767, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 2.2268041237113403, |
|
"grad_norm": 0.10638058930635452, |
|
"learning_rate": 0.00011494252873563218, |
|
"loss": 0.7893, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 2.2268041237113403, |
|
"eval_loss": 1.018141269683838, |
|
"eval_runtime": 10.518, |
|
"eval_samples_per_second": 11.124, |
|
"eval_steps_per_second": 0.761, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 2.2371134020618557, |
|
"grad_norm": 0.09599471092224121, |
|
"learning_rate": 0.00011340996168582376, |
|
"loss": 0.7547, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 2.2371134020618557, |
|
"eval_loss": 1.0182616710662842, |
|
"eval_runtime": 10.5441, |
|
"eval_samples_per_second": 11.096, |
|
"eval_steps_per_second": 0.759, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 2.247422680412371, |
|
"grad_norm": 0.0928688496351242, |
|
"learning_rate": 0.00011187739463601533, |
|
"loss": 0.778, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 2.247422680412371, |
|
"eval_loss": 1.0184054374694824, |
|
"eval_runtime": 10.4377, |
|
"eval_samples_per_second": 11.209, |
|
"eval_steps_per_second": 0.766, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 2.2577319587628866, |
|
"grad_norm": 0.09548985213041306, |
|
"learning_rate": 0.0001103448275862069, |
|
"loss": 0.8079, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 2.2577319587628866, |
|
"eval_loss": 1.0186175107955933, |
|
"eval_runtime": 10.4148, |
|
"eval_samples_per_second": 11.234, |
|
"eval_steps_per_second": 0.768, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 2.268041237113402, |
|
"grad_norm": 0.08929029107093811, |
|
"learning_rate": 0.00010881226053639849, |
|
"loss": 0.8481, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.268041237113402, |
|
"eval_loss": 1.0190272331237793, |
|
"eval_runtime": 10.6001, |
|
"eval_samples_per_second": 11.038, |
|
"eval_steps_per_second": 0.755, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.2783505154639174, |
|
"grad_norm": 0.10469140112400055, |
|
"learning_rate": 0.00010727969348659004, |
|
"loss": 0.7715, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 2.2783505154639174, |
|
"eval_loss": 1.019405484199524, |
|
"eval_runtime": 10.4972, |
|
"eval_samples_per_second": 11.146, |
|
"eval_steps_per_second": 0.762, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 2.288659793814433, |
|
"grad_norm": 0.10432987660169601, |
|
"learning_rate": 0.00010574712643678162, |
|
"loss": 0.7934, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 2.288659793814433, |
|
"eval_loss": 1.0200779438018799, |
|
"eval_runtime": 10.584, |
|
"eval_samples_per_second": 11.054, |
|
"eval_steps_per_second": 0.756, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 2.2989690721649483, |
|
"grad_norm": 0.10107388347387314, |
|
"learning_rate": 0.00010421455938697317, |
|
"loss": 0.7823, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 2.2989690721649483, |
|
"eval_loss": 1.0203680992126465, |
|
"eval_runtime": 10.4386, |
|
"eval_samples_per_second": 11.208, |
|
"eval_steps_per_second": 0.766, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 2.3092783505154637, |
|
"grad_norm": 0.1003124788403511, |
|
"learning_rate": 0.00010268199233716475, |
|
"loss": 0.7986, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 2.3092783505154637, |
|
"eval_loss": 1.020281434059143, |
|
"eval_runtime": 10.5342, |
|
"eval_samples_per_second": 11.107, |
|
"eval_steps_per_second": 0.759, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 2.319587628865979, |
|
"grad_norm": 0.10325206816196442, |
|
"learning_rate": 0.00010114942528735633, |
|
"loss": 0.7965, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.319587628865979, |
|
"eval_loss": 1.0199090242385864, |
|
"eval_runtime": 10.5283, |
|
"eval_samples_per_second": 11.113, |
|
"eval_steps_per_second": 0.76, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.329896907216495, |
|
"grad_norm": 0.11150746047496796, |
|
"learning_rate": 9.96168582375479e-05, |
|
"loss": 0.8125, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 2.329896907216495, |
|
"eval_loss": 1.0192387104034424, |
|
"eval_runtime": 10.4936, |
|
"eval_samples_per_second": 11.15, |
|
"eval_steps_per_second": 0.762, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 2.3402061855670104, |
|
"grad_norm": 0.10894080251455307, |
|
"learning_rate": 9.808429118773947e-05, |
|
"loss": 0.7532, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 2.3402061855670104, |
|
"eval_loss": 1.018675684928894, |
|
"eval_runtime": 10.5342, |
|
"eval_samples_per_second": 11.107, |
|
"eval_steps_per_second": 0.759, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 2.350515463917526, |
|
"grad_norm": 0.1041807159781456, |
|
"learning_rate": 9.655172413793105e-05, |
|
"loss": 0.8406, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 2.350515463917526, |
|
"eval_loss": 1.018149495124817, |
|
"eval_runtime": 10.5998, |
|
"eval_samples_per_second": 11.038, |
|
"eval_steps_per_second": 0.755, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 2.3608247422680413, |
|
"grad_norm": 0.10462289303541183, |
|
"learning_rate": 9.501915708812261e-05, |
|
"loss": 0.7888, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 2.3608247422680413, |
|
"eval_loss": 1.017836570739746, |
|
"eval_runtime": 10.5236, |
|
"eval_samples_per_second": 11.118, |
|
"eval_steps_per_second": 0.76, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 2.3711340206185567, |
|
"grad_norm": 0.10396596044301987, |
|
"learning_rate": 9.348659003831418e-05, |
|
"loss": 0.7866, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.3711340206185567, |
|
"eval_loss": 1.0178693532943726, |
|
"eval_runtime": 10.56, |
|
"eval_samples_per_second": 11.079, |
|
"eval_steps_per_second": 0.758, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.381443298969072, |
|
"grad_norm": 0.11310282349586487, |
|
"learning_rate": 9.195402298850575e-05, |
|
"loss": 0.7927, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 2.381443298969072, |
|
"eval_loss": 1.0181832313537598, |
|
"eval_runtime": 10.5406, |
|
"eval_samples_per_second": 11.1, |
|
"eval_steps_per_second": 0.759, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 2.3917525773195876, |
|
"grad_norm": 0.10537407547235489, |
|
"learning_rate": 9.042145593869731e-05, |
|
"loss": 0.7635, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 2.3917525773195876, |
|
"eval_loss": 1.018680453300476, |
|
"eval_runtime": 10.4507, |
|
"eval_samples_per_second": 11.195, |
|
"eval_steps_per_second": 0.765, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 2.402061855670103, |
|
"grad_norm": 0.10149942338466644, |
|
"learning_rate": 8.888888888888889e-05, |
|
"loss": 0.809, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 2.402061855670103, |
|
"eval_loss": 1.0191614627838135, |
|
"eval_runtime": 10.4475, |
|
"eval_samples_per_second": 11.199, |
|
"eval_steps_per_second": 0.766, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 2.4123711340206184, |
|
"grad_norm": 0.11062859743833542, |
|
"learning_rate": 8.735632183908047e-05, |
|
"loss": 0.7922, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 2.4123711340206184, |
|
"eval_loss": 1.0194060802459717, |
|
"eval_runtime": 10.4332, |
|
"eval_samples_per_second": 11.214, |
|
"eval_steps_per_second": 0.767, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 2.422680412371134, |
|
"grad_norm": 0.11534405499696732, |
|
"learning_rate": 8.582375478927204e-05, |
|
"loss": 0.8329, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 2.422680412371134, |
|
"eval_loss": 1.0191195011138916, |
|
"eval_runtime": 10.5353, |
|
"eval_samples_per_second": 11.105, |
|
"eval_steps_per_second": 0.759, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 2.4329896907216497, |
|
"grad_norm": 0.13866928219795227, |
|
"learning_rate": 8.42911877394636e-05, |
|
"loss": 0.822, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 2.4329896907216497, |
|
"eval_loss": 1.0185078382492065, |
|
"eval_runtime": 10.4759, |
|
"eval_samples_per_second": 11.168, |
|
"eval_steps_per_second": 0.764, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 2.443298969072165, |
|
"grad_norm": 0.10848218202590942, |
|
"learning_rate": 8.275862068965517e-05, |
|
"loss": 0.7679, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 2.443298969072165, |
|
"eval_loss": 1.0180175304412842, |
|
"eval_runtime": 10.5752, |
|
"eval_samples_per_second": 11.064, |
|
"eval_steps_per_second": 0.756, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 2.4536082474226806, |
|
"grad_norm": 0.10282213240861893, |
|
"learning_rate": 8.122605363984674e-05, |
|
"loss": 0.8445, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 2.4536082474226806, |
|
"eval_loss": 1.0175271034240723, |
|
"eval_runtime": 10.5637, |
|
"eval_samples_per_second": 11.076, |
|
"eval_steps_per_second": 0.757, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 2.463917525773196, |
|
"grad_norm": 0.10641578584909439, |
|
"learning_rate": 7.969348659003832e-05, |
|
"loss": 0.7808, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 2.463917525773196, |
|
"eval_loss": 1.017316460609436, |
|
"eval_runtime": 10.4736, |
|
"eval_samples_per_second": 11.171, |
|
"eval_steps_per_second": 0.764, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 2.4742268041237114, |
|
"grad_norm": 0.10965276509523392, |
|
"learning_rate": 7.81609195402299e-05, |
|
"loss": 0.8193, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.4742268041237114, |
|
"eval_loss": 1.017372965812683, |
|
"eval_runtime": 10.5979, |
|
"eval_samples_per_second": 11.04, |
|
"eval_steps_per_second": 0.755, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.484536082474227, |
|
"grad_norm": 0.09745683521032333, |
|
"learning_rate": 7.662835249042147e-05, |
|
"loss": 0.7624, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 2.484536082474227, |
|
"eval_loss": 1.017683982849121, |
|
"eval_runtime": 10.5349, |
|
"eval_samples_per_second": 11.106, |
|
"eval_steps_per_second": 0.759, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 2.4948453608247423, |
|
"grad_norm": 0.11108940094709396, |
|
"learning_rate": 7.509578544061303e-05, |
|
"loss": 0.7619, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 2.4948453608247423, |
|
"eval_loss": 1.0181565284729004, |
|
"eval_runtime": 10.5099, |
|
"eval_samples_per_second": 11.132, |
|
"eval_steps_per_second": 0.761, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 2.5051546391752577, |
|
"grad_norm": 0.10701552778482437, |
|
"learning_rate": 7.35632183908046e-05, |
|
"loss": 0.7666, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 2.5051546391752577, |
|
"eval_loss": 1.0187397003173828, |
|
"eval_runtime": 10.4726, |
|
"eval_samples_per_second": 11.172, |
|
"eval_steps_per_second": 0.764, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 2.515463917525773, |
|
"grad_norm": 0.11055250465869904, |
|
"learning_rate": 7.203065134099617e-05, |
|
"loss": 0.7969, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 2.515463917525773, |
|
"eval_loss": 1.0191231966018677, |
|
"eval_runtime": 10.5692, |
|
"eval_samples_per_second": 11.07, |
|
"eval_steps_per_second": 0.757, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 2.5257731958762886, |
|
"grad_norm": 0.11214307695627213, |
|
"learning_rate": 7.049808429118773e-05, |
|
"loss": 0.7839, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 2.5257731958762886, |
|
"eval_loss": 1.0191996097564697, |
|
"eval_runtime": 10.5093, |
|
"eval_samples_per_second": 11.133, |
|
"eval_steps_per_second": 0.761, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 2.536082474226804, |
|
"grad_norm": 0.13082517683506012, |
|
"learning_rate": 6.896551724137931e-05, |
|
"loss": 0.819, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 2.536082474226804, |
|
"eval_loss": 1.0188630819320679, |
|
"eval_runtime": 10.5203, |
|
"eval_samples_per_second": 11.121, |
|
"eval_steps_per_second": 0.76, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 2.5463917525773194, |
|
"grad_norm": 0.1120811253786087, |
|
"learning_rate": 6.74329501915709e-05, |
|
"loss": 0.7764, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 2.5463917525773194, |
|
"eval_loss": 1.018404245376587, |
|
"eval_runtime": 10.5867, |
|
"eval_samples_per_second": 11.052, |
|
"eval_steps_per_second": 0.756, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 2.556701030927835, |
|
"grad_norm": 0.09575408697128296, |
|
"learning_rate": 6.590038314176246e-05, |
|
"loss": 0.7836, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 2.556701030927835, |
|
"eval_loss": 1.0178431272506714, |
|
"eval_runtime": 10.4988, |
|
"eval_samples_per_second": 11.144, |
|
"eval_steps_per_second": 0.762, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 2.5670103092783503, |
|
"grad_norm": 0.10654096305370331, |
|
"learning_rate": 6.436781609195403e-05, |
|
"loss": 0.7133, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 2.5670103092783503, |
|
"eval_loss": 1.017632007598877, |
|
"eval_runtime": 10.5716, |
|
"eval_samples_per_second": 11.067, |
|
"eval_steps_per_second": 0.757, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 2.5773195876288657, |
|
"grad_norm": 0.11376085132360458, |
|
"learning_rate": 6.283524904214559e-05, |
|
"loss": 0.7909, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.5773195876288657, |
|
"eval_loss": 1.017664909362793, |
|
"eval_runtime": 10.5556, |
|
"eval_samples_per_second": 11.084, |
|
"eval_steps_per_second": 0.758, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.5876288659793816, |
|
"grad_norm": 0.10257536172866821, |
|
"learning_rate": 6.130268199233716e-05, |
|
"loss": 0.807, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 2.5876288659793816, |
|
"eval_loss": 1.0176854133605957, |
|
"eval_runtime": 10.5392, |
|
"eval_samples_per_second": 11.101, |
|
"eval_steps_per_second": 0.759, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 2.597938144329897, |
|
"grad_norm": 0.117995485663414, |
|
"learning_rate": 5.977011494252874e-05, |
|
"loss": 0.7885, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 2.597938144329897, |
|
"eval_loss": 1.0178779363632202, |
|
"eval_runtime": 10.4833, |
|
"eval_samples_per_second": 11.161, |
|
"eval_steps_per_second": 0.763, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 2.6082474226804124, |
|
"grad_norm": 0.10343543440103531, |
|
"learning_rate": 5.823754789272031e-05, |
|
"loss": 0.8099, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 2.6082474226804124, |
|
"eval_loss": 1.0179591178894043, |
|
"eval_runtime": 10.5098, |
|
"eval_samples_per_second": 11.133, |
|
"eval_steps_per_second": 0.761, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 2.618556701030928, |
|
"grad_norm": 0.10097340494394302, |
|
"learning_rate": 5.670498084291188e-05, |
|
"loss": 0.7874, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 2.618556701030928, |
|
"eval_loss": 1.0180598497390747, |
|
"eval_runtime": 10.4909, |
|
"eval_samples_per_second": 11.153, |
|
"eval_steps_per_second": 0.763, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 2.6288659793814433, |
|
"grad_norm": 0.11207850277423859, |
|
"learning_rate": 5.517241379310345e-05, |
|
"loss": 0.7854, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.6288659793814433, |
|
"eval_loss": 1.0180803537368774, |
|
"eval_runtime": 10.4616, |
|
"eval_samples_per_second": 11.184, |
|
"eval_steps_per_second": 0.765, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.6391752577319587, |
|
"grad_norm": 0.12792158126831055, |
|
"learning_rate": 5.363984674329502e-05, |
|
"loss": 0.767, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 2.6391752577319587, |
|
"eval_loss": 1.017929196357727, |
|
"eval_runtime": 10.4878, |
|
"eval_samples_per_second": 11.156, |
|
"eval_steps_per_second": 0.763, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 2.649484536082474, |
|
"grad_norm": 0.12841103971004486, |
|
"learning_rate": 5.2107279693486586e-05, |
|
"loss": 0.7813, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 2.649484536082474, |
|
"eval_loss": 1.0177621841430664, |
|
"eval_runtime": 10.5109, |
|
"eval_samples_per_second": 11.131, |
|
"eval_steps_per_second": 0.761, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 2.6597938144329896, |
|
"grad_norm": 0.10872387886047363, |
|
"learning_rate": 5.057471264367817e-05, |
|
"loss": 0.8059, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 2.6597938144329896, |
|
"eval_loss": 1.0175319910049438, |
|
"eval_runtime": 10.5047, |
|
"eval_samples_per_second": 11.138, |
|
"eval_steps_per_second": 0.762, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 2.670103092783505, |
|
"grad_norm": 0.11696380376815796, |
|
"learning_rate": 4.904214559386973e-05, |
|
"loss": 0.8015, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 2.670103092783505, |
|
"eval_loss": 1.0173434019088745, |
|
"eval_runtime": 10.6033, |
|
"eval_samples_per_second": 11.034, |
|
"eval_steps_per_second": 0.754, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 2.680412371134021, |
|
"grad_norm": 0.10109103471040726, |
|
"learning_rate": 4.7509578544061307e-05, |
|
"loss": 0.7709, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.680412371134021, |
|
"eval_loss": 1.0171841382980347, |
|
"eval_runtime": 10.5321, |
|
"eval_samples_per_second": 11.109, |
|
"eval_steps_per_second": 0.76, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.6907216494845363, |
|
"grad_norm": 0.10908724367618561, |
|
"learning_rate": 4.597701149425287e-05, |
|
"loss": 0.8385, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 2.6907216494845363, |
|
"eval_loss": 1.0171395540237427, |
|
"eval_runtime": 10.5094, |
|
"eval_samples_per_second": 11.133, |
|
"eval_steps_per_second": 0.761, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 2.7010309278350517, |
|
"grad_norm": 0.1192815825343132, |
|
"learning_rate": 4.4444444444444447e-05, |
|
"loss": 0.7741, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 2.7010309278350517, |
|
"eval_loss": 1.017322063446045, |
|
"eval_runtime": 10.471, |
|
"eval_samples_per_second": 11.174, |
|
"eval_steps_per_second": 0.764, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 2.711340206185567, |
|
"grad_norm": 0.1109393984079361, |
|
"learning_rate": 4.291187739463602e-05, |
|
"loss": 0.788, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 2.711340206185567, |
|
"eval_loss": 1.0174421072006226, |
|
"eval_runtime": 10.5644, |
|
"eval_samples_per_second": 11.075, |
|
"eval_steps_per_second": 0.757, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 2.7216494845360826, |
|
"grad_norm": 0.11348064243793488, |
|
"learning_rate": 4.1379310344827587e-05, |
|
"loss": 0.8168, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 2.7216494845360826, |
|
"eval_loss": 1.0176218748092651, |
|
"eval_runtime": 10.4442, |
|
"eval_samples_per_second": 11.202, |
|
"eval_steps_per_second": 0.766, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 2.731958762886598, |
|
"grad_norm": 0.1007143035531044, |
|
"learning_rate": 3.984674329501916e-05, |
|
"loss": 0.8103, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.731958762886598, |
|
"eval_loss": 1.0178673267364502, |
|
"eval_runtime": 10.543, |
|
"eval_samples_per_second": 11.097, |
|
"eval_steps_per_second": 0.759, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.7422680412371134, |
|
"grad_norm": 0.1139923632144928, |
|
"learning_rate": 3.831417624521073e-05, |
|
"loss": 0.7874, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 2.7422680412371134, |
|
"eval_loss": 1.0180341005325317, |
|
"eval_runtime": 10.5642, |
|
"eval_samples_per_second": 11.075, |
|
"eval_steps_per_second": 0.757, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 2.752577319587629, |
|
"grad_norm": 0.1035841554403305, |
|
"learning_rate": 3.67816091954023e-05, |
|
"loss": 0.783, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 2.752577319587629, |
|
"eval_loss": 1.0181059837341309, |
|
"eval_runtime": 10.4791, |
|
"eval_samples_per_second": 11.165, |
|
"eval_steps_per_second": 0.763, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 2.7628865979381443, |
|
"grad_norm": 0.10244117677211761, |
|
"learning_rate": 3.5249042145593867e-05, |
|
"loss": 0.788, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 2.7628865979381443, |
|
"eval_loss": 1.0181044340133667, |
|
"eval_runtime": 10.4053, |
|
"eval_samples_per_second": 11.244, |
|
"eval_steps_per_second": 0.769, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 2.7731958762886597, |
|
"grad_norm": 0.11186489462852478, |
|
"learning_rate": 3.371647509578545e-05, |
|
"loss": 0.7462, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 2.7731958762886597, |
|
"eval_loss": 1.017915964126587, |
|
"eval_runtime": 10.4654, |
|
"eval_samples_per_second": 11.18, |
|
"eval_steps_per_second": 0.764, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 2.783505154639175, |
|
"grad_norm": 0.11139753460884094, |
|
"learning_rate": 3.218390804597701e-05, |
|
"loss": 0.7455, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.783505154639175, |
|
"eval_loss": 1.0177446603775024, |
|
"eval_runtime": 10.4353, |
|
"eval_samples_per_second": 11.212, |
|
"eval_steps_per_second": 0.767, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.7938144329896906, |
|
"grad_norm": 0.11244425177574158, |
|
"learning_rate": 3.065134099616858e-05, |
|
"loss": 0.7122, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 2.7938144329896906, |
|
"eval_loss": 1.017569661140442, |
|
"eval_runtime": 10.5363, |
|
"eval_samples_per_second": 11.104, |
|
"eval_steps_per_second": 0.759, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 2.804123711340206, |
|
"grad_norm": 0.10829540342092514, |
|
"learning_rate": 2.9118773946360157e-05, |
|
"loss": 0.7723, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 2.804123711340206, |
|
"eval_loss": 1.0173259973526, |
|
"eval_runtime": 10.4321, |
|
"eval_samples_per_second": 11.215, |
|
"eval_steps_per_second": 0.767, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 2.8144329896907214, |
|
"grad_norm": 0.11467045545578003, |
|
"learning_rate": 2.7586206896551727e-05, |
|
"loss": 0.7949, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 2.8144329896907214, |
|
"eval_loss": 1.0169979333877563, |
|
"eval_runtime": 10.5217, |
|
"eval_samples_per_second": 11.12, |
|
"eval_steps_per_second": 0.76, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 2.824742268041237, |
|
"grad_norm": 0.11706111580133438, |
|
"learning_rate": 2.6053639846743293e-05, |
|
"loss": 0.7676, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 2.824742268041237, |
|
"eval_loss": 1.0167900323867798, |
|
"eval_runtime": 10.4696, |
|
"eval_samples_per_second": 11.175, |
|
"eval_steps_per_second": 0.764, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 2.8350515463917527, |
|
"grad_norm": 0.10488202422857285, |
|
"learning_rate": 2.4521072796934867e-05, |
|
"loss": 0.8101, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.8350515463917527, |
|
"eval_loss": 1.0166265964508057, |
|
"eval_runtime": 10.5807, |
|
"eval_samples_per_second": 11.058, |
|
"eval_steps_per_second": 0.756, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.845360824742268, |
|
"grad_norm": 0.09602247178554535, |
|
"learning_rate": 2.2988505747126437e-05, |
|
"loss": 0.8105, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 2.845360824742268, |
|
"eval_loss": 1.0164811611175537, |
|
"eval_runtime": 10.5306, |
|
"eval_samples_per_second": 11.111, |
|
"eval_steps_per_second": 0.76, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 2.8556701030927836, |
|
"grad_norm": 0.10788743197917938, |
|
"learning_rate": 2.145593869731801e-05, |
|
"loss": 0.8147, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 2.8556701030927836, |
|
"eval_loss": 1.0163748264312744, |
|
"eval_runtime": 10.46, |
|
"eval_samples_per_second": 11.186, |
|
"eval_steps_per_second": 0.765, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 2.865979381443299, |
|
"grad_norm": 0.1024608314037323, |
|
"learning_rate": 1.992337164750958e-05, |
|
"loss": 0.8111, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 2.865979381443299, |
|
"eval_loss": 1.0162334442138672, |
|
"eval_runtime": 10.4655, |
|
"eval_samples_per_second": 11.18, |
|
"eval_steps_per_second": 0.764, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 2.8762886597938144, |
|
"grad_norm": 0.11023057997226715, |
|
"learning_rate": 1.839080459770115e-05, |
|
"loss": 0.7759, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 2.8762886597938144, |
|
"eval_loss": 1.0161850452423096, |
|
"eval_runtime": 10.5199, |
|
"eval_samples_per_second": 11.122, |
|
"eval_steps_per_second": 0.76, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 2.88659793814433, |
|
"grad_norm": 0.10540882498025894, |
|
"learning_rate": 1.6858237547892723e-05, |
|
"loss": 0.8403, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.88659793814433, |
|
"eval_loss": 1.0160962343215942, |
|
"eval_runtime": 10.5013, |
|
"eval_samples_per_second": 11.141, |
|
"eval_steps_per_second": 0.762, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.8969072164948453, |
|
"grad_norm": 0.10188038647174835, |
|
"learning_rate": 1.532567049808429e-05, |
|
"loss": 0.8001, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 2.8969072164948453, |
|
"eval_loss": 1.016011357307434, |
|
"eval_runtime": 10.5246, |
|
"eval_samples_per_second": 11.117, |
|
"eval_steps_per_second": 0.76, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 2.9072164948453607, |
|
"grad_norm": 0.1036670058965683, |
|
"learning_rate": 1.3793103448275863e-05, |
|
"loss": 0.7354, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 2.9072164948453607, |
|
"eval_loss": 1.0159510374069214, |
|
"eval_runtime": 10.5046, |
|
"eval_samples_per_second": 11.138, |
|
"eval_steps_per_second": 0.762, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 2.917525773195876, |
|
"grad_norm": 0.10377184301614761, |
|
"learning_rate": 1.2260536398467433e-05, |
|
"loss": 0.7603, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 2.917525773195876, |
|
"eval_loss": 1.015906810760498, |
|
"eval_runtime": 10.5281, |
|
"eval_samples_per_second": 11.113, |
|
"eval_steps_per_second": 0.76, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 2.927835051546392, |
|
"grad_norm": 0.10949227213859558, |
|
"learning_rate": 1.0727969348659005e-05, |
|
"loss": 0.7424, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 2.927835051546392, |
|
"eval_loss": 1.015905499458313, |
|
"eval_runtime": 10.4748, |
|
"eval_samples_per_second": 11.17, |
|
"eval_steps_per_second": 0.764, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 2.9381443298969074, |
|
"grad_norm": 0.11446211487054825, |
|
"learning_rate": 9.195402298850575e-06, |
|
"loss": 0.8042, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.9381443298969074, |
|
"eval_loss": 1.0158897638320923, |
|
"eval_runtime": 10.4899, |
|
"eval_samples_per_second": 11.154, |
|
"eval_steps_per_second": 0.763, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.948453608247423, |
|
"grad_norm": 0.10691644251346588, |
|
"learning_rate": 7.662835249042145e-06, |
|
"loss": 0.7321, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 2.948453608247423, |
|
"eval_loss": 1.0159013271331787, |
|
"eval_runtime": 10.4448, |
|
"eval_samples_per_second": 11.202, |
|
"eval_steps_per_second": 0.766, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 2.9587628865979383, |
|
"grad_norm": 0.09944912046194077, |
|
"learning_rate": 6.130268199233717e-06, |
|
"loss": 0.7766, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 2.9587628865979383, |
|
"eval_loss": 1.015924334526062, |
|
"eval_runtime": 10.4962, |
|
"eval_samples_per_second": 11.147, |
|
"eval_steps_per_second": 0.762, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 2.9690721649484537, |
|
"grad_norm": 0.104118213057518, |
|
"learning_rate": 4.5977011494252875e-06, |
|
"loss": 0.7389, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 2.9690721649484537, |
|
"eval_loss": 1.0159478187561035, |
|
"eval_runtime": 10.535, |
|
"eval_samples_per_second": 11.106, |
|
"eval_steps_per_second": 0.759, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 2.979381443298969, |
|
"grad_norm": 0.117298923432827, |
|
"learning_rate": 3.0651340996168583e-06, |
|
"loss": 0.7534, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 2.979381443298969, |
|
"eval_loss": 1.0159544944763184, |
|
"eval_runtime": 10.4729, |
|
"eval_samples_per_second": 11.172, |
|
"eval_steps_per_second": 0.764, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 2.9896907216494846, |
|
"grad_norm": 0.1034502387046814, |
|
"learning_rate": 1.5325670498084292e-06, |
|
"loss": 0.7624, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.9896907216494846, |
|
"eval_loss": 1.0159491300582886, |
|
"eval_runtime": 10.4689, |
|
"eval_samples_per_second": 11.176, |
|
"eval_steps_per_second": 0.764, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.18611639738082886, |
|
"learning_rate": 0.0, |
|
"loss": 0.7573, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 1.0159794092178345, |
|
"eval_runtime": 10.4782, |
|
"eval_samples_per_second": 11.166, |
|
"eval_steps_per_second": 0.763, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 291, |
|
"total_flos": 1.0719943700093338e+17, |
|
"train_loss": 0.9805978040924597, |
|
"train_runtime": 5759.6176, |
|
"train_samples_per_second": 1.606, |
|
"train_steps_per_second": 0.051 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 291, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 10, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.0719943700093338e+17, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|