|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 20.0, |
|
"eval_steps": 500, |
|
"global_step": 372040, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.1385914087295532, |
|
"learning_rate": 3.125e-06, |
|
"loss": 7.5337, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.8339954614639282, |
|
"learning_rate": 6.25e-06, |
|
"loss": 5.7974, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.7770102024078369, |
|
"learning_rate": 9.375000000000001e-06, |
|
"loss": 5.3786, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.8822804689407349, |
|
"learning_rate": 1.25e-05, |
|
"loss": 5.1546, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.9704174995422363, |
|
"learning_rate": 1.5625e-05, |
|
"loss": 5.0064, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.011856198310852, |
|
"learning_rate": 1.8750000000000002e-05, |
|
"loss": 4.8697, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.127143383026123, |
|
"learning_rate": 2.1875e-05, |
|
"loss": 4.7448, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.1060676574707031, |
|
"learning_rate": 2.5e-05, |
|
"loss": 4.6483, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.0898035764694214, |
|
"learning_rate": 2.8125000000000003e-05, |
|
"loss": 4.554, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.0667864084243774, |
|
"learning_rate": 3.125e-05, |
|
"loss": 4.4709, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.0186491012573242, |
|
"learning_rate": 3.4375e-05, |
|
"loss": 4.4155, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.0487744808197021, |
|
"learning_rate": 3.7496875e-05, |
|
"loss": 4.3425, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.0958940982818604, |
|
"learning_rate": 4.0621875e-05, |
|
"loss": 4.2827, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.0781230926513672, |
|
"learning_rate": 4.374375e-05, |
|
"loss": 4.2321, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.0080424547195435, |
|
"learning_rate": 4.686875e-05, |
|
"loss": 4.1839, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.0729751586914062, |
|
"learning_rate": 4.999375e-05, |
|
"loss": 4.1402, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.9728330373764038, |
|
"learning_rate": 5.311875000000001e-05, |
|
"loss": 4.0886, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.982369065284729, |
|
"learning_rate": 5.6240625e-05, |
|
"loss": 4.0517, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.30857265198935596, |
|
"eval_loss": 4.261654376983643, |
|
"eval_runtime": 152.7298, |
|
"eval_samples_per_second": 379.238, |
|
"eval_steps_per_second": 5.932, |
|
"step": 18602 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.9907498955726624, |
|
"learning_rate": 5.93625e-05, |
|
"loss": 4.0017, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 1.011263370513916, |
|
"learning_rate": 6.24875e-05, |
|
"loss": 3.9591, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.9945884346961975, |
|
"learning_rate": 6.56125e-05, |
|
"loss": 3.919, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.9863882064819336, |
|
"learning_rate": 6.8734375e-05, |
|
"loss": 3.8924, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 1.0095977783203125, |
|
"learning_rate": 7.185625e-05, |
|
"loss": 3.8609, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.9618180394172668, |
|
"learning_rate": 7.498125e-05, |
|
"loss": 3.8246, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 1.0228387117385864, |
|
"learning_rate": 7.8103125e-05, |
|
"loss": 3.7991, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.953283965587616, |
|
"learning_rate": 8.122500000000001e-05, |
|
"loss": 3.7693, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 1.013651967048645, |
|
"learning_rate": 8.435e-05, |
|
"loss": 3.7465, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.9061474800109863, |
|
"learning_rate": 8.7471875e-05, |
|
"loss": 3.7264, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.892005205154419, |
|
"learning_rate": 9.0596875e-05, |
|
"loss": 3.7033, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.8759484887123108, |
|
"learning_rate": 9.3721875e-05, |
|
"loss": 3.6875, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.8649481534957886, |
|
"learning_rate": 9.684375e-05, |
|
"loss": 3.6669, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.9080563187599182, |
|
"learning_rate": 9.996875e-05, |
|
"loss": 3.655, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.8319312930107117, |
|
"learning_rate": 9.970885778143747e-05, |
|
"loss": 3.6316, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.8593823909759521, |
|
"learning_rate": 9.941506881543349e-05, |
|
"loss": 3.618, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.8492996692657471, |
|
"learning_rate": 9.912098576638043e-05, |
|
"loss": 3.5981, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.8345481753349304, |
|
"learning_rate": 9.882690271732737e-05, |
|
"loss": 3.5809, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.92176753282547, |
|
"learning_rate": 9.853281966827432e-05, |
|
"loss": 3.5614, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.36166690750187913, |
|
"eval_loss": 3.7325448989868164, |
|
"eval_runtime": 154.634, |
|
"eval_samples_per_second": 374.568, |
|
"eval_steps_per_second": 5.859, |
|
"step": 37204 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 0.8371630907058716, |
|
"learning_rate": 9.823903070227034e-05, |
|
"loss": 3.5263, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 0.8102852702140808, |
|
"learning_rate": 9.794494765321727e-05, |
|
"loss": 3.5146, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 0.8165059089660645, |
|
"learning_rate": 9.765086460416421e-05, |
|
"loss": 3.504, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.8198731541633606, |
|
"learning_rate": 9.735707563816023e-05, |
|
"loss": 3.4944, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 0.8228951096534729, |
|
"learning_rate": 9.706299258910717e-05, |
|
"loss": 3.4795, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 0.8009123802185059, |
|
"learning_rate": 9.676890954005412e-05, |
|
"loss": 3.4731, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 0.7662414312362671, |
|
"learning_rate": 9.647512057405012e-05, |
|
"loss": 3.4634, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 0.7814858555793762, |
|
"learning_rate": 9.618103752499706e-05, |
|
"loss": 3.4628, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 0.7715092301368713, |
|
"learning_rate": 9.588695447594401e-05, |
|
"loss": 3.451, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 0.7964122891426086, |
|
"learning_rate": 9.559287142689096e-05, |
|
"loss": 3.4396, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 0.7903115153312683, |
|
"learning_rate": 9.529908246088695e-05, |
|
"loss": 3.4363, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 0.7684825658798218, |
|
"learning_rate": 9.50049994118339e-05, |
|
"loss": 3.4255, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 0.774431049823761, |
|
"learning_rate": 9.47112104458299e-05, |
|
"loss": 3.4233, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 0.7562047839164734, |
|
"learning_rate": 9.441712739677684e-05, |
|
"loss": 3.407, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.8174190521240234, |
|
"learning_rate": 9.412333843077286e-05, |
|
"loss": 3.4048, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 0.7771363258361816, |
|
"learning_rate": 9.382954946476885e-05, |
|
"loss": 3.3993, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 0.8073471188545227, |
|
"learning_rate": 9.35354664157158e-05, |
|
"loss": 3.3877, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.7597507834434509, |
|
"learning_rate": 9.324138336666275e-05, |
|
"loss": 3.3871, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.3793839146396705, |
|
"eval_loss": 3.5926387310028076, |
|
"eval_runtime": 154.4428, |
|
"eval_samples_per_second": 375.032, |
|
"eval_steps_per_second": 5.866, |
|
"step": 55806 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 0.7489961385726929, |
|
"learning_rate": 9.294730031760969e-05, |
|
"loss": 3.3712, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 0.7571219205856323, |
|
"learning_rate": 9.26535113516057e-05, |
|
"loss": 3.328, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 0.739839494228363, |
|
"learning_rate": 9.23597223856017e-05, |
|
"loss": 3.3289, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 0.7352610230445862, |
|
"learning_rate": 9.206563933654865e-05, |
|
"loss": 3.328, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 0.7837926745414734, |
|
"learning_rate": 9.17715562874956e-05, |
|
"loss": 3.3302, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 0.7600297927856445, |
|
"learning_rate": 9.147747323844255e-05, |
|
"loss": 3.321, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 0.7775222063064575, |
|
"learning_rate": 9.118368427243854e-05, |
|
"loss": 3.3186, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 0.7667526006698608, |
|
"learning_rate": 9.088960122338549e-05, |
|
"loss": 3.321, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 0.75556880235672, |
|
"learning_rate": 9.059551817433244e-05, |
|
"loss": 3.3154, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 0.7374812364578247, |
|
"learning_rate": 9.030172920832843e-05, |
|
"loss": 3.3126, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 0.7322445511817932, |
|
"learning_rate": 9.000764615927539e-05, |
|
"loss": 3.3114, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.7468699812889099, |
|
"learning_rate": 8.971385719327139e-05, |
|
"loss": 3.3054, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 0.7796970009803772, |
|
"learning_rate": 8.941977414421833e-05, |
|
"loss": 3.3025, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 0.7635445594787598, |
|
"learning_rate": 8.912598517821433e-05, |
|
"loss": 3.2961, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 0.7578097581863403, |
|
"learning_rate": 8.883190212916128e-05, |
|
"loss": 3.2999, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 0.7045617699623108, |
|
"learning_rate": 8.853781908010823e-05, |
|
"loss": 3.2919, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 0.7356869578361511, |
|
"learning_rate": 8.824373603105518e-05, |
|
"loss": 3.2901, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 0.7233588099479675, |
|
"learning_rate": 8.794994706505117e-05, |
|
"loss": 3.2872, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 0.7042175531387329, |
|
"learning_rate": 8.765615809904718e-05, |
|
"loss": 3.2873, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.38890029805120985, |
|
"eval_loss": 3.490251064300537, |
|
"eval_runtime": 153.9343, |
|
"eval_samples_per_second": 376.271, |
|
"eval_steps_per_second": 5.886, |
|
"step": 74408 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 0.7667938470840454, |
|
"learning_rate": 8.736207504999411e-05, |
|
"loss": 3.2467, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"grad_norm": 0.7835637927055359, |
|
"learning_rate": 8.706799200094107e-05, |
|
"loss": 3.2311, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 0.773997962474823, |
|
"learning_rate": 8.677390895188802e-05, |
|
"loss": 3.2309, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 0.7710655331611633, |
|
"learning_rate": 8.648011998588402e-05, |
|
"loss": 3.2363, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 0.7279992699623108, |
|
"learning_rate": 8.618633101988002e-05, |
|
"loss": 3.232, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"grad_norm": 0.7493019700050354, |
|
"learning_rate": 8.589224797082696e-05, |
|
"loss": 3.2336, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"grad_norm": 0.7595520615577698, |
|
"learning_rate": 8.559816492177391e-05, |
|
"loss": 3.2314, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 4.41, |
|
"grad_norm": 0.7674165368080139, |
|
"learning_rate": 8.530408187272086e-05, |
|
"loss": 3.2277, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 0.7552059292793274, |
|
"learning_rate": 8.501029290671685e-05, |
|
"loss": 3.2294, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"grad_norm": 0.7497634291648865, |
|
"learning_rate": 8.471650394071287e-05, |
|
"loss": 3.2297, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 4.57, |
|
"grad_norm": 0.7449929714202881, |
|
"learning_rate": 8.442242089165981e-05, |
|
"loss": 3.2281, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 0.7487680315971375, |
|
"learning_rate": 8.412833784260675e-05, |
|
"loss": 3.2247, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"grad_norm": 0.7540663480758667, |
|
"learning_rate": 8.38342547935537e-05, |
|
"loss": 3.2283, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"grad_norm": 0.7613685727119446, |
|
"learning_rate": 8.354046582754971e-05, |
|
"loss": 3.2239, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"grad_norm": 0.7112619876861572, |
|
"learning_rate": 8.324638277849665e-05, |
|
"loss": 3.2241, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"grad_norm": 0.7479850649833679, |
|
"learning_rate": 8.295229972944359e-05, |
|
"loss": 3.2193, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 4.89, |
|
"grad_norm": 0.7664503455162048, |
|
"learning_rate": 8.265821668039054e-05, |
|
"loss": 3.2191, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"grad_norm": 0.7236024737358093, |
|
"learning_rate": 8.23641336313375e-05, |
|
"loss": 3.2178, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.7357888221740723, |
|
"learning_rate": 8.20703446653335e-05, |
|
"loss": 3.2166, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.39298697266241545, |
|
"eval_loss": 3.4704902172088623, |
|
"eval_runtime": 154.103, |
|
"eval_samples_per_second": 375.859, |
|
"eval_steps_per_second": 5.879, |
|
"step": 93010 |
|
}, |
|
{ |
|
"epoch": 5.05, |
|
"grad_norm": 0.7487916946411133, |
|
"learning_rate": 8.177626161628044e-05, |
|
"loss": 3.1601, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 5.11, |
|
"grad_norm": 0.7451751232147217, |
|
"learning_rate": 8.148247265027644e-05, |
|
"loss": 3.1606, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 5.16, |
|
"grad_norm": 0.7442896962165833, |
|
"learning_rate": 8.118838960122339e-05, |
|
"loss": 3.1632, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 5.21, |
|
"grad_norm": 0.9075034260749817, |
|
"learning_rate": 8.089430655217034e-05, |
|
"loss": 3.1711, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 5.27, |
|
"grad_norm": 0.7450466752052307, |
|
"learning_rate": 8.06002235031173e-05, |
|
"loss": 3.1694, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 5.32, |
|
"grad_norm": 0.752282977104187, |
|
"learning_rate": 8.030643453711328e-05, |
|
"loss": 3.1673, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 5.38, |
|
"grad_norm": 0.742364764213562, |
|
"learning_rate": 8.001235148806023e-05, |
|
"loss": 3.1734, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 5.43, |
|
"grad_norm": 0.7386059165000916, |
|
"learning_rate": 7.971826843900719e-05, |
|
"loss": 3.1743, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 5.48, |
|
"grad_norm": 0.7800270915031433, |
|
"learning_rate": 7.942447947300318e-05, |
|
"loss": 3.1678, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 5.54, |
|
"grad_norm": 0.7261184453964233, |
|
"learning_rate": 7.913039642395013e-05, |
|
"loss": 3.1706, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 5.59, |
|
"grad_norm": 0.7511762976646423, |
|
"learning_rate": 7.883631337489708e-05, |
|
"loss": 3.1692, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"grad_norm": 0.7737910151481628, |
|
"learning_rate": 7.854223032584402e-05, |
|
"loss": 3.1706, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 5.7, |
|
"grad_norm": 0.7110986113548279, |
|
"learning_rate": 7.824844135984002e-05, |
|
"loss": 3.1682, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"grad_norm": 0.7741941213607788, |
|
"learning_rate": 7.795435831078697e-05, |
|
"loss": 3.1632, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 5.81, |
|
"grad_norm": 0.7159838676452637, |
|
"learning_rate": 7.766056934478297e-05, |
|
"loss": 3.1684, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 5.86, |
|
"grad_norm": 0.7344425916671753, |
|
"learning_rate": 7.736648629572993e-05, |
|
"loss": 3.1628, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 5.91, |
|
"grad_norm": 0.7268814444541931, |
|
"learning_rate": 7.707240324667686e-05, |
|
"loss": 3.1657, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 5.97, |
|
"grad_norm": 0.7119531631469727, |
|
"learning_rate": 7.677861428067287e-05, |
|
"loss": 3.1683, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.3964901341200214, |
|
"eval_loss": 3.4386308193206787, |
|
"eval_runtime": 154.2891, |
|
"eval_samples_per_second": 375.406, |
|
"eval_steps_per_second": 5.872, |
|
"step": 111612 |
|
}, |
|
{ |
|
"epoch": 6.02, |
|
"grad_norm": 0.7270282506942749, |
|
"learning_rate": 7.648482531466886e-05, |
|
"loss": 3.1402, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 6.07, |
|
"grad_norm": 0.759936511516571, |
|
"learning_rate": 7.619074226561581e-05, |
|
"loss": 3.1064, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 6.13, |
|
"grad_norm": 0.7575330138206482, |
|
"learning_rate": 7.589665921656276e-05, |
|
"loss": 3.1131, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 6.18, |
|
"grad_norm": 0.7685191631317139, |
|
"learning_rate": 7.560257616750971e-05, |
|
"loss": 3.1157, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 6.24, |
|
"grad_norm": 0.7537042498588562, |
|
"learning_rate": 7.53087872015057e-05, |
|
"loss": 3.1189, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 6.29, |
|
"grad_norm": 0.7388986945152283, |
|
"learning_rate": 7.501470415245265e-05, |
|
"loss": 3.119, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 6.34, |
|
"grad_norm": 0.7611464262008667, |
|
"learning_rate": 7.47206211033996e-05, |
|
"loss": 3.1211, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"grad_norm": 0.7273160815238953, |
|
"learning_rate": 7.442653805434656e-05, |
|
"loss": 3.1224, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 6.45, |
|
"grad_norm": 0.7437482476234436, |
|
"learning_rate": 7.413274908834256e-05, |
|
"loss": 3.1248, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 0.7418404221534729, |
|
"learning_rate": 7.38386660392895e-05, |
|
"loss": 3.1241, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 6.56, |
|
"grad_norm": 0.7766395807266235, |
|
"learning_rate": 7.354458299023645e-05, |
|
"loss": 3.1262, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 6.61, |
|
"grad_norm": 0.7430386543273926, |
|
"learning_rate": 7.325049994118339e-05, |
|
"loss": 3.1262, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 6.67, |
|
"grad_norm": 0.7568381428718567, |
|
"learning_rate": 7.29567109751794e-05, |
|
"loss": 3.1218, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 6.72, |
|
"grad_norm": 0.742994487285614, |
|
"learning_rate": 7.266292200917539e-05, |
|
"loss": 3.1226, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 6.77, |
|
"grad_norm": 0.7728854417800903, |
|
"learning_rate": 7.236883896012234e-05, |
|
"loss": 3.1304, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 6.83, |
|
"grad_norm": 0.7517836093902588, |
|
"learning_rate": 7.207504999411833e-05, |
|
"loss": 3.1249, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 6.88, |
|
"grad_norm": 0.7383127808570862, |
|
"learning_rate": 7.178096694506528e-05, |
|
"loss": 3.1212, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 6.93, |
|
"grad_norm": 0.7259215712547302, |
|
"learning_rate": 7.148717797906129e-05, |
|
"loss": 3.1247, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 6.99, |
|
"grad_norm": 0.7265549898147583, |
|
"learning_rate": 7.119309493000824e-05, |
|
"loss": 3.122, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.3987158276834467, |
|
"eval_loss": 3.423013687133789, |
|
"eval_runtime": 154.1006, |
|
"eval_samples_per_second": 375.865, |
|
"eval_steps_per_second": 5.879, |
|
"step": 130214 |
|
}, |
|
{ |
|
"epoch": 7.04, |
|
"grad_norm": 0.757236897945404, |
|
"learning_rate": 7.089901188095519e-05, |
|
"loss": 3.0764, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 7.1, |
|
"grad_norm": 0.7935017347335815, |
|
"learning_rate": 7.060492883190213e-05, |
|
"loss": 3.0706, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 7.15, |
|
"grad_norm": 0.7874728441238403, |
|
"learning_rate": 7.031084578284908e-05, |
|
"loss": 3.076, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 7.2, |
|
"grad_norm": 0.7797748446464539, |
|
"learning_rate": 7.001705681684508e-05, |
|
"loss": 3.073, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 7.26, |
|
"grad_norm": 0.7578076124191284, |
|
"learning_rate": 6.972326785084107e-05, |
|
"loss": 3.077, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 7.31, |
|
"grad_norm": 0.756395161151886, |
|
"learning_rate": 6.942918480178802e-05, |
|
"loss": 3.0835, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 7.36, |
|
"grad_norm": 0.720391035079956, |
|
"learning_rate": 6.913539583578404e-05, |
|
"loss": 3.0818, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 7.42, |
|
"grad_norm": 0.7686936259269714, |
|
"learning_rate": 6.884131278673098e-05, |
|
"loss": 3.0802, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 7.47, |
|
"grad_norm": 0.7382326722145081, |
|
"learning_rate": 6.854722973767792e-05, |
|
"loss": 3.0847, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 7.53, |
|
"grad_norm": 0.7340770959854126, |
|
"learning_rate": 6.825314668862487e-05, |
|
"loss": 3.0861, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 7.58, |
|
"grad_norm": 0.7532581090927124, |
|
"learning_rate": 6.795935772262087e-05, |
|
"loss": 3.0868, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 7.63, |
|
"grad_norm": 0.7558372020721436, |
|
"learning_rate": 6.766527467356781e-05, |
|
"loss": 3.0869, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 7.69, |
|
"grad_norm": 0.7301390767097473, |
|
"learning_rate": 6.737119162451476e-05, |
|
"loss": 3.0858, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 7.74, |
|
"grad_norm": 0.740253210067749, |
|
"learning_rate": 6.707740265851076e-05, |
|
"loss": 3.0902, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 7.79, |
|
"grad_norm": 0.765359103679657, |
|
"learning_rate": 6.678331960945772e-05, |
|
"loss": 3.0886, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 7.85, |
|
"grad_norm": 0.7424470782279968, |
|
"learning_rate": 6.648953064345372e-05, |
|
"loss": 3.0862, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 7.9, |
|
"grad_norm": 0.7344245314598083, |
|
"learning_rate": 6.619544759440066e-05, |
|
"loss": 3.0876, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 7.96, |
|
"grad_norm": 0.7477200627326965, |
|
"learning_rate": 6.590165862839667e-05, |
|
"loss": 3.0883, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.4019737201422278, |
|
"eval_loss": 3.410255193710327, |
|
"eval_runtime": 153.6826, |
|
"eval_samples_per_second": 376.887, |
|
"eval_steps_per_second": 5.895, |
|
"step": 148816 |
|
}, |
|
{ |
|
"epoch": 8.01, |
|
"grad_norm": 0.7679169774055481, |
|
"learning_rate": 6.560757557934361e-05, |
|
"loss": 3.0781, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 8.06, |
|
"grad_norm": 0.7777008414268494, |
|
"learning_rate": 6.531349253029055e-05, |
|
"loss": 3.0328, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 8.12, |
|
"grad_norm": 0.7723888754844666, |
|
"learning_rate": 6.50194094812375e-05, |
|
"loss": 3.0361, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 8.17, |
|
"grad_norm": 0.7410120964050293, |
|
"learning_rate": 6.47256205152335e-05, |
|
"loss": 3.0396, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 8.22, |
|
"grad_norm": 0.775367021560669, |
|
"learning_rate": 6.443153746618044e-05, |
|
"loss": 3.0435, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"grad_norm": 0.7430744767189026, |
|
"learning_rate": 6.413774850017646e-05, |
|
"loss": 3.0487, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 8.33, |
|
"grad_norm": 0.7859558463096619, |
|
"learning_rate": 6.38436654511234e-05, |
|
"loss": 3.0461, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 8.39, |
|
"grad_norm": 0.7532193064689636, |
|
"learning_rate": 6.354958240207035e-05, |
|
"loss": 3.0479, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 8.44, |
|
"grad_norm": 0.7526784539222717, |
|
"learning_rate": 6.325579343606635e-05, |
|
"loss": 3.0521, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 8.49, |
|
"grad_norm": 0.7687983512878418, |
|
"learning_rate": 6.296171038701329e-05, |
|
"loss": 3.0505, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 8.55, |
|
"grad_norm": 0.7615057229995728, |
|
"learning_rate": 6.26679214210093e-05, |
|
"loss": 3.0543, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"grad_norm": 0.7644217014312744, |
|
"learning_rate": 6.237383837195624e-05, |
|
"loss": 3.0556, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 8.65, |
|
"grad_norm": 0.7489671111106873, |
|
"learning_rate": 6.20797553229032e-05, |
|
"loss": 3.0523, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 8.71, |
|
"grad_norm": 0.7821494936943054, |
|
"learning_rate": 6.17859663568992e-05, |
|
"loss": 3.0606, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 8.76, |
|
"grad_norm": 0.7804410457611084, |
|
"learning_rate": 6.149188330784615e-05, |
|
"loss": 3.0582, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 8.82, |
|
"grad_norm": 0.7675402164459229, |
|
"learning_rate": 6.119809434184214e-05, |
|
"loss": 3.0566, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 8.87, |
|
"grad_norm": 0.7614428400993347, |
|
"learning_rate": 6.090401129278909e-05, |
|
"loss": 3.0547, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 8.92, |
|
"grad_norm": 0.7919172644615173, |
|
"learning_rate": 6.0609928243736035e-05, |
|
"loss": 3.0551, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 8.98, |
|
"grad_norm": 0.7789074182510376, |
|
"learning_rate": 6.031613927773203e-05, |
|
"loss": 3.059, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.4021935956374767, |
|
"eval_loss": 3.4160611629486084, |
|
"eval_runtime": 154.2434, |
|
"eval_samples_per_second": 375.517, |
|
"eval_steps_per_second": 5.874, |
|
"step": 167418 |
|
}, |
|
{ |
|
"epoch": 9.03, |
|
"grad_norm": 0.796655535697937, |
|
"learning_rate": 6.002205622867898e-05, |
|
"loss": 3.0247, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 9.09, |
|
"grad_norm": 0.7840985059738159, |
|
"learning_rate": 5.972797317962593e-05, |
|
"loss": 3.0027, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 9.14, |
|
"grad_norm": 0.8092240691184998, |
|
"learning_rate": 5.943418421362194e-05, |
|
"loss": 3.0089, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 9.19, |
|
"grad_norm": 0.7674739360809326, |
|
"learning_rate": 5.9140101164568875e-05, |
|
"loss": 3.0123, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"grad_norm": 0.7991486191749573, |
|
"learning_rate": 5.884631219856488e-05, |
|
"loss": 3.0115, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 9.3, |
|
"grad_norm": 0.803733766078949, |
|
"learning_rate": 5.855222914951183e-05, |
|
"loss": 3.0176, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 9.35, |
|
"grad_norm": 0.7872689962387085, |
|
"learning_rate": 5.8258440183507825e-05, |
|
"loss": 3.0212, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 9.41, |
|
"grad_norm": 0.7754504680633545, |
|
"learning_rate": 5.796435713445477e-05, |
|
"loss": 3.0189, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 9.46, |
|
"grad_norm": 0.7785583138465881, |
|
"learning_rate": 5.767027408540172e-05, |
|
"loss": 3.0226, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 9.52, |
|
"grad_norm": 0.7682950496673584, |
|
"learning_rate": 5.737619103634867e-05, |
|
"loss": 3.0225, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 9.57, |
|
"grad_norm": 0.8099873065948486, |
|
"learning_rate": 5.708240207034466e-05, |
|
"loss": 3.024, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 9.62, |
|
"grad_norm": 0.7960034012794495, |
|
"learning_rate": 5.6788319021291615e-05, |
|
"loss": 3.024, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 9.68, |
|
"grad_norm": 0.8109017014503479, |
|
"learning_rate": 5.649453005528762e-05, |
|
"loss": 3.0264, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 9.73, |
|
"grad_norm": 0.7674356698989868, |
|
"learning_rate": 5.6200447006234556e-05, |
|
"loss": 3.0301, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 9.78, |
|
"grad_norm": 0.789279580116272, |
|
"learning_rate": 5.5906658040230565e-05, |
|
"loss": 3.0311, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 9.84, |
|
"grad_norm": 0.7678402066230774, |
|
"learning_rate": 5.561257499117751e-05, |
|
"loss": 3.0311, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 9.89, |
|
"grad_norm": 0.7561872601509094, |
|
"learning_rate": 5.5318786025173505e-05, |
|
"loss": 3.0295, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 9.95, |
|
"grad_norm": 0.8036589026451111, |
|
"learning_rate": 5.502470297612046e-05, |
|
"loss": 3.0335, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.7822810411453247, |
|
"learning_rate": 5.47306199270674e-05, |
|
"loss": 3.0294, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.40386782023038625, |
|
"eval_loss": 3.4003984928131104, |
|
"eval_runtime": 154.1461, |
|
"eval_samples_per_second": 375.754, |
|
"eval_steps_per_second": 5.878, |
|
"step": 186020 |
|
}, |
|
{ |
|
"epoch": 10.05, |
|
"grad_norm": 0.7965357303619385, |
|
"learning_rate": 5.443683096106341e-05, |
|
"loss": 2.9764, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 10.11, |
|
"grad_norm": 0.7956034541130066, |
|
"learning_rate": 5.414274791201035e-05, |
|
"loss": 2.9774, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 10.16, |
|
"grad_norm": 0.7924751043319702, |
|
"learning_rate": 5.38486648629573e-05, |
|
"loss": 2.9838, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 10.21, |
|
"grad_norm": 0.8389952182769775, |
|
"learning_rate": 5.355458181390425e-05, |
|
"loss": 2.9866, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 10.27, |
|
"grad_norm": 0.8086053133010864, |
|
"learning_rate": 5.3260792847900256e-05, |
|
"loss": 2.9918, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 10.32, |
|
"grad_norm": 0.7812045216560364, |
|
"learning_rate": 5.2966709798847195e-05, |
|
"loss": 2.9899, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 10.38, |
|
"grad_norm": 0.8031854629516602, |
|
"learning_rate": 5.26729208328432e-05, |
|
"loss": 2.9933, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 10.43, |
|
"grad_norm": 0.8311526775360107, |
|
"learning_rate": 5.237913186683919e-05, |
|
"loss": 2.9941, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 10.48, |
|
"grad_norm": 0.8191061019897461, |
|
"learning_rate": 5.2085048817786145e-05, |
|
"loss": 2.9984, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 10.54, |
|
"grad_norm": 0.8086720705032349, |
|
"learning_rate": 5.179096576873309e-05, |
|
"loss": 2.9989, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 10.59, |
|
"grad_norm": 0.8062707781791687, |
|
"learning_rate": 5.14971768027291e-05, |
|
"loss": 2.9972, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 10.64, |
|
"grad_norm": 0.7883285284042358, |
|
"learning_rate": 5.1203093753676044e-05, |
|
"loss": 3.0017, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 10.7, |
|
"grad_norm": 0.8313503265380859, |
|
"learning_rate": 5.090901070462298e-05, |
|
"loss": 2.999, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 10.75, |
|
"grad_norm": 0.8046980500221252, |
|
"learning_rate": 5.061522173861899e-05, |
|
"loss": 3.0024, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 10.81, |
|
"grad_norm": 0.804289698600769, |
|
"learning_rate": 5.032113868956594e-05, |
|
"loss": 3.0023, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 10.86, |
|
"grad_norm": 0.7925947308540344, |
|
"learning_rate": 5.002705564051289e-05, |
|
"loss": 3.005, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 10.91, |
|
"grad_norm": 0.7959842681884766, |
|
"learning_rate": 4.9733266674508884e-05, |
|
"loss": 3.0082, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 10.97, |
|
"grad_norm": 0.7851746082305908, |
|
"learning_rate": 4.9439477708504887e-05, |
|
"loss": 3.0081, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.4041393560543386, |
|
"eval_loss": 3.404827833175659, |
|
"eval_runtime": 153.9153, |
|
"eval_samples_per_second": 376.317, |
|
"eval_steps_per_second": 5.886, |
|
"step": 204622 |
|
}, |
|
{ |
|
"epoch": 11.02, |
|
"grad_norm": 0.7984915375709534, |
|
"learning_rate": 4.914539465945183e-05, |
|
"loss": 2.9866, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 11.07, |
|
"grad_norm": 0.832140326499939, |
|
"learning_rate": 4.8851311610398784e-05, |
|
"loss": 2.9503, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 11.13, |
|
"grad_norm": 0.8821794390678406, |
|
"learning_rate": 4.855722856134572e-05, |
|
"loss": 2.9585, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 11.18, |
|
"grad_norm": 0.861076295375824, |
|
"learning_rate": 4.8263439595341724e-05, |
|
"loss": 2.963, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 11.24, |
|
"grad_norm": 0.8196221590042114, |
|
"learning_rate": 4.7969356546288676e-05, |
|
"loss": 2.9638, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 11.29, |
|
"grad_norm": 0.8027163147926331, |
|
"learning_rate": 4.767556758028467e-05, |
|
"loss": 2.9643, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 11.34, |
|
"grad_norm": 0.8131231069564819, |
|
"learning_rate": 4.7381484531231624e-05, |
|
"loss": 2.9705, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 11.4, |
|
"grad_norm": 0.8047467470169067, |
|
"learning_rate": 4.708740148217857e-05, |
|
"loss": 2.9758, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 11.45, |
|
"grad_norm": 0.8449523448944092, |
|
"learning_rate": 4.6793612516174565e-05, |
|
"loss": 2.9711, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 11.5, |
|
"grad_norm": 0.8473635315895081, |
|
"learning_rate": 4.6499529467121517e-05, |
|
"loss": 2.9766, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 11.56, |
|
"grad_norm": 0.8421896696090698, |
|
"learning_rate": 4.620574050111752e-05, |
|
"loss": 2.9753, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 11.61, |
|
"grad_norm": 0.8112220764160156, |
|
"learning_rate": 4.5911657452064464e-05, |
|
"loss": 2.9797, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 11.67, |
|
"grad_norm": 0.8139015436172485, |
|
"learning_rate": 4.5617868486060466e-05, |
|
"loss": 2.9775, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 11.72, |
|
"grad_norm": 0.8164324164390564, |
|
"learning_rate": 4.532378543700742e-05, |
|
"loss": 2.9746, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 11.77, |
|
"grad_norm": 0.8094378113746643, |
|
"learning_rate": 4.502970238795436e-05, |
|
"loss": 2.9792, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 11.83, |
|
"grad_norm": 0.8222824931144714, |
|
"learning_rate": 4.473561933890131e-05, |
|
"loss": 2.981, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 11.88, |
|
"grad_norm": 0.8311740756034851, |
|
"learning_rate": 4.4441536289848254e-05, |
|
"loss": 2.9817, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 11.93, |
|
"grad_norm": 0.809837818145752, |
|
"learning_rate": 4.41474532407952e-05, |
|
"loss": 2.9842, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 11.99, |
|
"grad_norm": 0.80256187915802, |
|
"learning_rate": 4.385395835784026e-05, |
|
"loss": 2.9849, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.40463795548169496, |
|
"eval_loss": 3.406789779663086, |
|
"eval_runtime": 154.0508, |
|
"eval_samples_per_second": 375.986, |
|
"eval_steps_per_second": 5.881, |
|
"step": 223224 |
|
}, |
|
{ |
|
"epoch": 12.04, |
|
"grad_norm": 0.8648255467414856, |
|
"learning_rate": 4.3559875308787204e-05, |
|
"loss": 2.9414, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 12.1, |
|
"grad_norm": 0.8371706008911133, |
|
"learning_rate": 4.326579225973415e-05, |
|
"loss": 2.9348, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 12.15, |
|
"grad_norm": 0.8452836871147156, |
|
"learning_rate": 4.297200329373015e-05, |
|
"loss": 2.9398, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 12.2, |
|
"grad_norm": 0.8612444996833801, |
|
"learning_rate": 4.26779202446771e-05, |
|
"loss": 2.9431, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 12.26, |
|
"grad_norm": 0.8397490978240967, |
|
"learning_rate": 4.238383719562405e-05, |
|
"loss": 2.9411, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 12.31, |
|
"grad_norm": 0.8516615629196167, |
|
"learning_rate": 4.209004822962005e-05, |
|
"loss": 2.9496, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 12.36, |
|
"grad_norm": 0.866407573223114, |
|
"learning_rate": 4.179625926361605e-05, |
|
"loss": 2.9463, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 12.42, |
|
"grad_norm": 0.8084841370582581, |
|
"learning_rate": 4.150217621456299e-05, |
|
"loss": 2.949, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 12.47, |
|
"grad_norm": 0.8507031202316284, |
|
"learning_rate": 4.1208387248558994e-05, |
|
"loss": 2.9496, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 12.53, |
|
"grad_norm": 0.82049560546875, |
|
"learning_rate": 4.0914304199505946e-05, |
|
"loss": 2.9493, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 12.58, |
|
"grad_norm": 0.843075156211853, |
|
"learning_rate": 4.062022115045289e-05, |
|
"loss": 2.9587, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 12.63, |
|
"grad_norm": 0.8294638991355896, |
|
"learning_rate": 4.0326138101399836e-05, |
|
"loss": 2.9545, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 12.69, |
|
"grad_norm": 0.8557348251342773, |
|
"learning_rate": 4.003205505234678e-05, |
|
"loss": 2.9546, |
|
"step": 236000 |
|
}, |
|
{ |
|
"epoch": 12.74, |
|
"grad_norm": 0.8104209303855896, |
|
"learning_rate": 3.9738266086342784e-05, |
|
"loss": 2.9598, |
|
"step": 237000 |
|
}, |
|
{ |
|
"epoch": 12.79, |
|
"grad_norm": 0.8661037683486938, |
|
"learning_rate": 3.9444183037289735e-05, |
|
"loss": 2.9611, |
|
"step": 238000 |
|
}, |
|
{ |
|
"epoch": 12.85, |
|
"grad_norm": 0.8609908223152161, |
|
"learning_rate": 3.915009998823668e-05, |
|
"loss": 2.9567, |
|
"step": 239000 |
|
}, |
|
{ |
|
"epoch": 12.9, |
|
"grad_norm": 0.837121307849884, |
|
"learning_rate": 3.8856016939183626e-05, |
|
"loss": 2.9652, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 12.96, |
|
"grad_norm": 0.8167340755462646, |
|
"learning_rate": 3.856222797317963e-05, |
|
"loss": 2.9618, |
|
"step": 241000 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.4047847406809369, |
|
"eval_loss": 3.412700653076172, |
|
"eval_runtime": 153.7925, |
|
"eval_samples_per_second": 376.618, |
|
"eval_steps_per_second": 5.891, |
|
"step": 241826 |
|
}, |
|
{ |
|
"epoch": 13.01, |
|
"grad_norm": 0.8533641695976257, |
|
"learning_rate": 3.826814492412657e-05, |
|
"loss": 2.9526, |
|
"step": 242000 |
|
}, |
|
{ |
|
"epoch": 13.06, |
|
"grad_norm": 0.8695389032363892, |
|
"learning_rate": 3.7974061875073525e-05, |
|
"loss": 2.9135, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 13.12, |
|
"grad_norm": 0.8457789421081543, |
|
"learning_rate": 3.767997882602047e-05, |
|
"loss": 2.9143, |
|
"step": 244000 |
|
}, |
|
{ |
|
"epoch": 13.17, |
|
"grad_norm": 0.8333380818367004, |
|
"learning_rate": 3.7386189860016466e-05, |
|
"loss": 2.9226, |
|
"step": 245000 |
|
}, |
|
{ |
|
"epoch": 13.22, |
|
"grad_norm": 0.8502881526947021, |
|
"learning_rate": 3.7092694977061526e-05, |
|
"loss": 2.9231, |
|
"step": 246000 |
|
}, |
|
{ |
|
"epoch": 13.28, |
|
"grad_norm": 0.8744940757751465, |
|
"learning_rate": 3.679861192800847e-05, |
|
"loss": 2.926, |
|
"step": 247000 |
|
}, |
|
{ |
|
"epoch": 13.33, |
|
"grad_norm": 0.867820143699646, |
|
"learning_rate": 3.6504528878955416e-05, |
|
"loss": 2.9293, |
|
"step": 248000 |
|
}, |
|
{ |
|
"epoch": 13.39, |
|
"grad_norm": 0.8131315112113953, |
|
"learning_rate": 3.621044582990237e-05, |
|
"loss": 2.9304, |
|
"step": 249000 |
|
}, |
|
{ |
|
"epoch": 13.44, |
|
"grad_norm": 0.8412694931030273, |
|
"learning_rate": 3.591636278084931e-05, |
|
"loss": 2.9291, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 13.49, |
|
"grad_norm": 0.8476005792617798, |
|
"learning_rate": 3.562257381484531e-05, |
|
"loss": 2.9297, |
|
"step": 251000 |
|
}, |
|
{ |
|
"epoch": 13.55, |
|
"grad_norm": 0.8458747863769531, |
|
"learning_rate": 3.532878484884131e-05, |
|
"loss": 2.9349, |
|
"step": 252000 |
|
}, |
|
{ |
|
"epoch": 13.6, |
|
"grad_norm": 0.8815217614173889, |
|
"learning_rate": 3.503470179978826e-05, |
|
"loss": 2.9321, |
|
"step": 253000 |
|
}, |
|
{ |
|
"epoch": 13.65, |
|
"grad_norm": 0.8400765061378479, |
|
"learning_rate": 3.474061875073521e-05, |
|
"loss": 2.9336, |
|
"step": 254000 |
|
}, |
|
{ |
|
"epoch": 13.71, |
|
"grad_norm": 0.8505930304527283, |
|
"learning_rate": 3.444653570168216e-05, |
|
"loss": 2.9395, |
|
"step": 255000 |
|
}, |
|
{ |
|
"epoch": 13.76, |
|
"grad_norm": 0.8682374358177185, |
|
"learning_rate": 3.415274673567816e-05, |
|
"loss": 2.9371, |
|
"step": 256000 |
|
}, |
|
{ |
|
"epoch": 13.82, |
|
"grad_norm": 0.8451393246650696, |
|
"learning_rate": 3.38586636866251e-05, |
|
"loss": 2.9445, |
|
"step": 257000 |
|
}, |
|
{ |
|
"epoch": 13.87, |
|
"grad_norm": 0.8744685053825378, |
|
"learning_rate": 3.35648747206211e-05, |
|
"loss": 2.938, |
|
"step": 258000 |
|
}, |
|
{ |
|
"epoch": 13.92, |
|
"grad_norm": 0.8651999831199646, |
|
"learning_rate": 3.3270791671568055e-05, |
|
"loss": 2.9409, |
|
"step": 259000 |
|
}, |
|
{ |
|
"epoch": 13.98, |
|
"grad_norm": 0.8568054437637329, |
|
"learning_rate": 3.2976708622515e-05, |
|
"loss": 2.9398, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.40543314834367516, |
|
"eval_loss": 3.4078917503356934, |
|
"eval_runtime": 153.8367, |
|
"eval_samples_per_second": 376.51, |
|
"eval_steps_per_second": 5.889, |
|
"step": 260428 |
|
}, |
|
{ |
|
"epoch": 14.03, |
|
"grad_norm": 0.8709078431129456, |
|
"learning_rate": 3.2682919656511e-05, |
|
"loss": 2.914, |
|
"step": 261000 |
|
}, |
|
{ |
|
"epoch": 14.08, |
|
"grad_norm": 0.874599039554596, |
|
"learning_rate": 3.238883660745795e-05, |
|
"loss": 2.8959, |
|
"step": 262000 |
|
}, |
|
{ |
|
"epoch": 14.14, |
|
"grad_norm": 0.8708846569061279, |
|
"learning_rate": 3.209475355840489e-05, |
|
"loss": 2.8994, |
|
"step": 263000 |
|
}, |
|
{ |
|
"epoch": 14.19, |
|
"grad_norm": 0.9044806361198425, |
|
"learning_rate": 3.1800964592400895e-05, |
|
"loss": 2.9021, |
|
"step": 264000 |
|
}, |
|
{ |
|
"epoch": 14.25, |
|
"grad_norm": 0.8368488550186157, |
|
"learning_rate": 3.150688154334784e-05, |
|
"loss": 2.9034, |
|
"step": 265000 |
|
}, |
|
{ |
|
"epoch": 14.3, |
|
"grad_norm": 0.8695858120918274, |
|
"learning_rate": 3.121279849429479e-05, |
|
"loss": 2.9071, |
|
"step": 266000 |
|
}, |
|
{ |
|
"epoch": 14.35, |
|
"grad_norm": 0.8245272636413574, |
|
"learning_rate": 3.0919009528290795e-05, |
|
"loss": 2.9094, |
|
"step": 267000 |
|
}, |
|
{ |
|
"epoch": 14.41, |
|
"grad_norm": 0.8766151666641235, |
|
"learning_rate": 3.062492647923774e-05, |
|
"loss": 2.9142, |
|
"step": 268000 |
|
}, |
|
{ |
|
"epoch": 14.46, |
|
"grad_norm": 0.8801271319389343, |
|
"learning_rate": 3.033113751323374e-05, |
|
"loss": 2.9104, |
|
"step": 269000 |
|
}, |
|
{ |
|
"epoch": 14.51, |
|
"grad_norm": 0.8736371994018555, |
|
"learning_rate": 3.0037054464180687e-05, |
|
"loss": 2.9144, |
|
"step": 270000 |
|
}, |
|
{ |
|
"epoch": 14.57, |
|
"grad_norm": 0.8868771195411682, |
|
"learning_rate": 2.9743265498176686e-05, |
|
"loss": 2.9199, |
|
"step": 271000 |
|
}, |
|
{ |
|
"epoch": 14.62, |
|
"grad_norm": 0.8758333325386047, |
|
"learning_rate": 2.9449182449123635e-05, |
|
"loss": 2.9148, |
|
"step": 272000 |
|
}, |
|
{ |
|
"epoch": 14.68, |
|
"grad_norm": 0.8853691220283508, |
|
"learning_rate": 2.9155393483119637e-05, |
|
"loss": 2.9151, |
|
"step": 273000 |
|
}, |
|
{ |
|
"epoch": 14.73, |
|
"grad_norm": 0.8897860646247864, |
|
"learning_rate": 2.886131043406658e-05, |
|
"loss": 2.9161, |
|
"step": 274000 |
|
}, |
|
{ |
|
"epoch": 14.78, |
|
"grad_norm": 0.8651039600372314, |
|
"learning_rate": 2.856752146806258e-05, |
|
"loss": 2.9228, |
|
"step": 275000 |
|
}, |
|
{ |
|
"epoch": 14.84, |
|
"grad_norm": 0.8585176467895508, |
|
"learning_rate": 2.827343841900953e-05, |
|
"loss": 2.9196, |
|
"step": 276000 |
|
}, |
|
{ |
|
"epoch": 14.89, |
|
"grad_norm": 0.8638288974761963, |
|
"learning_rate": 2.797935536995648e-05, |
|
"loss": 2.9218, |
|
"step": 277000 |
|
}, |
|
{ |
|
"epoch": 14.94, |
|
"grad_norm": 0.8799272775650024, |
|
"learning_rate": 2.7685566403952477e-05, |
|
"loss": 2.9239, |
|
"step": 278000 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"grad_norm": 0.8739799857139587, |
|
"learning_rate": 2.7391483354899426e-05, |
|
"loss": 2.9226, |
|
"step": 279000 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.406453792523118, |
|
"eval_loss": 3.3963141441345215, |
|
"eval_runtime": 153.9816, |
|
"eval_samples_per_second": 376.155, |
|
"eval_steps_per_second": 5.884, |
|
"step": 279030 |
|
}, |
|
{ |
|
"epoch": 15.05, |
|
"grad_norm": 0.8708630204200745, |
|
"learning_rate": 2.709740030584637e-05, |
|
"loss": 2.8824, |
|
"step": 280000 |
|
}, |
|
{ |
|
"epoch": 15.11, |
|
"grad_norm": 0.9101833701133728, |
|
"learning_rate": 2.680331725679332e-05, |
|
"loss": 2.8831, |
|
"step": 281000 |
|
}, |
|
{ |
|
"epoch": 15.16, |
|
"grad_norm": 0.9030382633209229, |
|
"learning_rate": 2.6509528290789322e-05, |
|
"loss": 2.8836, |
|
"step": 282000 |
|
}, |
|
{ |
|
"epoch": 15.21, |
|
"grad_norm": 0.8790870308876038, |
|
"learning_rate": 2.621544524173627e-05, |
|
"loss": 2.8874, |
|
"step": 283000 |
|
}, |
|
{ |
|
"epoch": 15.27, |
|
"grad_norm": 0.8721250295639038, |
|
"learning_rate": 2.592165627573227e-05, |
|
"loss": 2.8902, |
|
"step": 284000 |
|
}, |
|
{ |
|
"epoch": 15.32, |
|
"grad_norm": 0.9108452200889587, |
|
"learning_rate": 2.5627573226679215e-05, |
|
"loss": 2.892, |
|
"step": 285000 |
|
}, |
|
{ |
|
"epoch": 15.37, |
|
"grad_norm": 0.9115946888923645, |
|
"learning_rate": 2.5333490177626163e-05, |
|
"loss": 2.8947, |
|
"step": 286000 |
|
}, |
|
{ |
|
"epoch": 15.43, |
|
"grad_norm": 0.9250348806381226, |
|
"learning_rate": 2.5039701211622162e-05, |
|
"loss": 2.8961, |
|
"step": 287000 |
|
}, |
|
{ |
|
"epoch": 15.48, |
|
"grad_norm": 0.8640110492706299, |
|
"learning_rate": 2.474561816256911e-05, |
|
"loss": 2.894, |
|
"step": 288000 |
|
}, |
|
{ |
|
"epoch": 15.54, |
|
"grad_norm": 0.8953117728233337, |
|
"learning_rate": 2.445153511351606e-05, |
|
"loss": 2.8987, |
|
"step": 289000 |
|
}, |
|
{ |
|
"epoch": 15.59, |
|
"grad_norm": 0.9380891919136047, |
|
"learning_rate": 2.4157452064463004e-05, |
|
"loss": 2.8939, |
|
"step": 290000 |
|
}, |
|
{ |
|
"epoch": 15.64, |
|
"grad_norm": 0.8643067479133606, |
|
"learning_rate": 2.386395718150806e-05, |
|
"loss": 2.8975, |
|
"step": 291000 |
|
}, |
|
{ |
|
"epoch": 15.7, |
|
"grad_norm": 0.8749162554740906, |
|
"learning_rate": 2.3569874132455006e-05, |
|
"loss": 2.9003, |
|
"step": 292000 |
|
}, |
|
{ |
|
"epoch": 15.75, |
|
"grad_norm": 0.8664742112159729, |
|
"learning_rate": 2.3275791083401954e-05, |
|
"loss": 2.9002, |
|
"step": 293000 |
|
}, |
|
{ |
|
"epoch": 15.8, |
|
"grad_norm": 0.9035205245018005, |
|
"learning_rate": 2.29817080343489e-05, |
|
"loss": 2.9042, |
|
"step": 294000 |
|
}, |
|
{ |
|
"epoch": 15.86, |
|
"grad_norm": 0.8949082493782043, |
|
"learning_rate": 2.2687624985295848e-05, |
|
"loss": 2.9004, |
|
"step": 295000 |
|
}, |
|
{ |
|
"epoch": 15.91, |
|
"grad_norm": 0.8694750070571899, |
|
"learning_rate": 2.23941301023409e-05, |
|
"loss": 2.9055, |
|
"step": 296000 |
|
}, |
|
{ |
|
"epoch": 15.97, |
|
"grad_norm": 0.898671567440033, |
|
"learning_rate": 2.210004705328785e-05, |
|
"loss": 2.9009, |
|
"step": 297000 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.40682327471800617, |
|
"eval_loss": 3.4036316871643066, |
|
"eval_runtime": 153.8703, |
|
"eval_samples_per_second": 376.428, |
|
"eval_steps_per_second": 5.888, |
|
"step": 297632 |
|
}, |
|
{ |
|
"epoch": 16.02, |
|
"grad_norm": 0.8959853053092957, |
|
"learning_rate": 2.1805964004234798e-05, |
|
"loss": 2.8874, |
|
"step": 298000 |
|
}, |
|
{ |
|
"epoch": 16.07, |
|
"grad_norm": 0.9309787154197693, |
|
"learning_rate": 2.1511880955181746e-05, |
|
"loss": 2.8624, |
|
"step": 299000 |
|
}, |
|
{ |
|
"epoch": 16.13, |
|
"grad_norm": 0.8814511299133301, |
|
"learning_rate": 2.1218091989177742e-05, |
|
"loss": 2.8708, |
|
"step": 300000 |
|
}, |
|
{ |
|
"epoch": 16.18, |
|
"grad_norm": 0.9105135202407837, |
|
"learning_rate": 2.0924008940124694e-05, |
|
"loss": 2.871, |
|
"step": 301000 |
|
}, |
|
{ |
|
"epoch": 16.23, |
|
"grad_norm": 0.902424156665802, |
|
"learning_rate": 2.062992589107164e-05, |
|
"loss": 2.8719, |
|
"step": 302000 |
|
}, |
|
{ |
|
"epoch": 16.29, |
|
"grad_norm": 0.9054093956947327, |
|
"learning_rate": 2.0335842842018588e-05, |
|
"loss": 2.8707, |
|
"step": 303000 |
|
}, |
|
{ |
|
"epoch": 16.34, |
|
"grad_norm": 0.9166373610496521, |
|
"learning_rate": 2.0042053876014587e-05, |
|
"loss": 2.8774, |
|
"step": 304000 |
|
}, |
|
{ |
|
"epoch": 16.4, |
|
"grad_norm": 0.9339081645011902, |
|
"learning_rate": 1.9747970826961535e-05, |
|
"loss": 2.879, |
|
"step": 305000 |
|
}, |
|
{ |
|
"epoch": 16.45, |
|
"grad_norm": 0.9023671746253967, |
|
"learning_rate": 1.9454181860957534e-05, |
|
"loss": 2.8791, |
|
"step": 306000 |
|
}, |
|
{ |
|
"epoch": 16.5, |
|
"grad_norm": 0.9114070534706116, |
|
"learning_rate": 1.9160098811904483e-05, |
|
"loss": 2.8824, |
|
"step": 307000 |
|
}, |
|
{ |
|
"epoch": 16.56, |
|
"grad_norm": 0.8930648565292358, |
|
"learning_rate": 1.886601576285143e-05, |
|
"loss": 2.8833, |
|
"step": 308000 |
|
}, |
|
{ |
|
"epoch": 16.61, |
|
"grad_norm": 0.9150155186653137, |
|
"learning_rate": 1.857222679684743e-05, |
|
"loss": 2.8808, |
|
"step": 309000 |
|
}, |
|
{ |
|
"epoch": 16.66, |
|
"grad_norm": 0.9177570343017578, |
|
"learning_rate": 1.8278143747794375e-05, |
|
"loss": 2.885, |
|
"step": 310000 |
|
}, |
|
{ |
|
"epoch": 16.72, |
|
"grad_norm": 0.9326618313789368, |
|
"learning_rate": 1.7984060698741327e-05, |
|
"loss": 2.8825, |
|
"step": 311000 |
|
}, |
|
{ |
|
"epoch": 16.77, |
|
"grad_norm": 0.9235589504241943, |
|
"learning_rate": 1.7690271732737326e-05, |
|
"loss": 2.8832, |
|
"step": 312000 |
|
}, |
|
{ |
|
"epoch": 16.83, |
|
"grad_norm": 0.9126114249229431, |
|
"learning_rate": 1.739618868368427e-05, |
|
"loss": 2.8817, |
|
"step": 313000 |
|
}, |
|
{ |
|
"epoch": 16.88, |
|
"grad_norm": 0.8965770602226257, |
|
"learning_rate": 1.7102399717680274e-05, |
|
"loss": 2.8865, |
|
"step": 314000 |
|
}, |
|
{ |
|
"epoch": 16.93, |
|
"grad_norm": 0.9391665458679199, |
|
"learning_rate": 1.6808316668627222e-05, |
|
"loss": 2.8835, |
|
"step": 315000 |
|
}, |
|
{ |
|
"epoch": 16.99, |
|
"grad_norm": 0.9226199984550476, |
|
"learning_rate": 1.651452770262322e-05, |
|
"loss": 2.8845, |
|
"step": 316000 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.4067043014512522, |
|
"eval_loss": 3.409031867980957, |
|
"eval_runtime": 153.8139, |
|
"eval_samples_per_second": 376.565, |
|
"eval_steps_per_second": 5.89, |
|
"step": 316234 |
|
}, |
|
{ |
|
"epoch": 17.04, |
|
"grad_norm": 0.9422922730445862, |
|
"learning_rate": 1.622044465357017e-05, |
|
"loss": 2.8622, |
|
"step": 317000 |
|
}, |
|
{ |
|
"epoch": 17.09, |
|
"grad_norm": 0.8947985172271729, |
|
"learning_rate": 1.592665568756617e-05, |
|
"loss": 2.8557, |
|
"step": 318000 |
|
}, |
|
{ |
|
"epoch": 17.15, |
|
"grad_norm": 0.9117223620414734, |
|
"learning_rate": 1.5632572638513114e-05, |
|
"loss": 2.8571, |
|
"step": 319000 |
|
}, |
|
{ |
|
"epoch": 17.2, |
|
"grad_norm": 0.9183168411254883, |
|
"learning_rate": 1.5338783672509116e-05, |
|
"loss": 2.8595, |
|
"step": 320000 |
|
}, |
|
{ |
|
"epoch": 17.26, |
|
"grad_norm": 0.9229071736335754, |
|
"learning_rate": 1.5044700623456065e-05, |
|
"loss": 2.863, |
|
"step": 321000 |
|
}, |
|
{ |
|
"epoch": 17.31, |
|
"grad_norm": 0.9431742429733276, |
|
"learning_rate": 1.4750617574403012e-05, |
|
"loss": 2.8624, |
|
"step": 322000 |
|
}, |
|
{ |
|
"epoch": 17.36, |
|
"grad_norm": 0.9331133365631104, |
|
"learning_rate": 1.4456828608399012e-05, |
|
"loss": 2.861, |
|
"step": 323000 |
|
}, |
|
{ |
|
"epoch": 17.42, |
|
"grad_norm": 0.9034436941146851, |
|
"learning_rate": 1.4162745559345961e-05, |
|
"loss": 2.8647, |
|
"step": 324000 |
|
}, |
|
{ |
|
"epoch": 17.47, |
|
"grad_norm": 0.9203888773918152, |
|
"learning_rate": 1.3868662510292906e-05, |
|
"loss": 2.8648, |
|
"step": 325000 |
|
}, |
|
{ |
|
"epoch": 17.52, |
|
"grad_norm": 0.9167985320091248, |
|
"learning_rate": 1.3574873544288907e-05, |
|
"loss": 2.8638, |
|
"step": 326000 |
|
}, |
|
{ |
|
"epoch": 17.58, |
|
"grad_norm": 0.9315596222877502, |
|
"learning_rate": 1.3280790495235857e-05, |
|
"loss": 2.8658, |
|
"step": 327000 |
|
}, |
|
{ |
|
"epoch": 17.63, |
|
"grad_norm": 0.883280336856842, |
|
"learning_rate": 1.2987001529231858e-05, |
|
"loss": 2.8622, |
|
"step": 328000 |
|
}, |
|
{ |
|
"epoch": 17.69, |
|
"grad_norm": 0.9321721792221069, |
|
"learning_rate": 1.2692918480178803e-05, |
|
"loss": 2.86, |
|
"step": 329000 |
|
}, |
|
{ |
|
"epoch": 17.74, |
|
"grad_norm": 0.9126923680305481, |
|
"learning_rate": 1.2398835431125751e-05, |
|
"loss": 2.8686, |
|
"step": 330000 |
|
}, |
|
{ |
|
"epoch": 17.79, |
|
"grad_norm": 0.8995952010154724, |
|
"learning_rate": 1.2105046465121752e-05, |
|
"loss": 2.8676, |
|
"step": 331000 |
|
}, |
|
{ |
|
"epoch": 17.85, |
|
"grad_norm": 0.920877993106842, |
|
"learning_rate": 1.1810963416068699e-05, |
|
"loss": 2.8663, |
|
"step": 332000 |
|
}, |
|
{ |
|
"epoch": 17.9, |
|
"grad_norm": 0.9144251942634583, |
|
"learning_rate": 1.1516880367015646e-05, |
|
"loss": 2.8701, |
|
"step": 333000 |
|
}, |
|
{ |
|
"epoch": 17.96, |
|
"grad_norm": 0.9452138543128967, |
|
"learning_rate": 1.1222797317962593e-05, |
|
"loss": 2.8685, |
|
"step": 334000 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.4071263172963953, |
|
"eval_loss": 3.405372381210327, |
|
"eval_runtime": 155.4934, |
|
"eval_samples_per_second": 372.498, |
|
"eval_steps_per_second": 5.827, |
|
"step": 334836 |
|
}, |
|
{ |
|
"epoch": 18.01, |
|
"grad_norm": 0.9502813220024109, |
|
"learning_rate": 1.0929008351958593e-05, |
|
"loss": 2.8643, |
|
"step": 335000 |
|
}, |
|
{ |
|
"epoch": 18.06, |
|
"grad_norm": 0.9388921856880188, |
|
"learning_rate": 1.063492530290554e-05, |
|
"loss": 2.8412, |
|
"step": 336000 |
|
}, |
|
{ |
|
"epoch": 18.12, |
|
"grad_norm": 0.9569390416145325, |
|
"learning_rate": 1.0341136336901541e-05, |
|
"loss": 2.8438, |
|
"step": 337000 |
|
}, |
|
{ |
|
"epoch": 18.17, |
|
"grad_norm": 0.9436145424842834, |
|
"learning_rate": 1.004705328784849e-05, |
|
"loss": 2.8426, |
|
"step": 338000 |
|
}, |
|
{ |
|
"epoch": 18.22, |
|
"grad_norm": 0.9303746223449707, |
|
"learning_rate": 9.752970238795436e-06, |
|
"loss": 2.8428, |
|
"step": 339000 |
|
}, |
|
{ |
|
"epoch": 18.28, |
|
"grad_norm": 0.914296567440033, |
|
"learning_rate": 9.459181272791437e-06, |
|
"loss": 2.8489, |
|
"step": 340000 |
|
}, |
|
{ |
|
"epoch": 18.33, |
|
"grad_norm": 0.9756318926811218, |
|
"learning_rate": 9.165098223738384e-06, |
|
"loss": 2.847, |
|
"step": 341000 |
|
}, |
|
{ |
|
"epoch": 18.39, |
|
"grad_norm": 0.9441861510276794, |
|
"learning_rate": 8.871015174685332e-06, |
|
"loss": 2.8477, |
|
"step": 342000 |
|
}, |
|
{ |
|
"epoch": 18.44, |
|
"grad_norm": 0.9245781302452087, |
|
"learning_rate": 8.577226208681333e-06, |
|
"loss": 2.8496, |
|
"step": 343000 |
|
}, |
|
{ |
|
"epoch": 18.49, |
|
"grad_norm": 0.9517726302146912, |
|
"learning_rate": 8.283143159628278e-06, |
|
"loss": 2.8483, |
|
"step": 344000 |
|
}, |
|
{ |
|
"epoch": 18.55, |
|
"grad_norm": 0.9460880160331726, |
|
"learning_rate": 7.989060110575227e-06, |
|
"loss": 2.8541, |
|
"step": 345000 |
|
}, |
|
{ |
|
"epoch": 18.6, |
|
"grad_norm": 0.9118568301200867, |
|
"learning_rate": 7.694977061522173e-06, |
|
"loss": 2.8503, |
|
"step": 346000 |
|
}, |
|
{ |
|
"epoch": 18.65, |
|
"grad_norm": 0.918589174747467, |
|
"learning_rate": 7.401188095518174e-06, |
|
"loss": 2.8526, |
|
"step": 347000 |
|
}, |
|
{ |
|
"epoch": 18.71, |
|
"grad_norm": 0.9284429550170898, |
|
"learning_rate": 7.107399129514175e-06, |
|
"loss": 2.8541, |
|
"step": 348000 |
|
}, |
|
{ |
|
"epoch": 18.76, |
|
"grad_norm": 0.954234778881073, |
|
"learning_rate": 6.8133160804611226e-06, |
|
"loss": 2.8519, |
|
"step": 349000 |
|
}, |
|
{ |
|
"epoch": 18.82, |
|
"grad_norm": 0.9434486627578735, |
|
"learning_rate": 6.519527114457123e-06, |
|
"loss": 2.8508, |
|
"step": 350000 |
|
}, |
|
{ |
|
"epoch": 18.87, |
|
"grad_norm": 0.9406701922416687, |
|
"learning_rate": 6.22544406540407e-06, |
|
"loss": 2.8559, |
|
"step": 351000 |
|
}, |
|
{ |
|
"epoch": 18.92, |
|
"grad_norm": 0.9509878158569336, |
|
"learning_rate": 5.931361016351018e-06, |
|
"loss": 2.8522, |
|
"step": 352000 |
|
}, |
|
{ |
|
"epoch": 18.98, |
|
"grad_norm": 0.9960227012634277, |
|
"learning_rate": 5.6372779672979655e-06, |
|
"loss": 2.8513, |
|
"step": 353000 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.4068732555821874, |
|
"eval_loss": 3.4187262058258057, |
|
"eval_runtime": 154.0999, |
|
"eval_samples_per_second": 375.867, |
|
"eval_steps_per_second": 5.879, |
|
"step": 353438 |
|
}, |
|
{ |
|
"epoch": 19.03, |
|
"grad_norm": 0.9322279691696167, |
|
"learning_rate": 5.343194918244912e-06, |
|
"loss": 2.8414, |
|
"step": 354000 |
|
}, |
|
{ |
|
"epoch": 19.08, |
|
"grad_norm": 0.9578444361686707, |
|
"learning_rate": 5.049405952240913e-06, |
|
"loss": 2.8361, |
|
"step": 355000 |
|
}, |
|
{ |
|
"epoch": 19.14, |
|
"grad_norm": 0.9322285652160645, |
|
"learning_rate": 4.75532290318786e-06, |
|
"loss": 2.8365, |
|
"step": 356000 |
|
}, |
|
{ |
|
"epoch": 19.19, |
|
"grad_norm": 0.9732059836387634, |
|
"learning_rate": 4.4612398541348075e-06, |
|
"loss": 2.8372, |
|
"step": 357000 |
|
}, |
|
{ |
|
"epoch": 19.25, |
|
"grad_norm": 0.9745797514915466, |
|
"learning_rate": 4.167156805081755e-06, |
|
"loss": 2.8352, |
|
"step": 358000 |
|
}, |
|
{ |
|
"epoch": 19.3, |
|
"grad_norm": 0.9456592202186584, |
|
"learning_rate": 3.873367839077756e-06, |
|
"loss": 2.8367, |
|
"step": 359000 |
|
}, |
|
{ |
|
"epoch": 19.35, |
|
"grad_norm": 0.9315751791000366, |
|
"learning_rate": 3.5792847900247036e-06, |
|
"loss": 2.8368, |
|
"step": 360000 |
|
}, |
|
{ |
|
"epoch": 19.41, |
|
"grad_norm": 0.8824998140335083, |
|
"learning_rate": 3.2857899070697564e-06, |
|
"loss": 2.8355, |
|
"step": 361000 |
|
}, |
|
{ |
|
"epoch": 19.46, |
|
"grad_norm": 0.985062837600708, |
|
"learning_rate": 2.9917068580167037e-06, |
|
"loss": 2.8372, |
|
"step": 362000 |
|
}, |
|
{ |
|
"epoch": 19.51, |
|
"grad_norm": 0.9548268914222717, |
|
"learning_rate": 2.6976238089636514e-06, |
|
"loss": 2.839, |
|
"step": 363000 |
|
}, |
|
{ |
|
"epoch": 19.57, |
|
"grad_norm": 0.9308351278305054, |
|
"learning_rate": 2.403540759910599e-06, |
|
"loss": 2.8368, |
|
"step": 364000 |
|
}, |
|
{ |
|
"epoch": 19.62, |
|
"grad_norm": 0.9744262099266052, |
|
"learning_rate": 2.1094577108575463e-06, |
|
"loss": 2.8369, |
|
"step": 365000 |
|
}, |
|
{ |
|
"epoch": 19.68, |
|
"grad_norm": 0.962074875831604, |
|
"learning_rate": 1.8153746618044938e-06, |
|
"loss": 2.8354, |
|
"step": 366000 |
|
}, |
|
{ |
|
"epoch": 19.73, |
|
"grad_norm": 0.9616215229034424, |
|
"learning_rate": 1.521585695800494e-06, |
|
"loss": 2.8404, |
|
"step": 367000 |
|
}, |
|
{ |
|
"epoch": 19.78, |
|
"grad_norm": 0.9153415560722351, |
|
"learning_rate": 1.2277967297964945e-06, |
|
"loss": 2.8384, |
|
"step": 368000 |
|
}, |
|
{ |
|
"epoch": 19.84, |
|
"grad_norm": 0.9574062824249268, |
|
"learning_rate": 9.337136807434419e-07, |
|
"loss": 2.8356, |
|
"step": 369000 |
|
}, |
|
{ |
|
"epoch": 19.89, |
|
"grad_norm": 0.930395781993866, |
|
"learning_rate": 6.396306316903894e-07, |
|
"loss": 2.8368, |
|
"step": 370000 |
|
}, |
|
{ |
|
"epoch": 19.94, |
|
"grad_norm": 0.9612599015235901, |
|
"learning_rate": 3.4584166568638985e-07, |
|
"loss": 2.8348, |
|
"step": 371000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 0.9463599920272827, |
|
"learning_rate": 5.175861663333726e-08, |
|
"loss": 2.8368, |
|
"step": 372000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.40681131693060796, |
|
"eval_loss": 3.4240403175354004, |
|
"eval_runtime": 155.2617, |
|
"eval_samples_per_second": 373.054, |
|
"eval_steps_per_second": 5.835, |
|
"step": 372040 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"step": 372040, |
|
"total_flos": 1.56748665397248e+18, |
|
"train_loss": 3.1496440902626293, |
|
"train_runtime": 81299.3395, |
|
"train_samples_per_second": 146.434, |
|
"train_steps_per_second": 4.576 |
|
} |
|
], |
|
"logging_steps": 1000, |
|
"max_steps": 372040, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 5000, |
|
"total_flos": 1.56748665397248e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|