|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9798818457608176,
  "eval_steps": 500,
  "global_step": 24800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.0, "grad_norm": 3.703125, "learning_rate": 1.9999968642467102e-05, "loss": 4.2386, "step": 20 },
    { "epoch": 0.0, "grad_norm": 1.8515625, "learning_rate": 1.999987438156715e-05, "loss": 3.1965, "step": 40 },
    { "epoch": 0.0, "grad_norm": 1.4921875, "learning_rate": 1.9999717217822316e-05, "loss": 2.7844, "step": 60 },
    { "epoch": 0.01, "grad_norm": 1.25, "learning_rate": 1.999949715222121e-05, "loss": 2.6013, "step": 80 },
    { "epoch": 0.01, "grad_norm": 2.28125, "learning_rate": 1.9999214186148133e-05, "loss": 2.5417, "step": 100 },
    { "epoch": 0.01, "grad_norm": 1.1953125, "learning_rate": 1.9998868321383038e-05, "loss": 2.4376, "step": 120 },
    { "epoch": 0.01, "grad_norm": 1.25, "learning_rate": 1.9998459560101546e-05, "loss": 2.3875, "step": 140 },
    { "epoch": 0.01, "grad_norm": 1.3359375, "learning_rate": 1.9997987904874905e-05, "loss": 2.3568, "step": 160 },
    { "epoch": 0.01, "grad_norm": 1.359375, "learning_rate": 1.9997453358670004e-05, "loss": 2.3034, "step": 180 },
    { "epoch": 0.02, "grad_norm": 1.9296875, "learning_rate": 1.9996855924849337e-05, "loss": 2.2779, "step": 200 },
    { "epoch": 0.02, "grad_norm": 1.6015625, "learning_rate": 1.999619560717097e-05, "loss": 2.2728, "step": 220 },
    { "epoch": 0.02, "grad_norm": 1.4921875, "learning_rate": 1.9995472409788548e-05, "loss": 2.2436, "step": 240 },
    { "epoch": 0.02, "grad_norm": 1.625, "learning_rate": 1.999468633725125e-05, "loss": 2.2062, "step": 260 },
    { "epoch": 0.02, "grad_norm": 1.6875, "learning_rate": 1.9993837394503745e-05, "loss": 2.1873, "step": 280 },
    { "epoch": 0.02, "grad_norm": 2.328125, "learning_rate": 1.99929255868862e-05, "loss": 2.1973, "step": 300 },
    { "epoch": 0.03, "grad_norm": 1.5546875, "learning_rate": 1.999195092013422e-05, "loss": 2.1891, "step": 320 },
    { "epoch": 0.03, "grad_norm": 2.8125, "learning_rate": 1.99909134003788e-05, "loss": 2.1813, "step": 340 },
    { "epoch": 0.03, "grad_norm": 2.1875, "learning_rate": 1.998981303414633e-05, "loss": 2.1609, "step": 360 },
    { "epoch": 0.03, "grad_norm": 2.234375, "learning_rate": 1.9988649828358504e-05, "loss": 2.1693, "step": 380 },
    { "epoch": 0.03, "grad_norm": 1.8359375, "learning_rate": 1.9987423790332315e-05, "loss": 2.1465, "step": 400 },
    { "epoch": 0.03, "grad_norm": 1.71875, "learning_rate": 1.9986134927779986e-05, "loss": 2.1387, "step": 420 },
    { "epoch": 0.04, "grad_norm": 1.734375, "learning_rate": 1.998478324880893e-05, "loss": 2.1236, "step": 440 },
    { "epoch": 0.04, "grad_norm": 1.7265625, "learning_rate": 1.9983368761921703e-05, "loss": 2.1144, "step": 460 },
    { "epoch": 0.04, "grad_norm": 1.640625, "learning_rate": 1.9981891476015936e-05, "loss": 2.1164, "step": 480 },
    { "epoch": 0.04, "grad_norm": 1.65625, "learning_rate": 1.99803514003843e-05, "loss": 2.1083, "step": 500 },
    { "epoch": 0.04, "grad_norm": 1.8515625, "learning_rate": 1.9978748544714427e-05, "loss": 2.0906, "step": 520 },
    { "epoch": 0.04, "grad_norm": 1.890625, "learning_rate": 1.997708291908886e-05, "loss": 2.1043, "step": 540 },
    { "epoch": 0.04, "grad_norm": 2.0, "learning_rate": 1.9975354533984995e-05, "loss": 2.1028, "step": 560 },
    { "epoch": 0.05, "grad_norm": 1.8984375, "learning_rate": 1.9973563400274994e-05, "loss": 2.082, "step": 580 },
    { "epoch": 0.05, "grad_norm": 2.1875, "learning_rate": 1.9971709529225754e-05, "loss": 2.0806, "step": 600 },
    { "epoch": 0.05, "grad_norm": 2.375, "learning_rate": 1.9969792932498783e-05, "loss": 2.0803, "step": 620 },
    { "epoch": 0.05, "grad_norm": 1.84375, "learning_rate": 1.9967813622150177e-05, "loss": 2.0731, "step": 640 },
    { "epoch": 0.05, "grad_norm": 2.015625, "learning_rate": 1.996577161063052e-05, "loss": 2.0662, "step": 660 },
    { "epoch": 0.05, "grad_norm": 1.859375, "learning_rate": 1.99636669107848e-05, "loss": 2.0472, "step": 680 },
    { "epoch": 0.06, "grad_norm": 1.9609375, "learning_rate": 1.996149953585235e-05, "loss": 2.0562, "step": 700 },
    { "epoch": 0.06, "grad_norm": 1.84375, "learning_rate": 1.9959269499466746e-05, "loss": 2.0587, "step": 720 },
    { "epoch": 0.06, "grad_norm": 1.890625, "learning_rate": 1.9956976815655723e-05, "loss": 2.0576, "step": 740 },
    { "epoch": 0.06, "grad_norm": 1.9609375, "learning_rate": 1.99546214988411e-05, "loss": 2.0508, "step": 760 },
    { "epoch": 0.06, "grad_norm": 1.953125, "learning_rate": 1.9952203563838676e-05, "loss": 2.034, "step": 780 },
    { "epoch": 0.06, "grad_norm": 1.8046875, "learning_rate": 1.9949723025858136e-05, "loss": 2.0259, "step": 800 },
    { "epoch": 0.07, "grad_norm": 2.078125, "learning_rate": 1.994717990050297e-05, "loss": 2.0439, "step": 820 },
    { "epoch": 0.07, "grad_norm": 2.875, "learning_rate": 1.9944574203770365e-05, "loss": 2.0371, "step": 840 },
    { "epoch": 0.07, "grad_norm": 2.515625, "learning_rate": 1.994190595205109e-05, "loss": 2.0375, "step": 860 },
    { "epoch": 0.07, "grad_norm": 2.0625, "learning_rate": 1.9939175162129427e-05, "loss": 2.0227, "step": 880 },
    { "epoch": 0.07, "grad_norm": 2.078125, "learning_rate": 1.9936381851183032e-05, "loss": 2.0182, "step": 900 },
    { "epoch": 0.07, "grad_norm": 1.8203125, "learning_rate": 1.9933526036782852e-05, "loss": 2.0208, "step": 920 },
    { "epoch": 0.08, "grad_norm": 1.9921875, "learning_rate": 1.993060773689299e-05, "loss": 2.0177, "step": 940 },
    { "epoch": 0.08, "grad_norm": 1.8125, "learning_rate": 1.992762696987062e-05, "loss": 2.0208, "step": 960 },
    { "epoch": 0.08, "grad_norm": 2.0, "learning_rate": 1.9924583754465842e-05, "loss": 1.9938, "step": 980 },
    { "epoch": 0.08, "grad_norm": 2.109375, "learning_rate": 1.9921478109821598e-05, "loss": 2.0132, "step": 1000 },
    { "epoch": 0.08, "grad_norm": 1.8046875, "learning_rate": 1.9918310055473515e-05, "loss": 2.0062, "step": 1020 },
    { "epoch": 0.08, "grad_norm": 1.8828125, "learning_rate": 1.991507961134981e-05, "loss": 2.0074, "step": 1040 },
    { "epoch": 0.08, "grad_norm": 2.046875, "learning_rate": 1.9911786797771144e-05, "loss": 2.0153, "step": 1060 },
    { "epoch": 0.09, "grad_norm": 2.078125, "learning_rate": 1.990843163545052e-05, "loss": 1.996, "step": 1080 },
    { "epoch": 0.09, "grad_norm": 2.21875, "learning_rate": 1.990501414549312e-05, "loss": 2.0067, "step": 1100 },
    { "epoch": 0.09, "grad_norm": 1.8359375, "learning_rate": 1.9901534349396204e-05, "loss": 1.9922, "step": 1120 },
    { "epoch": 0.09, "grad_norm": 2.25, "learning_rate": 1.9897992269048953e-05, "loss": 1.9953, "step": 1140 },
    { "epoch": 0.09, "grad_norm": 2.109375, "learning_rate": 1.9894387926732342e-05, "loss": 1.9968, "step": 1160 },
    { "epoch": 0.09, "grad_norm": 2.390625, "learning_rate": 1.9890721345118987e-05, "loss": 1.9851, "step": 1180 },
    { "epoch": 0.1, "grad_norm": 2.171875, "learning_rate": 1.988699254727303e-05, "loss": 1.9749, "step": 1200 },
    { "epoch": 0.1, "grad_norm": 2.25, "learning_rate": 1.988320155664996e-05, "loss": 2.003, "step": 1220 },
    { "epoch": 0.1, "grad_norm": 2.140625, "learning_rate": 1.9879348397096482e-05, "loss": 1.9779, "step": 1240 },
    { "epoch": 0.1, "grad_norm": 2.09375, "learning_rate": 1.9875433092850376e-05, "loss": 1.9633, "step": 1260 },
    { "epoch": 0.1, "grad_norm": 1.9921875, "learning_rate": 1.9871455668540325e-05, "loss": 1.9824, "step": 1280 },
    { "epoch": 0.1, "grad_norm": 2.3125, "learning_rate": 1.9867416149185774e-05, "loss": 1.9785, "step": 1300 },
    { "epoch": 0.11, "grad_norm": 2.265625, "learning_rate": 1.9863314560196775e-05, "loss": 1.9923, "step": 1320 },
    { "epoch": 0.11, "grad_norm": 2.140625, "learning_rate": 1.9859150927373803e-05, "loss": 1.9839, "step": 1340 },
    { "epoch": 0.11, "grad_norm": 2.4375, "learning_rate": 1.9854925276907627e-05, "loss": 1.985, "step": 1360 },
    { "epoch": 0.11, "grad_norm": 1.953125, "learning_rate": 1.985063763537913e-05, "loss": 1.974, "step": 1380 },
    { "epoch": 0.11, "grad_norm": 1.9453125, "learning_rate": 1.9846288029759124e-05, "loss": 1.9801, "step": 1400 },
    { "epoch": 0.11, "grad_norm": 2.03125, "learning_rate": 1.984187648740822e-05, "loss": 1.9733, "step": 1420 },
    { "epoch": 0.11, "grad_norm": 2.046875, "learning_rate": 1.983740303607662e-05, "loss": 1.9653, "step": 1440 },
    { "epoch": 0.12, "grad_norm": 2.4375, "learning_rate": 1.9832867703903953e-05, "loss": 1.9672, "step": 1460 },
    { "epoch": 0.12, "grad_norm": 2.15625, "learning_rate": 1.9828270519419115e-05, "loss": 1.9625, "step": 1480 },
    { "epoch": 0.12, "grad_norm": 2.0625, "learning_rate": 1.9823611511540064e-05, "loss": 1.9542, "step": 1500 },
    { "epoch": 0.12, "grad_norm": 2.328125, "learning_rate": 1.9818890709573652e-05, "loss": 1.9475, "step": 1520 },
    { "epoch": 0.12, "grad_norm": 2.234375, "learning_rate": 1.9814108143215446e-05, "loss": 1.9642, "step": 1540 },
    { "epoch": 0.12, "grad_norm": 2.015625, "learning_rate": 1.9809263842549516e-05, "loss": 1.9541, "step": 1560 },
    { "epoch": 0.13, "grad_norm": 1.9921875, "learning_rate": 1.980435783804828e-05, "loss": 1.956, "step": 1580 },
    { "epoch": 0.13, "grad_norm": 1.96875, "learning_rate": 1.9799390160572295e-05, "loss": 1.9812, "step": 1600 },
    { "epoch": 0.13, "grad_norm": 1.953125, "learning_rate": 1.979436084137005e-05, "loss": 1.9617, "step": 1620 },
    { "epoch": 0.13, "grad_norm": 2.125, "learning_rate": 1.9789269912077792e-05, "loss": 1.9534, "step": 1640 },
    { "epoch": 0.13, "grad_norm": 2.140625, "learning_rate": 1.9784117404719324e-05, "loss": 1.9519, "step": 1660 },
    { "epoch": 0.13, "grad_norm": 2.046875, "learning_rate": 1.977890335170578e-05, "loss": 1.9448, "step": 1680 },
    { "epoch": 0.14, "grad_norm": 2.21875, "learning_rate": 1.9773627785835454e-05, "loss": 1.9361, "step": 1700 },
    { "epoch": 0.14, "grad_norm": 2.265625, "learning_rate": 1.9768290740293573e-05, "loss": 1.9485, "step": 1720 },
    { "epoch": 0.14, "grad_norm": 2.046875, "learning_rate": 1.9762892248652093e-05, "loss": 1.9356, "step": 1740 },
    { "epoch": 0.14, "grad_norm": 2.453125, "learning_rate": 1.975743234486949e-05, "loss": 1.9484, "step": 1760 },
    { "epoch": 0.14, "grad_norm": 2.28125, "learning_rate": 1.9751911063290542e-05, "loss": 1.9358, "step": 1780 },
    { "epoch": 0.14, "grad_norm": 2.203125, "learning_rate": 1.974632843864612e-05, "loss": 1.9453, "step": 1800 },
    { "epoch": 0.15, "grad_norm": 2.359375, "learning_rate": 1.9740684506052958e-05, "loss": 1.9217, "step": 1820 },
    { "epoch": 0.15, "grad_norm": 2.171875, "learning_rate": 1.9734979301013445e-05, "loss": 1.9243, "step": 1840 },
    { "epoch": 0.15, "grad_norm": 2.09375, "learning_rate": 1.9729212859415397e-05, "loss": 1.9421, "step": 1860 },
    { "epoch": 0.15, "grad_norm": 2.625, "learning_rate": 1.9723385217531824e-05, "loss": 1.9311, "step": 1880 },
    { "epoch": 0.15, "grad_norm": 1.9609375, "learning_rate": 1.9717496412020717e-05, "loss": 1.9402, "step": 1900 },
    { "epoch": 0.15, "grad_norm": 2.140625, "learning_rate": 1.9711546479924797e-05, "loss": 1.9433, "step": 1920 },
    { "epoch": 0.15, "grad_norm": 2.203125, "learning_rate": 1.9705535458671304e-05, "loss": 1.9181, "step": 1940 },
    { "epoch": 0.16, "grad_norm": 2.375, "learning_rate": 1.9699463386071748e-05, "loss": 1.929, "step": 1960 },
    { "epoch": 0.16, "grad_norm": 2.25, "learning_rate": 1.9693330300321666e-05, "loss": 1.941, "step": 1980 },
    { "epoch": 0.16, "grad_norm": 2.3125, "learning_rate": 1.96871362400004e-05, "loss": 1.9172, "step": 2000 },
    { "epoch": 0.16, "grad_norm": 2.375, "learning_rate": 1.9680881244070848e-05, "loss": 1.9103, "step": 2020 },
    { "epoch": 0.16, "grad_norm": 1.9921875, "learning_rate": 1.96745653518792e-05, "loss": 1.9323, "step": 2040 },
    { "epoch": 0.16, "grad_norm": 2.015625, "learning_rate": 1.9668188603154716e-05, "loss": 1.9333, "step": 2060 },
    { "epoch": 0.17, "grad_norm": 2.640625, "learning_rate": 1.9661751038009463e-05, "loss": 1.9243, "step": 2080 },
    { "epoch": 0.17, "grad_norm": 2.03125, "learning_rate": 1.965525269693807e-05, "loss": 1.9386, "step": 2100 },
    { "epoch": 0.17, "grad_norm": 2.8125, "learning_rate": 1.9648693620817455e-05, "loss": 1.9293, "step": 2120 },
    { "epoch": 0.17, "grad_norm": 2.125, "learning_rate": 1.96420738509066e-05, "loss": 1.9175, "step": 2140 },
    { "epoch": 0.17, "grad_norm": 1.90625, "learning_rate": 1.963539342884626e-05, "loss": 1.9176, "step": 2160 },
    { "epoch": 0.17, "grad_norm": 2.15625, "learning_rate": 1.9628652396658725e-05, "loss": 1.9182, "step": 2180 },
    { "epoch": 0.18, "grad_norm": 2.015625, "learning_rate": 1.9621850796747528e-05, "loss": 1.9048, "step": 2200 },
    { "epoch": 0.18, "grad_norm": 2.4375, "learning_rate": 1.9614988671897208e-05, "loss": 1.9209, "step": 2220 },
    { "epoch": 0.18, "grad_norm": 2.1875, "learning_rate": 1.960806606527303e-05, "loss": 1.9064, "step": 2240 },
    { "epoch": 0.18, "grad_norm": 2.515625, "learning_rate": 1.96010830204207e-05, "loss": 1.9192, "step": 2260 },
    { "epoch": 0.18, "grad_norm": 2.203125, "learning_rate": 1.9594039581266107e-05, "loss": 1.9326, "step": 2280 },
    { "epoch": 0.18, "grad_norm": 2.0, "learning_rate": 1.958693579211505e-05, "loss": 1.9194, "step": 2300 },
    { "epoch": 0.19, "grad_norm": 2.0625, "learning_rate": 1.957977169765294e-05, "loss": 1.8903, "step": 2320 },
    { "epoch": 0.19, "grad_norm": 2.703125, "learning_rate": 1.957254734294454e-05, "loss": 1.9135, "step": 2340 },
    { "epoch": 0.19, "grad_norm": 2.125, "learning_rate": 1.956526277343366e-05, "loss": 1.9228, "step": 2360 },
    { "epoch": 0.19, "grad_norm": 2.28125, "learning_rate": 1.95579180349429e-05, "loss": 1.9094, "step": 2380 },
    { "epoch": 0.19, "grad_norm": 2.015625, "learning_rate": 1.955051317367333e-05, "loss": 1.9102, "step": 2400 },
    { "epoch": 0.19, "grad_norm": 2.109375, "learning_rate": 1.9543048236204215e-05, "loss": 1.8987, "step": 2420 },
    { "epoch": 0.19, "grad_norm": 2.328125, "learning_rate": 1.9535523269492733e-05, "loss": 1.9124, "step": 2440 },
    { "epoch": 0.2, "grad_norm": 2.171875, "learning_rate": 1.9527938320873652e-05, "loss": 1.9137, "step": 2460 },
    { "epoch": 0.2, "grad_norm": 2.078125, "learning_rate": 1.9520293438059065e-05, "loss": 1.9078, "step": 2480 },
    { "epoch": 0.2, "grad_norm": 1.9765625, "learning_rate": 1.9512588669138055e-05, "loss": 1.9092, "step": 2500 },
    { "epoch": 0.2, "grad_norm": 2.09375, "learning_rate": 1.9504824062576425e-05, "loss": 1.9114, "step": 2520 },
    { "epoch": 0.2, "grad_norm": 2.046875, "learning_rate": 1.949699966721637e-05, "loss": 1.9121, "step": 2540 },
    { "epoch": 0.2, "grad_norm": 2.171875, "learning_rate": 1.9489115532276182e-05, "loss": 1.9139, "step": 2560 },
    { "epoch": 0.21, "grad_norm": 2.0625, "learning_rate": 1.9481171707349936e-05, "loss": 1.8889, "step": 2580 },
    { "epoch": 0.21, "grad_norm": 2.03125, "learning_rate": 1.9473168242407183e-05, "loss": 1.9233, "step": 2600 },
    { "epoch": 0.21, "grad_norm": 2.203125, "learning_rate": 1.9465105187792617e-05, "loss": 1.8928, "step": 2620 },
    { "epoch": 0.21, "grad_norm": 2.15625, "learning_rate": 1.9456982594225787e-05, "loss": 1.9101, "step": 2640 },
    { "epoch": 0.21, "grad_norm": 2.234375, "learning_rate": 1.9448800512800762e-05, "loss": 1.8862, "step": 2660 },
    { "epoch": 0.21, "grad_norm": 2.203125, "learning_rate": 1.9440558994985805e-05, "loss": 1.8912, "step": 2680 },
    { "epoch": 0.22, "grad_norm": 2.3125, "learning_rate": 1.943225809262306e-05, "loss": 1.8983, "step": 2700 },
    { "epoch": 0.22, "grad_norm": 2.28125, "learning_rate": 1.942389785792822e-05, "loss": 1.9031, "step": 2720 },
    { "epoch": 0.22, "grad_norm": 2.0625, "learning_rate": 1.94154783434902e-05, "loss": 1.9023, "step": 2740 },
    { "epoch": 0.22, "grad_norm": 2.46875, "learning_rate": 1.940699960227081e-05, "loss": 1.8974, "step": 2760 },
    { "epoch": 0.22, "grad_norm": 2.0625, "learning_rate": 1.939846168760441e-05, "loss": 1.9007, "step": 2780 },
    { "epoch": 0.22, "grad_norm": 2.15625, "learning_rate": 1.938986465319759e-05, "loss": 1.8949, "step": 2800 },
    { "epoch": 0.23, "grad_norm": 2.375, "learning_rate": 1.9381208553128813e-05, "loss": 1.8864, "step": 2820 },
    { "epoch": 0.23, "grad_norm": 2.359375, "learning_rate": 1.9372493441848105e-05, "loss": 1.9024, "step": 2840 },
    { "epoch": 0.23, "grad_norm": 2.109375, "learning_rate": 1.9363719374176683e-05, "loss": 1.8891, "step": 2860 },
    { "epoch": 0.23, "grad_norm": 2.28125, "learning_rate": 1.935488640530662e-05, "loss": 1.8849, "step": 2880 },
    { "epoch": 0.23, "grad_norm": 2.234375, "learning_rate": 1.9345994590800498e-05, "loss": 1.8939, "step": 2900 },
    { "epoch": 0.23, "grad_norm": 2.453125, "learning_rate": 1.9337043986591064e-05, "loss": 1.8903, "step": 2920 },
    { "epoch": 0.23, "grad_norm": 2.265625, "learning_rate": 1.9328034648980874e-05, "loss": 1.8731, "step": 2940 },
    { "epoch": 0.24, "grad_norm": 2.203125, "learning_rate": 1.9318966634641936e-05, "loss": 1.8781, "step": 2960 },
    { "epoch": 0.24, "grad_norm": 2.109375, "learning_rate": 1.9309840000615358e-05, "loss": 1.8855, "step": 2980 },
    { "epoch": 0.24, "grad_norm": 2.03125, "learning_rate": 1.930065480431098e-05, "loss": 1.89, "step": 3000 },
    { "epoch": 0.24, "grad_norm": 2.53125, "learning_rate": 1.9291411103507033e-05, "loss": 1.878, "step": 3020 },
    { "epoch": 0.24, "grad_norm": 2.453125, "learning_rate": 1.9282108956349754e-05, "loss": 1.8896, "step": 3040 },
    { "epoch": 0.24, "grad_norm": 2.140625, "learning_rate": 1.9272748421353023e-05, "loss": 1.8763, "step": 3060 },
    { "epoch": 0.25, "grad_norm": 2.015625, "learning_rate": 1.9263329557398012e-05, "loss": 1.8741, "step": 3080 },
    { "epoch": 0.25, "grad_norm": 2.28125, "learning_rate": 1.9253852423732803e-05, "loss": 1.8664, "step": 3100 },
    { "epoch": 0.25, "grad_norm": 2.25, "learning_rate": 1.9244317079972007e-05, "loss": 1.8706, "step": 3120 },
    { "epoch": 0.25, "grad_norm": 2.3125, "learning_rate": 1.92347235860964e-05, "loss": 1.8791, "step": 3140 },
    { "epoch": 0.25, "grad_norm": 2.34375, "learning_rate": 1.9225072002452557e-05, "loss": 1.8834, "step": 3160 },
    { "epoch": 0.25, "grad_norm": 2.359375, "learning_rate": 1.9215362389752434e-05, "loss": 1.8849, "step": 3180 },
    { "epoch": 0.26, "grad_norm": 2.3125, "learning_rate": 1.9205594809073035e-05, "loss": 1.8804, "step": 3200 },
    { "epoch": 0.26, "grad_norm": 2.3125, "learning_rate": 1.9195769321855984e-05, "loss": 1.8717, "step": 3220 },
    { "epoch": 0.26, "grad_norm": 2.1875, "learning_rate": 1.9185885989907173e-05, "loss": 1.8701, "step": 3240 },
    { "epoch": 0.26, "grad_norm": 2.0625, "learning_rate": 1.917594487539635e-05, "loss": 1.8764, "step": 3260 },
    { "epoch": 0.26, "grad_norm": 2.234375, "learning_rate": 1.9165946040856747e-05, "loss": 1.8695, "step": 3280 },
    { "epoch": 0.26, "grad_norm": 2.59375, "learning_rate": 1.9155889549184657e-05, "loss": 1.8747, "step": 3300 },
    { "epoch": 0.27, "grad_norm": 3.3125, "learning_rate": 1.9145775463639073e-05, "loss": 1.858, "step": 3320 },
    { "epoch": 0.27, "grad_norm": 2.4375, "learning_rate": 1.9135603847841266e-05, "loss": 1.8668, "step": 3340 },
    { "epoch": 0.27, "grad_norm": 2.28125, "learning_rate": 1.9125374765774404e-05, "loss": 1.8479, "step": 3360 },
    { "epoch": 0.27, "grad_norm": 2.1875, "learning_rate": 1.911508828178312e-05, "loss": 1.8627, "step": 3380 },
    { "epoch": 0.27, "grad_norm": 2.40625, "learning_rate": 1.9104744460573156e-05, "loss": 1.8924, "step": 3400 },
    { "epoch": 0.27, "grad_norm": 2.15625, "learning_rate": 1.909434336721089e-05, "loss": 1.8739, "step": 3420 },
    { "epoch": 0.27, "grad_norm": 2.1875, "learning_rate": 1.9083885067122985e-05, "loss": 1.8762, "step": 3440 },
    { "epoch": 0.28, "grad_norm": 2.5, "learning_rate": 1.9073369626095958e-05, "loss": 1.8711, "step": 3460 },
    { "epoch": 0.28, "grad_norm": 2.046875, "learning_rate": 1.9062797110275743e-05, "loss": 1.8768, "step": 3480 },
    { "epoch": 0.28, "grad_norm": 2.65625, "learning_rate": 1.9052167586167315e-05, "loss": 1.8683, "step": 3500 },
    { "epoch": 0.28, "grad_norm": 2.234375, "learning_rate": 1.9041481120634248e-05, "loss": 1.8697, "step": 3520 },
    { "epoch": 0.28, "grad_norm": 2.375, "learning_rate": 1.9030737780898284e-05, "loss": 1.863, "step": 3540 },
    { "epoch": 0.28, "grad_norm": 2.25, "learning_rate": 1.9019937634538946e-05, "loss": 1.8664, "step": 3560 },
    { "epoch": 0.29, "grad_norm": 2.203125, "learning_rate": 1.900908074949307e-05, "loss": 1.8684, "step": 3580 },
    { "epoch": 0.29, "grad_norm": 2.1875, "learning_rate": 1.8998167194054425e-05, "loss": 1.8525, "step": 3600 },
    { "epoch": 0.29, "grad_norm": 2.5625, "learning_rate": 1.8987197036873227e-05, "loss": 1.8582, "step": 3620 },
    { "epoch": 0.29, "grad_norm": 2.796875, "learning_rate": 1.897617034695576e-05, "loss": 1.8664, "step": 3640 },
    { "epoch": 0.29, "grad_norm": 2.1875, "learning_rate": 1.8965087193663906e-05, "loss": 1.8692, "step": 3660 },
    { "epoch": 0.29, "grad_norm": 2.21875, "learning_rate": 1.895394764671473e-05, "loss": 1.8534, "step": 3680 },
    { "epoch": 0.3, "grad_norm": 2.359375, "learning_rate": 1.894275177618004e-05, "loss": 1.852, "step": 3700 },
    { "epoch": 0.3, "grad_norm": 2.4375, "learning_rate": 1.893149965248592e-05, "loss": 1.8699, "step": 3720 },
    { "epoch": 0.3, "grad_norm": 2.203125, "learning_rate": 1.8920191346412326e-05, "loss": 1.8649, "step": 3740 },
    { "epoch": 0.3, "grad_norm": 2.15625, "learning_rate": 1.8908826929092607e-05, "loss": 1.857, "step": 3760 },
    { "epoch": 0.3, "grad_norm": 2.28125, "learning_rate": 1.8897406472013084e-05, "loss": 1.8404, "step": 3780 },
    { "epoch": 0.3, "grad_norm": 2.65625, "learning_rate": 1.8885930047012585e-05, "loss": 1.864, "step": 3800 },
    { "epoch": 0.3, "grad_norm": 2.640625, "learning_rate": 1.887439772628199e-05, "loss": 1.8578, "step": 3820 },
    { "epoch": 0.31, "grad_norm": 1.9765625, "learning_rate": 1.886280958236379e-05, "loss": 1.8603, "step": 3840 },
    { "epoch": 0.31, "grad_norm": 2.484375, "learning_rate": 1.8851165688151627e-05, "loss": 1.8603, "step": 3860 },
    { "epoch": 0.31, "grad_norm": 2.65625, "learning_rate": 1.8839466116889823e-05, "loss": 1.8752, "step": 3880 },
    { "epoch": 0.31, "grad_norm": 2.140625, "learning_rate": 1.882771094217293e-05, "loss": 1.8628, "step": 3900 },
    { "epoch": 0.31, "grad_norm": 2.28125, "learning_rate": 1.8815900237945284e-05, "loss": 1.8575, "step": 3920 },
    { "epoch": 0.31, "grad_norm": 2.28125, "learning_rate": 1.8804034078500497e-05, "loss": 1.85, "step": 3940 },
    { "epoch": 0.32, "grad_norm": 2.140625, "learning_rate": 1.8792112538481025e-05, "loss": 1.8687, "step": 3960 },
    { "epoch": 0.32, "grad_norm": 2.21875, "learning_rate": 1.8780135692877693e-05, "loss": 1.8465, "step": 3980 },
    { "epoch": 0.32, "grad_norm": 2.3125, "learning_rate": 1.8768103617029213e-05, "loss": 1.8569, "step": 4000 },
    { "epoch": 0.32, "grad_norm": 2.328125, "learning_rate": 1.8756016386621712e-05, "loss": 1.8401, "step": 4020 },
    { "epoch": 0.32, "grad_norm": 2.21875, "learning_rate": 1.874387407768827e-05, "loss": 1.8356, "step": 4040 },
    { "epoch": 0.32, "grad_norm": 2.21875, "learning_rate": 1.873167676660842e-05, "loss": 1.8605, "step": 4060 },
    { "epoch": 0.33, "grad_norm": 2.125, "learning_rate": 1.8719424530107674e-05, "loss": 1.8598, "step": 4080 },
    { "epoch": 0.33, "grad_norm": 2.890625, "learning_rate": 1.8707117445257067e-05, "loss": 1.8512, "step": 4100 },
    { "epoch": 0.33, "grad_norm": 2.59375, "learning_rate": 1.8694755589472633e-05, "loss": 1.8482, "step": 4120 },
    { "epoch": 0.33, "grad_norm": 2.359375, "learning_rate": 1.8682339040514933e-05, "loss": 1.8479, "step": 4140 },
    { "epoch": 0.33, "grad_norm": 2.140625, "learning_rate": 1.8669867876488578e-05, "loss": 1.8397, "step": 4160 },
    { "epoch": 0.33, "grad_norm": 2.0625, "learning_rate": 1.8657342175841722e-05, "loss": 1.8579, "step": 4180 },
    { "epoch": 0.34, "grad_norm": 2.78125, "learning_rate": 1.8644762017365576e-05, "loss": 1.8508, "step": 4200 },
    { "epoch": 0.34, "grad_norm": 2.34375, "learning_rate": 1.863212748019391e-05, "loss": 1.8335, "step": 4220 },
    { "epoch": 0.34, "grad_norm": 3.03125, "learning_rate": 1.861943864380255e-05, "loss": 1.8415, "step": 4240 },
    { "epoch": 0.34, "grad_norm": 2.375, "learning_rate": 1.86066955880089e-05, "loss": 1.8543, "step": 4260 },
    { "epoch": 0.34, "grad_norm": 2.625, "learning_rate": 1.85938983929714e-05, "loss": 1.861, "step": 4280 },
    { "epoch": 0.34, "grad_norm": 2.4375, "learning_rate": 1.858104713918907e-05, "loss": 1.8387, "step": 4300 },
    { "epoch": 0.34, "grad_norm": 2.21875, "learning_rate": 1.8568141907500964e-05, "loss": 1.8561, "step": 4320 },
    { "epoch": 0.35, "grad_norm": 2.484375, "learning_rate": 1.8555182779085678e-05, "loss": 1.8442, "step": 4340 },
    { "epoch": 0.35, "grad_norm": 2.34375, "learning_rate": 1.8542169835460846e-05, "loss": 1.8582, "step": 4360 },
    { "epoch": 0.35, "grad_norm": 2.265625, "learning_rate": 1.8529103158482605e-05, "loss": 1.8319, "step": 4380 },
    { "epoch": 0.35, "grad_norm": 2.875, "learning_rate": 1.8515982830345115e-05, "loss": 1.8388, "step": 4400 },
    { "epoch": 0.35, "grad_norm": 2.28125, "learning_rate": 1.850280893358e-05, "loss": 1.8552, "step": 4420 },
    { "epoch": 0.35, "grad_norm": 2.296875, "learning_rate": 1.848958155105586e-05, "loss": 1.8317, "step": 4440 },
    { "epoch": 0.36, "grad_norm": 2.203125, "learning_rate": 1.847630076597774e-05, "loss": 1.8413, "step": 4460 },
    { "epoch": 0.36, "grad_norm": 2.5, "learning_rate": 1.846296666188661e-05, "loss": 1.8251, "step": 4480 },
    { "epoch": 0.36, "grad_norm": 2.15625, "learning_rate": 1.8449579322658827e-05, "loss": 1.8445, "step": 4500 },
    { "epoch": 0.36, "grad_norm": 2.40625, "learning_rate": 1.8436138832505623e-05, "loss": 1.8672, "step": 4520 },
    { "epoch": 0.36, "grad_norm": 2.109375, "learning_rate": 1.842264527597257e-05, "loss": 1.8343, "step": 4540 },
    { "epoch": 0.36, "grad_norm": 2.390625, "learning_rate": 1.8409098737939038e-05, "loss": 1.8272, "step": 4560 },
    { "epoch": 0.37, "grad_norm": 2.296875, "learning_rate": 1.8395499303617677e-05, "loss": 1.8448, "step": 4580 },
    { "epoch": 0.37, "grad_norm": 2.421875, "learning_rate": 1.8381847058553872e-05, "loss": 1.835, "step": 4600 },
    { "epoch": 0.37, "grad_norm": 2.171875, "learning_rate": 1.8368142088625213e-05, "loss": 1.8356, "step": 4620 },
    { "epoch": 0.37, "grad_norm": 2.40625, "learning_rate": 1.8354384480040935e-05, "loss": 1.8175, "step": 4640 },
    { "epoch": 0.37, "grad_norm": 2.703125, "learning_rate": 1.83405743193414e-05, "loss": 1.8218, "step": 4660 },
    { "epoch": 0.37, "grad_norm": 2.265625, "learning_rate": 1.8326711693397537e-05, "loss": 1.8409, "step": 4680 },
    { "epoch": 0.38, "grad_norm": 2.25, "learning_rate": 1.831279668941031e-05, "loss": 1.8471, "step": 4700 },
    { "epoch": 0.38, "grad_norm": 2.109375, "learning_rate": 1.8298829394910146e-05, "loss": 1.8708, "step": 4720 },
    { "epoch": 0.38, "grad_norm": 2.28125, "learning_rate": 1.82848098977564e-05, "loss": 1.8397, "step": 4740 },
    { "epoch": 0.38, "grad_norm": 2.25, "learning_rate": 1.8270738286136815e-05, "loss": 1.8166, "step": 4760 },
    { "epoch": 0.38, "grad_norm": 2.1875, "learning_rate": 1.8256614648566937e-05, "loss": 1.8257, "step": 4780 },
    { "epoch": 0.38, "grad_norm": 2.40625, "learning_rate": 1.824243907388958e-05, "loss": 1.8483, "step": 4800 },
    { "epoch": 0.38, "grad_norm": 2.59375, "learning_rate": 1.8228211651274264e-05, "loss": 1.8235, "step": 4820 },
    { "epoch": 0.39, "grad_norm": 2.640625, "learning_rate": 1.8213932470216652e-05, "loss": 1.8561, "step": 4840 },
    { "epoch": 0.39, "grad_norm": 2.828125, "learning_rate": 1.8199601620537977e-05, "loss": 1.8324, "step": 4860 },
    { "epoch": 0.39, "grad_norm": 2.171875, "learning_rate": 1.81852191923845e-05, "loss": 1.8389, "step": 4880 },
    { "epoch": 0.39, "grad_norm": 2.171875, "learning_rate": 1.8170785276226915e-05, "loss": 1.8372, "step": 4900 },
    { "epoch": 0.39, "grad_norm": 2.484375, "learning_rate": 1.8156299962859805e-05, "loss": 1.8367, "step": 4920 },
    { "epoch": 0.39, "grad_norm": 2.21875, "learning_rate": 1.8141763343401057e-05, "loss": 1.8078, "step": 4940 },
    { "epoch": 0.4, "grad_norm": 2.21875, "learning_rate": 1.8127175509291292e-05, "loss": 1.8181, "step": 4960 },
    { "epoch": 0.4, "grad_norm": 2.359375, "learning_rate": 1.8112536552293286e-05, "loss": 1.8273, "step": 4980 },
    { "epoch": 0.4, "grad_norm": 3.140625, "learning_rate": 1.80978465644914e-05, "loss": 1.8302, "step": 5000 },
    { "epoch": 0.4, "grad_norm": 2.125, "learning_rate": 1.8083105638291e-05, "loss": 1.8469, "step": 5020 },
    { "epoch": 0.4, "grad_norm": 2.46875, "learning_rate": 1.8068313866417876e-05, "loss": 1.8235, "step": 5040 },
    { "epoch": 0.4, "grad_norm": 2.328125, "learning_rate": 1.8053471341917636e-05, "loss": 1.8302, "step": 5060 },
    { "epoch": 0.41, "grad_norm": 2.625, "learning_rate": 1.8038578158155163e-05, "loss": 1.8218, "step": 5080 },
    { "epoch": 0.41, "grad_norm": 2.234375, "learning_rate": 1.8023634408814e-05, "loss": 1.8322, "step": 5100 },
    { "epoch": 0.41, "grad_norm": 2.4375, "learning_rate": 1.8008640187895755e-05, "loss": 1.8091, "step": 5120 },
    { "epoch": 0.41, "grad_norm": 2.53125, "learning_rate": 1.7993595589719533e-05, "loss": 1.828, "step": 5140 },
    { "epoch": 0.41, "grad_norm": 2.40625, "learning_rate": 1.797850070892132e-05, "loss": 1.8188, "step": 5160 },
    { "epoch": 0.41, "grad_norm": 2.84375, "learning_rate": 1.7963355640453407e-05, "loss": 1.8106, "step": 5180 },
    { "epoch": 0.42, "grad_norm": 2.15625, "learning_rate": 1.7948160479583783e-05, "loss": 1.8172, "step": 5200 },
    { "epoch": 0.42, "grad_norm": 2.375, "learning_rate": 1.793291532189553e-05, "loss": 1.8324, "step": 5220 },
    { "epoch": 0.42, "grad_norm": 2.359375, "learning_rate": 1.791762026328623e-05, "loss": 1.8202, "step": 5240 },
    { "epoch": 0.42, "grad_norm": 2.171875, "learning_rate": 1.7902275399967363e-05, "loss": 1.8183, "step": 5260 },
    { "epoch": 0.42, "grad_norm": 2.5625, "learning_rate": 1.78868808284637e-05, "loss": 1.8347, "step": 5280 },
    { "epoch": 0.42, "grad_norm": 2.5625, "learning_rate": 1.7871436645612685e-05, "loss": 1.831, "step": 5300 },
    { "epoch": 0.42, "grad_norm": 2.796875, "learning_rate": 1.785594294856385e-05, "loss": 1.8263, "step": 5320 },
    { "epoch": 0.43, "grad_norm": 2.265625, "learning_rate": 1.7840399834778176e-05, "loss": 1.847, "step": 5340 },
    { "epoch": 0.43, "grad_norm": 2.375, "learning_rate": 1.7824807402027504e-05, "loss": 1.8249, "step": 5360 },
    { "epoch": 0.43, "grad_norm": 2.34375, "learning_rate": 1.78091657483939e-05, "loss": 1.8206, "step": 5380 },
    { "epoch": 0.43, "grad_norm": 2.328125, "learning_rate": 1.779347497226905e-05, "loss": 1.8251, "step": 5400 },
    { "epoch": 0.43, "grad_norm": 2.5, "learning_rate": 1.777773517235364e-05, "loss": 1.8226, "step": 5420 },
    { "epoch": 0.43, "grad_norm": 2.1875, "learning_rate": 1.7761946447656736e-05, "loss": 1.8309, "step": 5440 },
    { "epoch": 0.44, "grad_norm": 2.234375, "learning_rate": 1.7746108897495157e-05, "loss": 1.8283, "step": 5460 },
    { "epoch": 0.44, "grad_norm": 2.796875, "learning_rate": 1.7730222621492846e-05, "loss": 1.8275, "step": 5480 },
    { "epoch": 0.44, "grad_norm": 2.390625, "learning_rate": 1.7714287719580254e-05, "loss": 1.8059, "step": 5500 },
    { "epoch": 0.44, "grad_norm": 2.28125, "learning_rate": 1.769830429199371e-05, "loss": 1.8235, "step": 5520 },
    { "epoch": 0.44, "grad_norm": 2.40625, "learning_rate": 1.7682272439274778e-05, "loss": 1.8104, "step": 5540 },
    { "epoch": 0.44, "grad_norm": 2.609375, "learning_rate": 1.766619226226965e-05, "loss": 1.8212, "step": 5560 },
    { "epoch": 0.45, "grad_norm": 2.140625, "learning_rate": 1.765006386212847e-05, "loss": 1.8269, "step": 5580 },
    { "epoch": 0.45, "grad_norm": 2.21875, "learning_rate": 1.763388734030475e-05, "loss": 1.8212, "step": 5600 },
    { "epoch": 0.45, "grad_norm": 2.390625, "learning_rate": 1.7617662798554685e-05, "loss": 1.8447, "step": 5620 },
    { "epoch": 0.45, "grad_norm": 2.53125, "learning_rate": 1.7601390338936547e-05, "loss": 1.8244, "step": 5640 },
    { "epoch": 0.45, "grad_norm": 2.375, "learning_rate": 1.7585070063810014e-05, "loss": 1.8125, "step": 5660 },
    { "epoch": 0.45, "grad_norm": 2.53125, "learning_rate": 1.7568702075835557e-05, "loss": 1.8114, "step": 5680 },
    { "epoch": 0.46, "grad_norm": 2.34375, "learning_rate": 1.7552286477973766e-05, "loss": 1.8136, "step": 5700 },
    { "epoch": 0.46, "grad_norm": 2.359375, "learning_rate": 1.7535823373484716e-05, "loss": 1.8261, "step": 5720 },
    { "epoch": 0.46, "grad_norm": 2.3125, "learning_rate": 1.751931286592732e-05, "loss": 1.8085, "step": 5740 },
    { "epoch": 0.46, "grad_norm": 2.34375, "learning_rate": 1.7502755059158683e-05, "loss": 1.8297, "step": 5760 },
    { "epoch": 0.46, "grad_norm": 2.296875, "learning_rate": 1.7486150057333416e-05, "loss": 1.7937, "step": 5780 },
    { "epoch": 0.46, "grad_norm": 2.15625, "learning_rate": 1.7469497964903018e-05, "loss": 1.8052, "step": 5800 },
    { "epoch": 0.46, "grad_norm": 2.328125, "learning_rate": 1.7452798886615205e-05, "loss": 1.8216, "step": 5820 },
    { "epoch": 0.47, "grad_norm": 2.171875, "learning_rate": 1.7436052927513254e-05, "loss": 1.8322, "step": 5840 },
    { "epoch": 0.47, "grad_norm": 2.484375, "learning_rate": 1.741926019293533e-05, "loss": 1.8182, "step": 5860 },
    { "epoch": 0.47, "grad_norm": 2.578125, "learning_rate": 1.740242078851384e-05, "loss": 1.8262, "step": 5880 },
    { "epoch": 0.47, "grad_norm": 2.734375, "learning_rate": 1.7385534820174757e-05, "loss": 1.7948, "step": 5900 },
    { "epoch": 0.47, "grad_norm": 3.0, "learning_rate": 1.7368602394136964e-05, "loss": 1.8332, "step": 5920 },
    { "epoch": 0.47, "grad_norm": 2.234375, "learning_rate": 1.735162361691157e-05, "loss": 1.8016, "step": 5940 },
    { "epoch": 0.48, "grad_norm": 2.25, "learning_rate": 1.7334598595301257e-05, "loss": 1.8103, "step": 5960 },
    { "epoch": 0.48, "grad_norm": 2.953125, "learning_rate": 1.7317527436399603e-05, "loss": 1.8014, "step": 5980 },
    { "epoch": 0.48, "grad_norm": 2.25, "learning_rate": 1.7300410247590402e-05, "loss": 1.8071, "step": 6000 },
    { "epoch": 0.48, "grad_norm": 2.375, "learning_rate": 1.7283247136546996e-05, "loss": 1.809, "step": 6020 },
    { "epoch": 0.48, "grad_norm": 2.46875, "learning_rate": 1.7266038211231583e-05, "loss": 1.8236, "step": 6040 },
    { "epoch": 0.48, "grad_norm": 2.234375, "learning_rate": 1.724878357989457e-05, "loss": 1.8306, "step": 6060 },
    { "epoch": 0.49, "grad_norm": 2.265625, "learning_rate": 1.7231483351073858e-05, "loss": 1.8165, "step": 6080 },
    { "epoch": 0.49, "grad_norm": 2.1875, "learning_rate": 1.721413763359417e-05, "loss": 1.8162, "step": 6100 },
    { "epoch": 0.49, "grad_norm": 2.25, "learning_rate": 1.7196746536566376e-05, "loss": 1.8346, "step": 6120 },
    { "epoch": 0.49, "grad_norm": 2.265625, "learning_rate": 1.71793101693868e-05, "loss": 1.8082, "step": 6140 },
    { "epoch": 0.49, "grad_norm": 2.15625, "learning_rate": 1.7161828641736527e-05, "loss": 1.8105, "step": 6160 },
    { "epoch": 0.49, "grad_norm": 2.4375, "learning_rate": 1.7144302063580726e-05, "loss": 1.8105, "step": 6180 },
    { "epoch": 0.49, "grad_norm": 2.46875, "learning_rate": 1.712673054516794e-05, "loss": 1.8232, "step": 6200 },
    { "epoch": 0.5, "grad_norm": 2.75, "learning_rate": 1.7109114197029408e-05, "loss": 1.8227, "step": 6220 },
    { "epoch": 0.5, "grad_norm": 2.265625, "learning_rate": 1.7091453129978363e-05, "loss": 1.8181, "step": 6240 },
    { "epoch": 0.5, "grad_norm": 2.59375, "learning_rate": 1.7073747455109336e-05, "loss": 1.8006, "step": 6260 },
    { "epoch": 0.5, "grad_norm": 2.71875, "learning_rate": 1.7055997283797463e-05, "loss": 1.7975, "step": 6280 },
    { "epoch": 0.5, "grad_norm": 2.28125, "learning_rate": 1.7038202727697766e-05, "loss": 1.8105, "step": 6300 },
    { "epoch": 0.5, "grad_norm": 2.59375, "learning_rate": 1.7020363898744477e-05, "loss": 1.7994, "step": 6320 },
    { "epoch": 0.51, "grad_norm": 2.59375, "learning_rate": 1.7002480909150316e-05, "loss": 1.8193, "step": 6340 },
    { "epoch": 0.51, "grad_norm": 2.40625, "learning_rate": 1.6984553871405783e-05, "loss": 1.8347, "step": 6360 },
    { "epoch": 0.51, "grad_norm": 2.234375, "learning_rate": 1.6966582898278466e-05, "loss": 1.8159, "step": 6380 },
    { "epoch": 0.51, "grad_norm": 2.3125, "learning_rate": 1.694856810281232e-05, "loss": 1.8053, "step": 6400 },
    { "epoch": 0.51, "grad_norm": 2.34375, "learning_rate": 1.6930509598326948e-05, "loss": 1.828, "step": 6420 },
    { "epoch": 0.51, "grad_norm": 2.1875, "learning_rate": 1.6912407498416914e-05, "loss": 1.8186, "step": 6440 },
    { "epoch": 0.52, "grad_norm": 2.296875, "learning_rate": 1.689426191695101e-05, "loss": 1.8027, "step": 6460 },
    { "epoch": 0.52, "grad_norm": 2.375, "learning_rate": 1.6876072968071532e-05, "loss": 1.8098, "step": 6480 },
    { "epoch": 0.52, "grad_norm": 2.46875, "learning_rate": 1.6857840766193586e-05, "loss": 1.8129, "step": 6500 },
    { "epoch": 0.52, "grad_norm": 2.328125, "learning_rate": 1.6839565426004346e-05, "loss": 1.8054, "step": 6520 },
    { "epoch": 0.52, "grad_norm": 2.390625, "learning_rate": 1.6821247062462347e-05, "loss": 1.8123, "step": 6540 },
    { "epoch": 0.52, "grad_norm": 2.328125, "learning_rate": 1.6802885790796753e-05, "loss": 1.8074, "step": 6560 },
    { "epoch": 0.53, "grad_norm": 2.65625, "learning_rate": 1.678448172650664e-05, "loss": 1.7996, "step": 6580 },
    { "epoch": 0.53, "grad_norm": 2.609375, "learning_rate": 1.676603498536026e-05, "loss": 1.8098, "step": 6600 },
    { "epoch": 0.53, "grad_norm": 2.265625, "learning_rate": 1.6747545683394322e-05, "loss": 1.8016, "step": 6620 },
    { "epoch": 0.53, "grad_norm": 2.6875, "learning_rate": 1.672901393691325e-05, "loss": 1.8093, "step": 6640 },
    { "epoch": 0.53, "grad_norm": 2.484375, "learning_rate": 1.6710439862488478e-05, "loss": 1.8023, "step": 6660 },
    { "epoch": 0.53, "grad_norm": 2.234375, "learning_rate": 1.6691823576957676e-05, "loss": 1.8075, "step": 6680 },
    { "epoch": 0.53, "grad_norm": 2.6875, "learning_rate": 1.667316519742405e-05, "loss": 1.8052, "step": 6700 },
    { "epoch": 0.54, "grad_norm": 2.34375, "learning_rate": 1.6654464841255586e-05, "loss": 1.8011, "step": 6720 },
    { "epoch": 0.54, "grad_norm": 2.25, "learning_rate": 1.663572262608433e-05, "loss": 1.8075, "step": 6740 },
    { "epoch": 0.54, "grad_norm": 2.296875, "learning_rate": 1.6616938669805622e-05, "loss": 1.7911, "step": 6760 },
    { "epoch": 0.54, "grad_norm": 2.203125, "learning_rate": 1.659811309057738e-05, "loss": 1.8026, "step": 6780 },
    { "epoch": 0.54, "grad_norm": 2.1875, "learning_rate": 1.6579246006819335e-05, "loss": 1.8088, "step": 6800 },
    { "epoch": 0.54, "grad_norm": 2.328125, "learning_rate": 1.6560337537212306e-05, "loss": 1.8155, "step": 6820 },
    { "epoch": 0.55, "grad_norm": 2.21875, "learning_rate": 1.6541387800697438e-05, "loss": 1.7997, "step": 6840 },
    { "epoch": 0.55, "grad_norm": 2.25, "learning_rate": 1.6522396916475468e-05, "loss": 1.8253, "step": 6860 },
    { "epoch": 0.55, "grad_norm": 2.265625, "learning_rate": 1.650336500400595e-05, "loss": 1.8037, "step": 6880 },
    { "epoch": 0.55, "grad_norm": 2.421875, "learning_rate": 1.6484292183006542e-05, "loss": 1.8154, "step": 6900 },
    { "epoch": 0.55, "grad_norm": 2.515625, "learning_rate": 1.6465178573452214e-05, "loss": 1.8169, "step": 6920 },
    { "epoch": 0.55, "grad_norm": 2.359375, "learning_rate": 1.6446024295574522e-05, "loss": 1.8002, "step": 6940 },
    { "epoch": 0.56, "grad_norm": 2.28125, "learning_rate": 1.6426829469860837e-05, "loss": 1.7999, "step": 6960 },
    { "epoch": 0.56, "grad_norm": 2.375, "learning_rate": 1.6407594217053587e-05, "loss": 1.7973, "step": 6980 },
    { "epoch": 0.56, "grad_norm": 2.296875, "learning_rate": 1.638831865814951e-05, "loss": 1.8073, "step": 7000 },
    { "epoch": 0.56, "grad_norm": 2.265625, "learning_rate": 1.6369002914398874e-05, "loss": 1.795, "step": 7020 },
    { "epoch": 0.56, "grad_norm": 2.34375, "learning_rate": 1.6349647107304724e-05, "loss": 1.7985, "step": 7040 },
    { "epoch": 0.56, "grad_norm": 2.125, "learning_rate": 1.633025135862213e-05, "loss": 1.7936, "step": 7060 },
    { "epoch": 0.57, "grad_norm": 2.890625, "learning_rate": 1.6310815790357404e-05, "loss": 1.8036, "step": 7080 },
    { "epoch": 0.57, "grad_norm": 2.4375, "learning_rate": 1.6291340524767327e-05, "loss": 1.8046, "step": 7100 },
    { "epoch": 0.57, "grad_norm": 2.6875, "learning_rate": 1.6271825684358404e-05, "loss": 1.8052, "step": 7120 },
    { "epoch": 0.57, "grad_norm": 2.484375, "learning_rate": 1.625227139188607e-05, "loss": 1.8105, "step": 7140 },
    { "epoch": 0.57, "grad_norm": 2.375, "learning_rate": 1.6232677770353936e-05, "loss": 1.7952, "step": 7160 },
    { "epoch": 0.57, "grad_norm": 2.625, "learning_rate": 1.621304494301301e-05, "loss": 1.8102, "step": 7180 },
    { "epoch": 0.57, "grad_norm": 2.609375, "learning_rate": 1.6193373033360904e-05, "loss": 1.7962, "step": 7200 },
    { "epoch": 0.58, "grad_norm": 2.328125, "learning_rate": 1.6173662165141084e-05, "loss": 1.8078, "step": 7220 },
    { "epoch": 0.58, "grad_norm": 2.5, "learning_rate": 1.6153912462342073e-05, "loss": 1.8051, "step": 7240 },
    { "epoch": 0.58, "grad_norm": 2.15625, "learning_rate": 1.6134124049196688e-05, "loss": 1.8057, "step": 7260 },
    { "epoch": 0.58, "grad_norm": 2.78125, "learning_rate": 1.6114297050181235e-05, "loss": 1.8153, "step": 7280 },
    { "epoch": 0.58, "grad_norm": 2.171875, "learning_rate": 1.6094431590014746e-05, "loss": 1.8047, "step": 7300 },
    { "epoch": 0.58, "grad_norm": 2.359375, "learning_rate": 1.6074527793658186e-05, "loss": 1.8069, "step": 7320 },
    { "epoch": 0.59, "grad_norm": 2.4375, "learning_rate": 1.605458578631367e-05, "loss": 1.7919, "step": 7340 },
    { "epoch": 0.59, "grad_norm": 2.390625, "learning_rate": 1.6034605693423676e-05, "loss": 1.8104, "step": 7360 },
    { "epoch": 0.59, "grad_norm": 2.703125, "learning_rate": 1.6014587640670244e-05, "loss": 1.7971, "step": 7380 },
    { "epoch": 0.59, "grad_norm": 2.703125, "learning_rate": 1.599453175397421e-05, "loss": 1.7987, "step": 7400 },
    { "epoch": 0.59, "grad_norm": 2.46875, "learning_rate": 1.597443815949439e-05, "loss": 1.8057, "step": 7420 },
    { "epoch": 0.59, "grad_norm": 2.359375, "learning_rate": 1.59543069836268e-05, "loss": 1.7817, "step": 7440 },
    { "epoch": 0.6, "grad_norm": 2.515625, "learning_rate": 1.5934138353003845e-05, "loss": 1.8009, "step": 7460 },
    { "epoch": 0.6, "grad_norm": 2.5, "learning_rate": 1.5913932394493548e-05, "loss": 1.7939, "step": 7480 },
    { "epoch": 0.6, "grad_norm": 2.40625, "learning_rate": 1.589368923519874e-05, "loss": 1.8014, "step": 7500 },
    { "epoch": 0.6, "grad_norm": 2.328125, "learning_rate": 1.587340900245624e-05, "loss": 1.7879, "step": 7520 },
    { "epoch": 0.6, "grad_norm": 2.25, "learning_rate": 1.5853091823836087e-05, "loss": 1.8, "step": 7540 },
    { "epoch": 0.6, "grad_norm": 2.140625, "learning_rate": 1.5832737827140727e-05, "loss": 1.7894, "step": 7560 },
    { "epoch": 0.61, "grad_norm": 2.6875, "learning_rate": 1.581234714040419e-05, "loss": 1.7845, "step": 7580 },
    { "epoch": 0.61, "grad_norm": 2.359375, "learning_rate": 1.5791919891891313e-05, "loss": 1.7841, "step": 7600 },
    { "epoch": 0.61, "grad_norm": 2.3125, "learning_rate": 1.5771456210096913e-05, "loss": 1.8057, "step": 7620 },
    { "epoch": 0.61, "grad_norm": 2.84375, "learning_rate": 1.5750956223744985e-05, "loss": 1.7961, "step": 7640 },
    { "epoch": 0.61, "grad_norm": 2.28125, "learning_rate": 1.5730420061787898e-05, "loss": 1.7908, "step": 7660 },
    { "epoch": 0.61, "grad_norm": 2.578125, "learning_rate": 1.5709847853405574e-05, "loss": 1.7888, "step": 7680 },
    { "epoch": 0.61, "grad_norm": 2.328125, "learning_rate": 1.568923972800468e-05, "loss": 1.7742, "step": 7700 },
    { "epoch": 0.62, "grad_norm": 2.484375, "learning_rate": 1.566859581521782e-05, "loss": 1.7902, "step": 7720 },
    { "epoch": 0.62, "grad_norm": 2.46875, "learning_rate": 1.5647916244902707e-05, "loss": 1.7918, "step": 7740 },
    { "epoch": 0.62, "grad_norm": 2.390625, "learning_rate": 1.5627201147141357e-05, "loss": 1.806, "step": 7760 },
    { "epoch": 0.62, "grad_norm": 2.25, "learning_rate": 1.5606450652239263e-05, "loss": 1.7925, "step": 7780 },
    { "epoch": 0.62, "grad_norm": 2.578125, "learning_rate": 1.5585664890724584e-05, "loss": 1.7921, "step": 7800 },
    { "epoch": 0.62, "grad_norm": 2.390625, "learning_rate": 1.5564843993347313e-05, "loss": 1.7901, "step": 7820 },
    { "epoch": 0.63, "grad_norm": 2.390625, "learning_rate": 1.5543988091078467e-05, "loss": 1.7881, "step": 7840 },
    { "epoch": 0.63, "grad_norm": 2.6875, "learning_rate": 1.5523097315109245e-05, "loss": 1.7948, "step": 7860 },
    { "epoch": 0.63, "grad_norm": 2.484375, "learning_rate": 1.5502171796850226e-05, "loss": 1.7958, "step": 7880 },
    { "epoch": 0.63, "grad_norm": 2.40625, "learning_rate": 1.5481211667930528e-05, "loss": 1.7911, "step": 7900 },
    { "epoch": 0.63, "grad_norm": 2.453125, "learning_rate": 1.5460217060196986e-05, "loss": 1.7709, "step": 7920 },
    { "epoch": 0.63, "grad_norm": 2.40625, "learning_rate": 1.54391881057133e-05, "loss": 1.7914, "step": 7940 },
    { "epoch": 0.64, "grad_norm": 2.328125, "learning_rate": 1.541812493675925e-05, "loss": 1.8062, "step": 7960 },
    { "epoch": 0.64, "grad_norm": 2.34375, "learning_rate": 1.539702768582982e-05, "loss": 1.8074, "step": 7980 },
    { "epoch": 0.64, "grad_norm": 2.75, "learning_rate": 1.5375896485634386e-05, "loss": 1.7788, "step": 8000 },
    { "epoch": 0.64, "grad_norm": 2.453125, "learning_rate": 1.5354731469095884e-05, "loss": 1.7814, "step": 8020 },
    { "epoch": 0.64, "grad_norm": 2.421875, "learning_rate": 1.5333532769349955e-05, "loss": 1.7854, "step": 8040 },
    { "epoch": 0.64, "grad_norm": 2.65625, "learning_rate": 1.5312300519744135e-05, "loss": 1.7869, "step": 8060 },
    { "epoch": 0.65, "grad_norm": 2.625, "learning_rate": 1.529103485383699e-05, "loss": 1.7736, "step": 8080 },
    { "epoch": 0.65, "grad_norm": 2.28125, "learning_rate": 1.5269735905397278e-05, "loss": 1.7966, "step": 8100 },
    { "epoch": 0.65, "grad_norm": 2.59375, "learning_rate": 1.524840380840314e-05, "loss": 1.7907, "step": 8120 },
    { "epoch": 0.65, "grad_norm": 2.671875, "learning_rate": 1.5227038697041216e-05, "loss": 1.7767, "step": 8140 },
    { "epoch": 0.65, "grad_norm": 2.234375, "learning_rate": 1.520564070570582e-05, "loss": 1.7963, "step": 8160 },
    { "epoch": 0.65, "grad_norm": 2.421875, "learning_rate": 1.5184209968998098e-05, "loss": 1.7822, "step": 8180 },
    { "epoch": 0.65, "grad_norm": 2.390625, "learning_rate": 1.5162746621725176e-05, "loss": 1.7806, "step": 8200 },
    { "epoch": 0.66, "grad_norm": 2.359375, "learning_rate": 1.5141250798899307e-05, "loss": 1.7836, "step": 8220 },
    { "epoch": 0.66, "grad_norm": 2.640625, "learning_rate": 1.5119722635737035e-05, "loss": 1.7825, "step": 8240 },
    { "epoch": 0.66, "grad_norm": 2.953125, "learning_rate": 1.5098162267658323e-05, "loss": 1.7877, "step": 8260 },
    { "epoch": 0.66, "grad_norm": 2.3125, "learning_rate": 1.5076569830285736e-05, "loss": 1.791, "step": 8280 },
    { "epoch": 0.66, "grad_norm": 2.4375, "learning_rate": 1.5054945459443544e-05, "loss": 1.781, "step": 8300 },
    { "epoch": 0.66, "grad_norm": 2.609375, "learning_rate": 1.5033289291156905e-05, "loss": 1.7873, "step": 8320 },
    { "epoch": 0.67, "grad_norm": 2.296875, "learning_rate": 1.501160146165099e-05, "loss": 1.7963, "step": 8340 },
    { "epoch": 0.67, "grad_norm": 2.515625, "learning_rate": 1.498988210735013e-05, "loss": 1.794, "step": 8360 },
    { "epoch": 0.67, "grad_norm": 2.265625, "learning_rate": 1.4968131364876952e-05, "loss": 1.8001, "step": 8380 },
    { "epoch": 0.67, "grad_norm": 2.46875, "learning_rate": 1.4946349371051541e-05, "loss": 1.7728, "step": 8400 },
    { "epoch": 0.67, "grad_norm": 2.59375, "learning_rate": 1.4924536262890557e-05, "loss": 1.7732, "step": 8420 },
    { "epoch": 0.67, "grad_norm": 2.671875, "learning_rate": 1.4902692177606368e-05, "loss": 1.7822, "step": 8440 },
    { "epoch": 0.68, "grad_norm": 2.609375, "learning_rate": 1.4880817252606226e-05, "loss": 1.7862, "step": 8460 },
    { "epoch": 0.68, "grad_norm": 2.421875, "learning_rate": 1.4858911625491352e-05, "loss": 1.801, "step": 8480 },
    { "epoch": 0.68, "grad_norm": 2.40625, "learning_rate": 1.4836975434056102e-05, "loss": 1.8229, "step": 8500 },
    { "epoch": 0.68, "grad_norm": 2.578125, "learning_rate": 1.48150088162871e-05, "loss": 1.7954, "step": 8520 },
    { "epoch": 0.68, "grad_norm": 2.46875, "learning_rate": 1.4793011910362352e-05, "loss": 1.7996, "step": 8540 },
    { "epoch": 0.68, "grad_norm": 2.40625, "learning_rate": 1.4770984854650397e-05, "loss": 1.8033, "step": 8560 },
    { "epoch": 0.68, "grad_norm": 2.140625, "learning_rate": 1.4748927787709417e-05, "loss": 1.7883, "step": 8580 },
    { "epoch": 0.69, "grad_norm": 2.265625, "learning_rate": 1.4726840848286385e-05, "loss": 1.7939, "step": 8600 },
    { "epoch": 0.69, "grad_norm": 2.421875, "learning_rate": 1.4704724175316181e-05, "loss": 1.7975, "step": 8620 },
    { "epoch": 0.69, "grad_norm": 2.46875, "learning_rate": 1.4682577907920707e-05, "loss": 1.8029,
|
"step": 8640 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.4660402185408046e-05, |
|
"loss": 1.7807, |
|
"step": 8660 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.4638197147271548e-05, |
|
"loss": 1.7953, |
|
"step": 8680 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.4615962933188981e-05, |
|
"loss": 1.7902, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.4593699683021625e-05, |
|
"loss": 1.7849, |
|
"step": 8720 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1.4571407536813422e-05, |
|
"loss": 1.7814, |
|
"step": 8740 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.4549086634790075e-05, |
|
"loss": 1.7932, |
|
"step": 8760 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.4526737117358167e-05, |
|
"loss": 1.789, |
|
"step": 8780 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.4504359125104292e-05, |
|
"loss": 1.7828, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.4481952798794152e-05, |
|
"loss": 1.7876, |
|
"step": 8820 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 1.4459518279371692e-05, |
|
"loss": 1.794, |
|
"step": 8840 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 1.4437055707958184e-05, |
|
"loss": 1.7919, |
|
"step": 8860 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.4414565225851371e-05, |
|
"loss": 1.7846, |
|
"step": 8880 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.4392046974524565e-05, |
|
"loss": 1.7843, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.4369501095625747e-05, |
|
"loss": 1.7726, |
|
"step": 8920 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.4346927730976691e-05, |
|
"loss": 1.7836, |
|
"step": 8940 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 1.4324327022572073e-05, |
|
"loss": 1.776, |
|
"step": 8960 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.4301699112578557e-05, |
|
"loss": 1.7903, |
|
"step": 8980 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.4279044143333926e-05, |
|
"loss": 1.7757, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 1.425636225734617e-05, |
|
"loss": 1.7705, |
|
"step": 9020 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.42336535972926e-05, |
|
"loss": 1.8011, |
|
"step": 9040 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.4210918306018937e-05, |
|
"loss": 1.7795, |
|
"step": 9060 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 1.4188156526538435e-05, |
|
"loss": 1.7965, |
|
"step": 9080 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 1.4165368402030952e-05, |
|
"loss": 1.7631, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.4142554075842083e-05, |
|
"loss": 1.7949, |
|
"step": 9120 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1.4119713691482228e-05, |
|
"loss": 1.785, |
|
"step": 9140 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.4096847392625708e-05, |
|
"loss": 1.777, |
|
"step": 9160 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.4073955323109859e-05, |
|
"loss": 1.779, |
|
"step": 9180 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.4051037626934112e-05, |
|
"loss": 1.7815, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 1.4028094448259113e-05, |
|
"loss": 1.7852, |
|
"step": 9220 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.4005125931405792e-05, |
|
"loss": 1.7999, |
|
"step": 9240 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.3982132220854472e-05, |
|
"loss": 1.791, |
|
"step": 9260 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.3959113461243952e-05, |
|
"loss": 1.7836, |
|
"step": 9280 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.3936069797370591e-05, |
|
"loss": 1.778, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.3913001374187421e-05, |
|
"loss": 1.8065, |
|
"step": 9320 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 1.3889908336803198e-05, |
|
"loss": 1.8035, |
|
"step": 9340 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.3866790830481529e-05, |
|
"loss": 1.7789, |
|
"step": 9360 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.3843649000639933e-05, |
|
"loss": 1.7706, |
|
"step": 9380 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.3820482992848929e-05, |
|
"loss": 1.7685, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.3797292952831127e-05, |
|
"loss": 1.7687, |
|
"step": 9420 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.3774079026460308e-05, |
|
"loss": 1.7768, |
|
"step": 9440 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.3750841359760511e-05, |
|
"loss": 1.7878, |
|
"step": 9460 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.37275800989051e-05, |
|
"loss": 1.792, |
|
"step": 9480 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 2.75, |
|
"learning_rate": 1.3704295390215868e-05, |
|
"loss": 1.7822, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.3680987380162095e-05, |
|
"loss": 1.7831, |
|
"step": 9520 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.3657656215359634e-05, |
|
"loss": 1.7819, |
|
"step": 9540 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.3634302042569995e-05, |
|
"loss": 1.7839, |
|
"step": 9560 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 1.3610925008699413e-05, |
|
"loss": 1.7905, |
|
"step": 9580 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.3587525260797934e-05, |
|
"loss": 1.7785, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 1.3564102946058468e-05, |
|
"loss": 1.7846, |
|
"step": 9620 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.3540658211815898e-05, |
|
"loss": 1.7841, |
|
"step": 9640 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.3517191205546121e-05, |
|
"loss": 1.774, |
|
"step": 9660 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.3493702074865139e-05, |
|
"loss": 1.7947, |
|
"step": 9680 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.3470190967528118e-05, |
|
"loss": 1.7843, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.3446658031428474e-05, |
|
"loss": 1.7796, |
|
"step": 9720 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.3423103414596929e-05, |
|
"loss": 1.7713, |
|
"step": 9740 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 1.3399527265200581e-05, |
|
"loss": 1.7769, |
|
"step": 9760 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.3375929731541986e-05, |
|
"loss": 1.7823, |
|
"step": 9780 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.3352310962058202e-05, |
|
"loss": 1.7642, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.332867110531988e-05, |
|
"loss": 1.7841, |
|
"step": 9820 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.3305010310030311e-05, |
|
"loss": 1.7897, |
|
"step": 9840 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.3281328725024496e-05, |
|
"loss": 1.7813, |
|
"step": 9860 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.3257626499268217e-05, |
|
"loss": 1.7828, |
|
"step": 9880 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.3233903781857084e-05, |
|
"loss": 1.7809, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.3210160722015619e-05, |
|
"loss": 1.7768, |
|
"step": 9920 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.3186397469096295e-05, |
|
"loss": 1.7816, |
|
"step": 9940 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.3162614172578614e-05, |
|
"loss": 1.7741, |
|
"step": 9960 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.3138810982068154e-05, |
|
"loss": 1.7801, |
|
"step": 9980 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.3114988047295638e-05, |
|
"loss": 1.7711, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 1.3091145518115982e-05, |
|
"loss": 1.7807, |
|
"step": 10020 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.3067283544507366e-05, |
|
"loss": 1.7835, |
|
"step": 10040 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.3043402276570276e-05, |
|
"loss": 1.7746, |
|
"step": 10060 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.3019501864526565e-05, |
|
"loss": 1.7742, |
|
"step": 10080 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 2.25, |
|
"learning_rate": 1.2995582458718518e-05, |
|
"loss": 1.7811, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 1.2971644209607893e-05, |
|
"loss": 1.7684, |
|
"step": 10120 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.2947687267774973e-05, |
|
"loss": 1.7778, |
|
"step": 10140 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.2923711783917637e-05, |
|
"loss": 1.7587, |
|
"step": 10160 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.2899717908850385e-05, |
|
"loss": 1.784, |
|
"step": 10180 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.2875705793503424e-05, |
|
"loss": 1.773, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.2851675588921677e-05, |
|
"loss": 1.7721, |
|
"step": 10220 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 2.75, |
|
"learning_rate": 1.2827627446263877e-05, |
|
"loss": 1.7781, |
|
"step": 10240 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 1.2803561516801575e-05, |
|
"loss": 1.7935, |
|
"step": 10260 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.2779477951918217e-05, |
|
"loss": 1.7746, |
|
"step": 10280 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.2755376903108183e-05, |
|
"loss": 1.7783, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.2731258521975829e-05, |
|
"loss": 1.7812, |
|
"step": 10320 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.2707122960234544e-05, |
|
"loss": 1.7742, |
|
"step": 10340 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 1.2682970369705773e-05, |
|
"loss": 1.7585, |
|
"step": 10360 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 1.2658800902318103e-05, |
|
"loss": 1.7848, |
|
"step": 10380 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.2634614710106266e-05, |
|
"loss": 1.7784, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.2610411945210199e-05, |
|
"loss": 1.7762, |
|
"step": 10420 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.2586192759874094e-05, |
|
"loss": 1.7686, |
|
"step": 10440 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 1.2561957306445428e-05, |
|
"loss": 1.7861, |
|
"step": 10460 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.253770573737402e-05, |
|
"loss": 1.7744, |
|
"step": 10480 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.2513438205211048e-05, |
|
"loss": 1.7703, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.2489154862608111e-05, |
|
"loss": 1.7785, |
|
"step": 10520 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.2464855862316263e-05, |
|
"loss": 1.7789, |
|
"step": 10540 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.244054135718505e-05, |
|
"loss": 1.7766, |
|
"step": 10560 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.2416211500161546e-05, |
|
"loss": 1.7805, |
|
"step": 10580 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.2391866444289394e-05, |
|
"loss": 1.7769, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.2367506342707851e-05, |
|
"loss": 1.7727, |
|
"step": 10620 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.2343131348650806e-05, |
|
"loss": 1.7603, |
|
"step": 10640 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.231874161544583e-05, |
|
"loss": 1.7681, |
|
"step": 10660 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.2294337296513219e-05, |
|
"loss": 1.7705, |
|
"step": 10680 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.2269918545365e-05, |
|
"loss": 1.7692, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.2245485515604004e-05, |
|
"loss": 1.7685, |
|
"step": 10720 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.2221038360922863e-05, |
|
"loss": 1.7873, |
|
"step": 10740 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.219657723510307e-05, |
|
"loss": 1.779, |
|
"step": 10760 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.2172102292013994e-05, |
|
"loss": 1.7963, |
|
"step": 10780 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.2147613685611928e-05, |
|
"loss": 1.7737, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 1.212311156993911e-05, |
|
"loss": 1.7578, |
|
"step": 10820 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 2.875, |
|
"learning_rate": 1.2098596099122745e-05, |
|
"loss": 1.7649, |
|
"step": 10840 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 1.2074067427374068e-05, |
|
"loss": 1.782, |
|
"step": 10860 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.2049525708987331e-05, |
|
"loss": 1.7729, |
|
"step": 10880 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.2024971098338868e-05, |
|
"loss": 1.7769, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.2000403749886108e-05, |
|
"loss": 1.7761, |
|
"step": 10920 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.1975823818166596e-05, |
|
"loss": 1.7476, |
|
"step": 10940 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.1951231457797047e-05, |
|
"loss": 1.7814, |
|
"step": 10960 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.1926626823472338e-05, |
|
"loss": 1.7691, |
|
"step": 10980 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.1902010069964569e-05, |
|
"loss": 1.7756, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.1877381352122064e-05, |
|
"loss": 1.7833, |
|
"step": 11020 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.1852740824868416e-05, |
|
"loss": 1.7659, |
|
"step": 11040 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.1828088643201492e-05, |
|
"loss": 1.772, |
|
"step": 11060 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.180342496219248e-05, |
|
"loss": 1.7516, |
|
"step": 11080 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.17787499369849e-05, |
|
"loss": 1.7647, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.1754063722793624e-05, |
|
"loss": 1.769, |
|
"step": 11120 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 1.1729366474903923e-05, |
|
"loss": 1.7813, |
|
"step": 11140 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 1.1704658348670455e-05, |
|
"loss": 1.7669, |
|
"step": 11160 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 1.1679939499516317e-05, |
|
"loss": 1.7846, |
|
"step": 11180 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.165521008293206e-05, |
|
"loss": 1.7719, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.1630470254474697e-05, |
|
"loss": 1.7625, |
|
"step": 11220 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.1605720169766752e-05, |
|
"loss": 1.7721, |
|
"step": 11240 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.1580959984495243e-05, |
|
"loss": 1.7558, |
|
"step": 11260 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.1556189854410744e-05, |
|
"loss": 1.7633, |
|
"step": 11280 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.1531409935326377e-05, |
|
"loss": 1.7632, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.1506620383116835e-05, |
|
"loss": 1.7925, |
|
"step": 11320 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.1481821353717418e-05, |
|
"loss": 1.7667, |
|
"step": 11340 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.145701300312303e-05, |
|
"loss": 1.7733, |
|
"step": 11360 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.1432195487387223e-05, |
|
"loss": 1.7772, |
|
"step": 11380 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.1407368962621184e-05, |
|
"loss": 1.7459, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.1382533584992783e-05, |
|
"loss": 1.7608, |
|
"step": 11420 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.1357689510725571e-05, |
|
"loss": 1.749, |
|
"step": 11440 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.1332836896097808e-05, |
|
"loss": 1.77, |
|
"step": 11460 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.1307975897441473e-05, |
|
"loss": 1.7676, |
|
"step": 11480 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.1283106671141282e-05, |
|
"loss": 1.7755, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.1258229373633713e-05, |
|
"loss": 1.7742, |
|
"step": 11520 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 1.1233344161406008e-05, |
|
"loss": 1.7606, |
|
"step": 11540 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.12084511909952e-05, |
|
"loss": 1.7749, |
|
"step": 11560 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.1183550618987118e-05, |
|
"loss": 1.7868, |
|
"step": 11580 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.1158642602015415e-05, |
|
"loss": 1.7712, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.1133727296760572e-05, |
|
"loss": 1.7732, |
|
"step": 11620 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.110880485994891e-05, |
|
"loss": 1.7672, |
|
"step": 11640 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.1083875448351626e-05, |
|
"loss": 1.7858, |
|
"step": 11660 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.1058939218783772e-05, |
|
"loss": 1.7683, |
|
"step": 11680 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.10339963281033e-05, |
|
"loss": 1.7813, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.100904693321006e-05, |
|
"loss": 1.7745, |
|
"step": 11720 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.0984091191044816e-05, |
|
"loss": 1.7848, |
|
"step": 11740 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.0959129258588257e-05, |
|
"loss": 1.7518, |
|
"step": 11760 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.0934161292860008e-05, |
|
"loss": 1.7768, |
|
"step": 11780 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.0909187450917656e-05, |
|
"loss": 1.7602, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.0884207889855735e-05, |
|
"loss": 1.758, |
|
"step": 11820 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.0859222766804778e-05, |
|
"loss": 1.7761, |
|
"step": 11840 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.0834232238930283e-05, |
|
"loss": 1.7606, |
|
"step": 11860 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.0809236463431754e-05, |
|
"loss": 1.779, |
|
"step": 11880 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1.0784235597541708e-05, |
|
"loss": 1.771, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.075922979852468e-05, |
|
"loss": 1.7654, |
|
"step": 11920 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.073421922367623e-05, |
|
"loss": 1.7758, |
|
"step": 11940 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.0709204030321972e-05, |
|
"loss": 1.7592, |
|
"step": 11960 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.068418437581656e-05, |
|
"loss": 1.7741, |
|
"step": 11980 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.0659160417542721e-05, |
|
"loss": 1.759, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.0634132312910245e-05, |
|
"loss": 1.7809, |
|
"step": 12020 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.060910021935501e-05, |
|
"loss": 1.7811, |
|
"step": 12040 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.0584064294337983e-05, |
|
"loss": 1.761, |
|
"step": 12060 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.0559024695344233e-05, |
|
"loss": 1.7515, |
|
"step": 12080 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.0533981579881938e-05, |
|
"loss": 1.7861, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.0508935105481402e-05, |
|
"loss": 1.7643, |
|
"step": 12120 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.0483885429694051e-05, |
|
"loss": 1.7745, |
|
"step": 12140 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.0458832710091448e-05, |
|
"loss": 1.7539, |
|
"step": 12160 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.0433777104264313e-05, |
|
"loss": 1.7546, |
|
"step": 12180 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.0408718769821512e-05, |
|
"loss": 1.7606, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1.0383657864389077e-05, |
|
"loss": 1.7583, |
|
"step": 12220 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 1.0358594545609207e-05, |
|
"loss": 1.7659, |
|
"step": 12240 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.0333528971139297e-05, |
|
"loss": 1.7601, |
|
"step": 12260 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.0308461298650923e-05, |
|
"loss": 1.7612, |
|
"step": 12280 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.0283391685828844e-05, |
|
"loss": 1.7646, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1.0258320290370051e-05, |
|
"loss": 1.7741, |
|
"step": 12320 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.0233247269982732e-05, |
|
"loss": 1.7616, |
|
"step": 12340 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1.0208172782385295e-05, |
|
"loss": 1.7502, |
|
"step": 12360 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.0183096985305385e-05, |
|
"loss": 1.7806, |
|
"step": 12380 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.0158020036478881e-05, |
|
"loss": 1.7728, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.0132942093648905e-05, |
|
"loss": 1.7748, |
|
"step": 12420 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.0107863314564834e-05, |
|
"loss": 1.7669, |
|
"step": 12440 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.0082783856981306e-05, |
|
"loss": 1.765, |
|
"step": 12460 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.0057703878657227e-05, |
|
"loss": 1.7704, |
|
"step": 12480 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.0032623537354775e-05, |
|
"loss": 1.7509, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.0007542990838413e-05, |
|
"loss": 1.7584, |
|
"step": 12520 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 9.982462396873895e-06, |
|
"loss": 1.7479, |
|
"step": 12540 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 9.95738191322728e-06, |
|
"loss": 1.7456, |
|
"step": 12560 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 2.625, |
|
"learning_rate": 9.93230169766392e-06, |
|
"loss": 1.7532, |
|
"step": 12580 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 9.907221907947489e-06, |
|
"loss": 1.7547, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 9.882142701838986e-06, |
|
"loss": 1.7646, |
|
"step": 12620 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 2.625, |
|
"learning_rate": 9.85706423709573e-06, |
|
"loss": 1.7707, |
|
"step": 12640 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 2.625, |
|
"learning_rate": 9.83198667147038e-06, |
|
"loss": 1.772, |
|
"step": 12660 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 9.80691016270994e-06, |
|
"loss": 1.7501, |
|
"step": 12680 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 9.781834868554763e-06, |
|
"loss": 1.7525, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 9.756760946737572e-06, |
|
"loss": 1.7504, |
|
"step": 12720 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 9.731688554982446e-06, |
|
"loss": 1.7504, |
|
"step": 12740 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 9.706617851003837e-06, |
|
"loss": 1.75, |
|
"step": 12760 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 9.681548992505594e-06, |
|
"loss": 1.7557, |
|
"step": 12780 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 9.65648213717995e-06, |
|
"loss": 1.7724, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 9.63141744270653e-06, |
|
"loss": 1.7697, |
|
"step": 12820 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 9.606355066751382e-06, |
|
"loss": 1.7447, |
|
"step": 12840 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 9.581295166965956e-06, |
|
"loss": 1.7531, |
|
"step": 12860 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 9.556237900986128e-06, |
|
"loss": 1.776, |
|
"step": 12880 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 9.531183426431217e-06, |
|
"loss": 1.758, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 9.506131900902972e-06, |
|
"loss": 1.7682, |
|
"step": 12920 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 9.481083481984593e-06, |
|
"loss": 1.7616, |
|
"step": 12940 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 9.456038327239744e-06, |
|
"loss": 1.7584, |
|
"step": 12960 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 9.430996594211547e-06, |
|
"loss": 1.7752, |
|
"step": 12980 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 2.75, |
|
"learning_rate": 9.405958440421613e-06, |
|
"loss": 1.7765, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 9.380924023369027e-06, |
|
"loss": 1.7521, |
|
"step": 13020 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 2.5, |
|
"learning_rate": 9.355893500529369e-06, |
|
"loss": 1.7598, |
|
"step": 13040 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 9.330867029353732e-06, |
|
"loss": 1.7406, |
|
"step": 13060 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 9.305844767267716e-06, |
|
"loss": 1.7665, |
|
"step": 13080 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 9.280826871670441e-06, |
|
"loss": 1.7629, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 2.5, |
|
"learning_rate": 9.255813499933573e-06, |
|
"loss": 1.7617, |
|
"step": 13120 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 9.230804809400304e-06, |
|
"loss": 1.7504, |
|
"step": 13140 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 2.5, |
|
"learning_rate": 9.20580095738439e-06, |
|
"loss": 1.7582, |
|
"step": 13160 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 9.180802101169153e-06, |
|
"loss": 1.759, |
|
"step": 13180 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 9.155808398006487e-06, |
|
"loss": 1.7688, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 9.130820005115863e-06, |
|
"loss": 1.7675, |
|
"step": 13220 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 9.10583707968336e-06, |
|
"loss": 1.7683, |
|
"step": 13240 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 9.080859778860662e-06, |
|
"loss": 1.7856, |
|
"step": 13260 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 9.055888259764066e-06, |
|
"loss": 1.7773, |
|
"step": 13280 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 9.030922679473512e-06, |
|
"loss": 1.7512, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 9.005963195031566e-06, |
|
"loss": 1.7544, |
|
"step": 13320 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 8.981009963442464e-06, |
|
"loss": 1.7731, |
|
"step": 13340 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 8.956063141671103e-06, |
|
"loss": 1.7636, |
|
"step": 13360 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 8.931122886642058e-06, |
|
"loss": 1.7437, |
|
"step": 13380 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 8.906189355238602e-06, |
|
"loss": 1.7513, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 2.5, |
|
"learning_rate": 8.881262704301709e-06, |
|
"loss": 1.7474, |
|
"step": 13420 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 8.856343090629074e-06, |
|
"loss": 1.7673, |
|
"step": 13440 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 8.831430670974126e-06, |
|
"loss": 1.7671, |
|
"step": 13460 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 8.806525602045043e-06, |
|
"loss": 1.7584, |
|
"step": 13480 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 8.781628040503758e-06, |
|
"loss": 1.7684, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 8.756738142964985e-06, |
|
"loss": 1.7607, |
|
"step": 13520 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 8.731856065995229e-06, |
|
"loss": 1.7683, |
|
"step": 13540 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 8.706981966111791e-06, |
|
"loss": 1.7517, |
|
"step": 13560 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 8.682115999781814e-06, |
|
"loss": 1.7554, |
|
"step": 13580 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 8.657258323421253e-06, |
|
"loss": 1.7452, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 8.632409093393938e-06, |
|
"loss": 1.7385, |
|
"step": 13620 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 8.607568466010556e-06, |
|
"loss": 1.7716, |
|
"step": 13640 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 8.582736597527673e-06, |
|
"loss": 1.7491, |
|
"step": 13660 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 8.557913644146785e-06, |
|
"loss": 1.7569, |
|
"step": 13680 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 8.533099762013281e-06, |
|
"loss": 1.7498, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 2.5, |
|
"learning_rate": 8.5082951072155e-06, |
|
"loss": 1.7629, |
|
"step": 13720 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 8.483499835783743e-06, |
|
"loss": 1.7664, |
|
"step": 13740 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 2.625, |
|
"learning_rate": 8.45871410368928e-06, |
|
"loss": 1.7748, |
|
"step": 13760 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 8.433938066843367e-06, |
|
"loss": 1.7368, |
|
"step": 13780 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 8.409171881096292e-06, |
|
"loss": 1.7764, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 8.384415702236363e-06, |
|
"loss": 1.7538, |
|
"step": 13820 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 8.359669685988939e-06, |
|
"loss": 1.7678, |
|
"step": 13840 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 8.334933988015465e-06, |
|
"loss": 1.7644, |
|
"step": 13860 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 2.5, |
|
"learning_rate": 8.31020876391247e-06, |
|
"loss": 1.7725, |
|
"step": 13880 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 8.285494169210597e-06, |
|
"loss": 1.7637, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 8.26079035937364e-06, |
|
"loss": 1.7472, |
|
"step": 13920 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 8.23609748979753e-06, |
|
"loss": 1.7796, |
|
"step": 13940 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 8.211415715809407e-06, |
|
"loss": 1.7416, |
|
"step": 13960 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 8.186745192666592e-06, |
|
"loss": 1.7395, |
|
"step": 13980 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 8.162086075555645e-06, |
|
"loss": 1.7577, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 8.13743851959138e-06, |
|
"loss": 1.7681, |
|
"step": 14020 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 8.11280267981588e-06, |
|
"loss": 1.7549, |
|
"step": 14040 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 8.088178711197533e-06, |
|
"loss": 1.7603, |
|
"step": 14060 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 8.063566768630052e-06, |
|
"loss": 1.7712, |
|
"step": 14080 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 8.038967006931508e-06, |
|
"loss": 1.7618, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 2.5, |
|
"learning_rate": 8.014379580843333e-06, |
|
"loss": 1.7589, |
|
"step": 14120 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 7.989804645029386e-06, |
|
"loss": 1.7535, |
|
"step": 14140 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 7.96524235407494e-06, |
|
"loss": 1.7566, |
|
"step": 14160 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 7.940692862485735e-06, |
|
"loss": 1.75, |
|
"step": 14180 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 7.916156324687e-06, |
|
"loss": 1.7601, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 7.89163289502247e-06, |
|
"loss": 1.7576, |
|
"step": 14220 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 7.867122727753442e-06, |
|
"loss": 1.7725, |
|
"step": 14240 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 7.84262597705777e-06, |
|
"loss": 1.7552, |
|
"step": 14260 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 7.818142797028922e-06, |
|
"loss": 1.765, |
|
"step": 14280 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 7.793673341675004e-06, |
|
"loss": 1.7469, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 7.769217764917782e-06, |
|
"loss": 1.776, |
|
"step": 14320 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 7.744776220591718e-06, |
|
"loss": 1.7531, |
|
"step": 14340 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 7.720348862443022e-06, |
|
"loss": 1.7494, |
|
"step": 14360 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 2.625, |
|
"learning_rate": 7.69593584412865e-06, |
|
"loss": 1.7567, |
|
"step": 14380 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 7.671537319215358e-06, |
|
"loss": 1.7583, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 2.625, |
|
"learning_rate": 7.647153441178745e-06, |
|
"loss": 1.7573, |
|
"step": 14420 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 7.622784363402261e-06, |
|
"loss": 1.7606, |
|
"step": 14440 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 2.5, |
|
"learning_rate": 7.598430239176264e-06, |
|
"loss": 1.7359, |
|
"step": 14460 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 7.574091221697055e-06, |
|
"loss": 1.7503, |
|
"step": 14480 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 7.549767464065895e-06, |
|
"loss": 1.7486, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 7.525459119288059e-06, |
|
"loss": 1.7441, |
|
"step": 14520 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 7.501166340271878e-06, |
|
"loss": 1.7643, |
|
"step": 14540 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 7.476889279827759e-06, |
|
"loss": 1.7584, |
|
"step": 14560 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 7.452628090667242e-06, |
|
"loss": 1.7749, |
|
"step": 14580 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 7.42838292540202e-06, |
|
"loss": 1.7659, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 7.404153936542997e-06, |
|
"loss": 1.739, |
|
"step": 14620 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 7.379941276499323e-06, |
|
"loss": 1.7443, |
|
"step": 14640 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 7.355745097577431e-06, |
|
"loss": 1.7476, |
|
"step": 14660 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 7.331565551980078e-06, |
|
"loss": 1.7513, |
|
"step": 14680 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 7.307402791805398e-06, |
|
"loss": 1.7508, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 7.283256969045937e-06, |
|
"loss": 1.7515, |
|
"step": 14720 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 7.259128235587692e-06, |
|
"loss": 1.7577, |
|
"step": 14740 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 7.235016743209178e-06, |
|
"loss": 1.7483, |
|
"step": 14760 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 2.625, |
|
"learning_rate": 7.210922643580436e-06, |
|
"loss": 1.7603, |
|
"step": 14780 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 7.186846088262114e-06, |
|
"loss": 1.7553, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 7.162787228704499e-06, |
|
"loss": 1.744, |
|
"step": 14820 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 7.138746216246565e-06, |
|
"loss": 1.7731, |
|
"step": 14840 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 7.114723202115013e-06, |
|
"loss": 1.7563, |
|
"step": 14860 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 7.090718337423339e-06, |
|
"loss": 1.7579, |
|
"step": 14880 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 2.75, |
|
"learning_rate": 7.066731773170865e-06, |
|
"loss": 1.7448, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 7.042763660241805e-06, |
|
"loss": 1.7659, |
|
"step": 14920 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 7.018814149404298e-06, |
|
"loss": 1.753, |
|
"step": 14940 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 6.99488339130947e-06, |
|
"loss": 1.7511, |
|
"step": 14960 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 6.970971536490496e-06, |
|
"loss": 1.7691, |
|
"step": 14980 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 6.947078735361628e-06, |
|
"loss": 1.7498, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 6.923205138217271e-06, |
|
"loss": 1.7428, |
|
"step": 15020 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 6.8993508952310365e-06, |
|
"loss": 1.7465, |
|
"step": 15040 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 6.875516156454776e-06, |
|
"loss": 1.7692, |
|
"step": 15060 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 6.851701071817662e-06, |
|
"loss": 1.7492, |
|
"step": 15080 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 6.827905791125235e-06, |
|
"loss": 1.7673, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 6.804130464058465e-06, |
|
"loss": 1.75, |
|
"step": 15120 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 2.375, |
|
"learning_rate": 6.780375240172792e-06, |
|
"loss": 1.7489, |
|
"step": 15140 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 6.756640268897217e-06, |
|
"loss": 1.7649, |
|
"step": 15160 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 6.732925699533331e-06, |
|
"loss": 1.7566, |
|
"step": 15180 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 6.709231681254402e-06, |
|
"loss": 1.7516, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 6.685558363104419e-06, |
|
"loss": 1.7658, |
|
"step": 15220 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 6.661905893997149e-06, |
|
"loss": 1.774, |
|
"step": 15240 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 6.63827442271523e-06, |
|
"loss": 1.7388, |
|
"step": 15260 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 6.6146640979092035e-06, |
|
"loss": 1.7533, |
|
"step": 15280 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 6.591075068096588e-06, |
|
"loss": 1.7614, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 2.625, |
|
"learning_rate": 6.567507481660971e-06, |
|
"loss": 1.7339, |
|
"step": 15320 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 6.543961486851026e-06, |
|
"loss": 1.7646, |
|
"step": 15340 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 6.520437231779621e-06, |
|
"loss": 1.7514, |
|
"step": 15360 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 6.496934864422876e-06, |
|
"loss": 1.7665, |
|
"step": 15380 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 6.473454532619223e-06, |
|
"loss": 1.754, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 6.449996384068482e-06, |
|
"loss": 1.7539, |
|
"step": 15420 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 6.426560566330937e-06, |
|
"loss": 1.7595, |
|
"step": 15440 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 6.403147226826403e-06, |
|
"loss": 1.7466, |
|
"step": 15460 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 6.379756512833288e-06, |
|
"loss": 1.7639, |
|
"step": 15480 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 6.356388571487696e-06, |
|
"loss": 1.7477, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 6.333043549782465e-06, |
|
"loss": 1.7563, |
|
"step": 15520 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 6.309721594566271e-06, |
|
"loss": 1.7366, |
|
"step": 15540 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 6.2864228525426914e-06, |
|
"loss": 1.7691, |
|
"step": 15560 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 6.263147470269275e-06, |
|
"loss": 1.7686, |
|
"step": 15580 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 6.239895594156649e-06, |
|
"loss": 1.7474, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 6.216667370467558e-06, |
|
"loss": 1.7609, |
|
"step": 15620 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 6.193462945315974e-06, |
|
"loss": 1.7503, |
|
"step": 15640 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 6.1702824646661685e-06, |
|
"loss": 1.7608, |
|
"step": 15660 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 6.147126074331788e-06, |
|
"loss": 1.7577, |
|
"step": 15680 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 6.123993919974947e-06, |
|
"loss": 1.7551, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 6.100886147105305e-06, |
|
"loss": 1.7545, |
|
"step": 15720 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 6.077802901079155e-06, |
|
"loss": 1.7639, |
|
"step": 15740 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 6.054744327098498e-06, |
|
"loss": 1.7582, |
|
"step": 15760 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 6.031710570210157e-06, |
|
"loss": 1.7632, |
|
"step": 15780 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 6.008701775304827e-06, |
|
"loss": 1.7465, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 5.985718087116197e-06, |
|
"loss": 1.7537, |
|
"step": 15820 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 5.96275965022002e-06, |
|
"loss": 1.757, |
|
"step": 15840 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 5.939826609033203e-06, |
|
"loss": 1.759, |
|
"step": 15860 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 5.916919107812924e-06, |
|
"loss": 1.7539, |
|
"step": 15880 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 5.894037290655683e-06, |
|
"loss": 1.7741, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 5.871181301496427e-06, |
|
"loss": 1.7582, |
|
"step": 15920 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 5.848351284107644e-06, |
|
"loss": 1.7656, |
|
"step": 15940 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 5.825547382098435e-06, |
|
"loss": 1.7577, |
|
"step": 15960 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 5.802769738913632e-06, |
|
"loss": 1.7544, |
|
"step": 15980 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 5.780018497832901e-06, |
|
"loss": 1.7667, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 5.757293801969808e-06, |
|
"loss": 1.7478, |
|
"step": 16020 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 5.7345957942709505e-06, |
|
"loss": 1.7502, |
|
"step": 16040 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 5.7119246175150555e-06, |
|
"loss": 1.7631, |
|
"step": 16060 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 5.689280414312066e-06, |
|
"loss": 1.741, |
|
"step": 16080 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 5.666663327102238e-06, |
|
"loss": 1.7412, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 5.644073498155287e-06, |
|
"loss": 1.7629, |
|
"step": 16120 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 2.375, |
|
"learning_rate": 5.6215110695694405e-06, |
|
"loss": 1.7418, |
|
"step": 16140 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 5.598976183270579e-06, |
|
"loss": 1.7543, |
|
"step": 16160 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 5.576468981011327e-06, |
|
"loss": 1.7464, |
|
"step": 16180 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 5.553989604370169e-06, |
|
"loss": 1.7567, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 5.5315381947505565e-06, |
|
"loss": 1.7528, |
|
"step": 16220 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 5.509114893380016e-06, |
|
"loss": 1.7407, |
|
"step": 16240 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 2.625, |
|
"learning_rate": 5.48671984130926e-06, |
|
"loss": 1.7652, |
|
"step": 16260 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 5.46435317941132e-06, |
|
"loss": 1.7695, |
|
"step": 16280 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 5.442015048380617e-06, |
|
"loss": 1.7443, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 5.419705588732115e-06, |
|
"loss": 1.7552, |
|
"step": 16320 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 5.3974249408004364e-06, |
|
"loss": 1.7586, |
|
"step": 16340 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 5.3751732447389445e-06, |
|
"loss": 1.785, |
|
"step": 16360 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 5.3529506405188965e-06, |
|
"loss": 1.7338, |
|
"step": 16380 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 5.33075726792856e-06, |
|
"loss": 1.7437, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 5.308593266572309e-06, |
|
"loss": 1.7614, |
|
"step": 16420 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 5.286458775869768e-06, |
|
"loss": 1.7477, |
|
"step": 16440 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 5.264353935054935e-06, |
|
"loss": 1.7534, |
|
"step": 16460 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 5.2422788831752955e-06, |
|
"loss": 1.7556, |
|
"step": 16480 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 5.220233759090939e-06, |
|
"loss": 1.7572, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 2.75, |
|
"learning_rate": 5.19821870147372e-06, |
|
"loss": 1.7439, |
|
"step": 16520 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 5.176233848806349e-06, |
|
"loss": 1.7529, |
|
"step": 16540 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 5.154279339381543e-06, |
|
"loss": 1.7456, |
|
"step": 16560 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 5.132355311301145e-06, |
|
"loss": 1.7487, |
|
"step": 16580 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 5.110461902475261e-06, |
|
"loss": 1.7541, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 5.088599250621393e-06, |
|
"loss": 1.7484, |
|
"step": 16620 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 5.066767493263568e-06, |
|
"loss": 1.7639, |
|
"step": 16640 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 5.044966767731474e-06, |
|
"loss": 1.7654, |
|
"step": 16660 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 5.0231972111596e-06, |
|
"loss": 1.7514, |
|
"step": 16680 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 5.001458960486372e-06, |
|
"loss": 1.756, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 4.979752152453287e-06, |
|
"loss": 1.756, |
|
"step": 16720 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 4.958076923604055e-06, |
|
"loss": 1.7605, |
|
"step": 16740 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 4.936433410283754e-06, |
|
"loss": 1.7346, |
|
"step": 16760 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 4.914821748637938e-06, |
|
"loss": 1.7629, |
|
"step": 16780 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 4.8932420746118246e-06, |
|
"loss": 1.7526, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 4.871694523949404e-06, |
|
"loss": 1.7639, |
|
"step": 16820 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 4.850179232192603e-06, |
|
"loss": 1.7388, |
|
"step": 16840 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 4.828696334680428e-06, |
|
"loss": 1.7452, |
|
"step": 16860 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 4.807245966548113e-06, |
|
"loss": 1.767, |
|
"step": 16880 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 4.785828262726271e-06, |
|
"loss": 1.7547, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 4.764443357940044e-06, |
|
"loss": 1.7462, |
|
"step": 16920 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 4.743091386708257e-06, |
|
"loss": 1.7567, |
|
"step": 16940 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 4.721772483342573e-06, |
|
"loss": 1.7352, |
|
"step": 16960 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 2.625, |
|
"learning_rate": 4.700486781946639e-06, |
|
"loss": 1.754, |
|
"step": 16980 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 4.679234416415258e-06, |
|
"loss": 1.735, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 4.65801552043353e-06, |
|
"loss": 1.7673, |
|
"step": 17020 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 4.636830227476033e-06, |
|
"loss": 1.743, |
|
"step": 17040 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 4.61567867080595e-06, |
|
"loss": 1.7607, |
|
"step": 17060 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 4.594560983474269e-06, |
|
"loss": 1.7553, |
|
"step": 17080 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 2.625, |
|
"learning_rate": 4.5734772983189206e-06, |
|
"loss": 1.7538, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 4.552427747963937e-06, |
|
"loss": 1.7566, |
|
"step": 17120 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 2.5, |
|
"learning_rate": 4.531412464818654e-06, |
|
"loss": 1.7531, |
|
"step": 17140 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 4.510431581076837e-06, |
|
"loss": 1.7236, |
|
"step": 17160 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 2.375, |
|
"learning_rate": 4.489485228715872e-06, |
|
"loss": 1.7459, |
|
"step": 17180 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 4.468573539495928e-06, |
|
"loss": 1.766, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 4.447696644959135e-06, |
|
"loss": 1.7471, |
|
"step": 17220 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 4.4268546764287455e-06, |
|
"loss": 1.7713, |
|
"step": 17240 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 4.406047765008319e-06, |
|
"loss": 1.7543, |
|
"step": 17260 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 4.385276041580892e-06, |
|
"loss": 1.7526, |
|
"step": 17280 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 4.3645396368081496e-06, |
|
"loss": 1.7675, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 4.34383868112963e-06, |
|
"loss": 1.7483, |
|
"step": 17320 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 4.323173304761856e-06, |
|
"loss": 1.7331, |
|
"step": 17340 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 2.5, |
|
"learning_rate": 4.302543637697558e-06, |
|
"loss": 1.7604, |
|
"step": 17360 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 2.5, |
|
"learning_rate": 4.281949809704852e-06, |
|
"loss": 1.7468, |
|
"step": 17380 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 4.2613919503263866e-06, |
|
"loss": 1.761, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 4.240870188878582e-06, |
|
"loss": 1.7562, |
|
"step": 17420 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 4.220384654450774e-06, |
|
"loss": 1.7714, |
|
"step": 17440 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 4.19993547590442e-06, |
|
"loss": 1.7442, |
|
"step": 17460 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 2.5, |
|
"learning_rate": 4.179522781872286e-06, |
|
"loss": 1.7561, |
|
"step": 17480 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 4.159146700757639e-06, |
|
"loss": 1.762, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 4.138807360733435e-06, |
|
"loss": 1.7431, |
|
"step": 17520 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 2.375, |
|
"learning_rate": 4.118504889741518e-06, |
|
"loss": 1.7519, |
|
"step": 17540 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 4.098239415491808e-06, |
|
"loss": 1.7469, |
|
"step": 17560 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 4.078011065461507e-06, |
|
"loss": 1.7454, |
|
"step": 17580 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 2.5, |
|
"learning_rate": 4.057819966894288e-06, |
|
"loss": 1.7447, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 4.037666246799502e-06, |
|
"loss": 1.7611, |
|
"step": 17620 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 4.0175500319513704e-06, |
|
"loss": 1.749, |
|
"step": 17640 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 3.997471448888207e-06, |
|
"loss": 1.7534, |
|
"step": 17660 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 3.977430623911588e-06, |
|
"loss": 1.7399, |
|
"step": 17680 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 3.957427683085588e-06, |
|
"loss": 1.7551, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 3.937462752235981e-06, |
|
"loss": 1.7442, |
|
"step": 17720 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 3.917535956949439e-06, |
|
"loss": 1.7553, |
|
"step": 17740 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 3.897647422572744e-06, |
|
"loss": 1.7473, |
|
"step": 17760 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 3.877797274212012e-06, |
|
"loss": 1.7446, |
|
"step": 17780 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 3.857985636731887e-06, |
|
"loss": 1.7519, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 3.838212634754772e-06, |
|
"loss": 1.755, |
|
"step": 17820 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 3.818478392660039e-06, |
|
"loss": 1.7453, |
|
"step": 17840 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 3.798783034583241e-06, |
|
"loss": 1.7587, |
|
"step": 17860 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 2.625, |
|
"learning_rate": 3.779126684415343e-06, |
|
"loss": 1.7592, |
|
"step": 17880 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 3.7595094658019302e-06, |
|
"loss": 1.7417, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 3.7399315021424363e-06, |
|
"loss": 1.76, |
|
"step": 17920 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 2.625, |
|
"learning_rate": 3.7203929165893805e-06, |
|
"loss": 1.7637, |
|
"step": 17940 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 3.7008938320475563e-06, |
|
"loss": 1.7419, |
|
"step": 17960 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 3.6814343711732948e-06, |
|
"loss": 1.7552, |
|
"step": 17980 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 3.6620146563736847e-06, |
|
"loss": 1.7463, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 3.6426348098057897e-06, |
|
"loss": 1.7503, |
|
"step": 18020 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 3.6232949533758864e-06, |
|
"loss": 1.7493, |
|
"step": 18040 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 3.6039952087387043e-06, |
|
"loss": 1.7564, |
|
"step": 18060 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 2.625, |
|
"learning_rate": 3.584735697296651e-06, |
|
"loss": 1.7506, |
|
"step": 18080 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 3.5655165401990564e-06, |
|
"loss": 1.7479, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 2.5, |
|
"learning_rate": 3.546337858341403e-06, |
|
"loss": 1.75, |
|
"step": 18120 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 3.527199772364572e-06, |
|
"loss": 1.7611, |
|
"step": 18140 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 3.508102402654082e-06, |
|
"loss": 1.7647, |
|
"step": 18160 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 2.375, |
|
"learning_rate": 3.4890458693393305e-06, |
|
"loss": 1.7709, |
|
"step": 18180 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 3.470030292292834e-06, |
|
"loss": 1.7557, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 3.451055791129495e-06, |
|
"loss": 1.7505, |
|
"step": 18220 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 3.4321224852058145e-06, |
|
"loss": 1.767, |
|
"step": 18240 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 3.4132304936191686e-06, |
|
"loss": 1.7415, |
|
"step": 18260 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 3.3943799352070574e-06, |
|
"loss": 1.7687, |
|
"step": 18280 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 3.3755709285463468e-06, |
|
"loss": 1.754, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 2.5, |
|
"learning_rate": 3.3568035919525154e-06, |
|
"loss": 1.7422, |
|
"step": 18320 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 3.338078043478943e-06, |
|
"loss": 1.7568, |
|
"step": 18340 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 3.3193944009161326e-06, |
|
"loss": 1.7355, |
|
"step": 18360 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 3.300752781790987e-06, |
|
"loss": 1.7707, |
|
"step": 18380 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 3.282153303366068e-06, |
|
"loss": 1.7555, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 3.2635960826388546e-06, |
|
"loss": 1.7574, |
|
"step": 18420 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 3.245081236341011e-06, |
|
"loss": 1.7646, |
|
"step": 18440 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 3.226608880937653e-06, |
|
"loss": 1.7342, |
|
"step": 18460 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 2.5, |
|
"learning_rate": 3.2081791326266042e-06, |
|
"loss": 1.7688, |
|
"step": 18480 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 3.1897921073376936e-06, |
|
"loss": 1.7527, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 3.1714479207319826e-06, |
|
"loss": 1.7483, |
|
"step": 18520 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 3.1531466882010732e-06, |
|
"loss": 1.7643, |
|
"step": 18540 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 3.1348885248663785e-06, |
|
"loss": 1.7604, |
|
"step": 18560 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 2.375, |
|
"learning_rate": 3.1166735455783814e-06, |
|
"loss": 1.7573, |
|
"step": 18580 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 3.0985018649159137e-06, |
|
"loss": 1.7594, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 3.080373597185462e-06, |
|
"loss": 1.7518, |
|
"step": 18620 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 3.062288856420417e-06, |
|
"loss": 1.7487, |
|
"step": 18640 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 3.0442477563803708e-06, |
|
"loss": 1.7511, |
|
"step": 18660 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 3.0262504105504033e-06, |
|
"loss": 1.7604, |
|
"step": 18680 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 3.008296932140359e-06, |
|
"loss": 1.7601, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 2.9903874340841452e-06, |
|
"loss": 1.7708, |
|
"step": 18720 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 2.9725220290390157e-06, |
|
"loss": 1.7589, |
|
"step": 18740 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 2.954700829384857e-06, |
|
"loss": 1.7628, |
|
"step": 18760 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 2.9369239472235036e-06, |
|
"loss": 1.7582, |
|
"step": 18780 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 2.9191914943779963e-06, |
|
"loss": 1.7581, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 2.90150358239191e-06, |
|
"loss": 1.7553, |
|
"step": 18820 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 2.883860322528651e-06, |
|
"loss": 1.7518, |
|
"step": 18840 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 2.8662618257707266e-06, |
|
"loss": 1.7404, |
|
"step": 18860 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 2.848708202819078e-06, |
|
"loss": 1.7474, |
|
"step": 18880 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 2.8311995640923827e-06, |
|
"loss": 1.7523, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 2.813736019726342e-06, |
|
"loss": 1.7517, |
|
"step": 18920 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 2.5, |
|
"learning_rate": 2.7963176795729874e-06, |
|
"loss": 1.7367, |
|
"step": 18940 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 2.7789446532000208e-06, |
|
"loss": 1.7792, |
|
"step": 18960 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 2.761617049890091e-06, |
|
"loss": 1.7401, |
|
"step": 18980 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 2.7443349786401186e-06, |
|
"loss": 1.7603, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 2.7270985481606173e-06, |
|
"loss": 1.7672, |
|
"step": 19020 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 2.7099078668749957e-06, |
|
"loss": 1.7617, |
|
"step": 19040 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 2.6927630429188968e-06, |
|
"loss": 1.7595, |
|
"step": 19060 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 2.675664184139487e-06, |
|
"loss": 1.7514, |
|
"step": 19080 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 2.6586113980948024e-06, |
|
"loss": 1.7675, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 2.6416047920530775e-06, |
|
"loss": 1.7594, |
|
"step": 19120 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 2.6246444729920363e-06, |
|
"loss": 1.7382, |
|
"step": 19140 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 2.6077305475982496e-06, |
|
"loss": 1.7528, |
|
"step": 19160 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 2.5908631222664638e-06, |
|
"loss": 1.7441, |
|
"step": 19180 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 2.574042303098915e-06, |
|
"loss": 1.7557, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 2.557268195904662e-06, |
|
"loss": 1.7525, |
|
"step": 19220 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 2.540540906198945e-06, |
|
"loss": 1.7521, |
|
"step": 19240 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 2.5238605392024927e-06, |
|
"loss": 1.7364, |
|
"step": 19260 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 2.625, |
|
"learning_rate": 2.5072271998408792e-06, |
|
"loss": 1.745, |
|
"step": 19280 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 2.4906409927438546e-06, |
|
"loss": 1.7552, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 2.4741020222446867e-06, |
|
"loss": 1.7602, |
|
"step": 19320 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 2.4576103923795224e-06, |
|
"loss": 1.7483, |
|
"step": 19340 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 2.4411662068866983e-06, |
|
"loss": 1.7493, |
|
"step": 19360 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 2.424769569206118e-06, |
|
"loss": 1.7549, |
|
"step": 19380 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 2.4084205824786045e-06, |
|
"loss": 1.7592, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 2.3921193495452153e-06, |
|
"loss": 1.77, |
|
"step": 19420 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 2.375, |
|
"learning_rate": 2.3758659729466337e-06, |
|
"loss": 1.7447, |
|
"step": 19440 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 2.3596605549225115e-06, |
|
"loss": 1.7566, |
|
"step": 19460 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 2.343503197410818e-06, |
|
"loss": 1.7408, |
|
"step": 19480 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 2.3273940020471984e-06, |
|
"loss": 1.752, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 2.3113330701643546e-06, |
|
"loss": 1.7639, |
|
"step": 19520 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 2.2953205027913828e-06, |
|
"loss": 1.7446, |
|
"step": 19540 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 2.2793564006531555e-06, |
|
"loss": 1.748, |
|
"step": 19560 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 2.263440864169675e-06, |
|
"loss": 1.7488, |
|
"step": 19580 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 2.247573993455453e-06, |
|
"loss": 1.7564, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 2.2317558883188728e-06, |
|
"loss": 1.7446, |
|
"step": 19620 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 2.215986648261568e-06, |
|
"loss": 1.7711, |
|
"step": 19640 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 2.200266372477785e-06, |
|
"loss": 1.7519, |
|
"step": 19660 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 2.184595159853783e-06, |
|
"loss": 1.7383, |
|
"step": 19680 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 2.168973108967177e-06, |
|
"loss": 1.7579, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 2.153400318086347e-06, |
|
"loss": 1.7631, |
|
"step": 19720 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 2.137876885169813e-06, |
|
"loss": 1.7736, |
|
"step": 19740 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 2.1224029078656103e-06, |
|
"loss": 1.7371, |
|
"step": 19760 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 2.1069784835106744e-06, |
|
"loss": 1.7575, |
|
"step": 19780 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 2.0916037091302476e-06, |
|
"loss": 1.7436, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 2.0762786814372494e-06, |
|
"loss": 1.7645, |
|
"step": 19820 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 2.0610034968316727e-06, |
|
"loss": 1.761, |
|
"step": 19840 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 2.045778251399981e-06, |
|
"loss": 1.7396, |
|
"step": 19860 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 2.030603040914505e-06, |
|
"loss": 1.7514, |
|
"step": 19880 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 2.0154779608328334e-06, |
|
"loss": 1.7587, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 2.0004031062972175e-06, |
|
"loss": 1.7274, |
|
"step": 19920 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.9853785721339704e-06, |
|
"loss": 1.7504, |
|
"step": 19940 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.97040445285288e-06, |
|
"loss": 1.7723, |
|
"step": 19960 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.9554808426465944e-06, |
|
"loss": 1.7469, |
|
"step": 19980 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.9406078353900437e-06, |
|
"loss": 1.7564, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.9257855246398583e-06, |
|
"loss": 1.7386, |
|
"step": 20020 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.9110140036337578e-06, |
|
"loss": 1.7423, |
|
"step": 20040 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.896293365289973e-06, |
|
"loss": 1.7474, |
|
"step": 20060 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.8816237022066774e-06, |
|
"loss": 1.7591, |
|
"step": 20080 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.8670051066613826e-06, |
|
"loss": 1.767, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.8524376706103676e-06, |
|
"loss": 1.7405, |
|
"step": 20120 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.837921485688099e-06, |
|
"loss": 1.767, |
|
"step": 20140 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.8234566432066603e-06, |
|
"loss": 1.7601, |
|
"step": 20160 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.8090432341551655e-06, |
|
"loss": 1.7558, |
|
"step": 20180 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.7946813491991988e-06, |
|
"loss": 1.7534, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.7803710786802342e-06, |
|
"loss": 1.7425, |
|
"step": 20220 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.7661125126150825e-06, |
|
"loss": 1.7585, |
|
"step": 20240 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.7519057406952988e-06, |
|
"loss": 1.7622, |
|
"step": 20260 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.7377508522866448e-06, |
|
"loss": 1.7498, |
|
"step": 20280 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.7236479364285186e-06, |
|
"loss": 1.7554, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.7095970818333862e-06, |
|
"loss": 1.7445, |
|
"step": 20320 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.6955983768862238e-06, |
|
"loss": 1.7481, |
|
"step": 20340 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.681651909643982e-06, |
|
"loss": 1.7431, |
|
"step": 20360 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.6677577678350088e-06, |
|
"loss": 1.7245, |
|
"step": 20380 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.6539160388584996e-06, |
|
"loss": 1.7596, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.6401268097839696e-06, |
|
"loss": 1.7495, |
|
"step": 20420 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.6263901673506776e-06, |
|
"loss": 1.7529, |
|
"step": 20440 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.6127061979670988e-06, |
|
"loss": 1.7601, |
|
"step": 20460 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.599074987710375e-06, |
|
"loss": 1.7599, |
|
"step": 20480 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.5854966223257751e-06, |
|
"loss": 1.7434, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.57197118722615e-06, |
|
"loss": 1.748, |
|
"step": 20520 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.5584987674914064e-06, |
|
"loss": 1.773, |
|
"step": 20540 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.5450794478679575e-06, |
|
"loss": 1.7542, |
|
"step": 20560 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.531713312768207e-06, |
|
"loss": 1.7607, |
|
"step": 20580 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.518400446270003e-06, |
|
"loss": 1.7506, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.5051409321161081e-06, |
|
"loss": 1.7602, |
|
"step": 20620 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.4919348537136947e-06, |
|
"loss": 1.739, |
|
"step": 20640 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 1.4787822941337938e-06, |
|
"loss": 1.7506, |
|
"step": 20660 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.4656833361107814e-06, |
|
"loss": 1.7391, |
|
"step": 20680 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.4526380620418712e-06, |
|
"loss": 1.7495, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.4396465539865767e-06, |
|
"loss": 1.7558, |
|
"step": 20720 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.4267088936662067e-06, |
|
"loss": 1.7498, |
|
"step": 20740 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.413825162463347e-06, |
|
"loss": 1.7571, |
|
"step": 20760 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.40099544142135e-06, |
|
"loss": 1.7423, |
|
"step": 20780 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.3882198112438261e-06, |
|
"loss": 1.7465, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.3754983522941313e-06, |
|
"loss": 1.7513, |
|
"step": 20820 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.3628311445948649e-06, |
|
"loss": 1.7469, |
|
"step": 20840 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.3502182678273757e-06, |
|
"loss": 1.7463, |
|
"step": 20860 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.3376598013312347e-06, |
|
"loss": 1.7517, |
|
"step": 20880 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.3251558241037644e-06, |
|
"loss": 1.7621, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.3127064147995283e-06, |
|
"loss": 1.7647, |
|
"step": 20920 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.3003116517298386e-06, |
|
"loss": 1.7427, |
|
"step": 20940 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.2879716128622522e-06, |
|
"loss": 1.7471, |
|
"step": 20960 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.2756863758201076e-06, |
|
"loss": 1.7534, |
|
"step": 20980 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.2634560178820076e-06, |
|
"loss": 1.7494, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.2512806159813506e-06, |
|
"loss": 1.7686, |
|
"step": 21020 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.2391602467058395e-06, |
|
"loss": 1.7479, |
|
"step": 21040 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.227094986297004e-06, |
|
"loss": 1.7466, |
|
"step": 21060 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.2150849106497176e-06, |
|
"loss": 1.743, |
|
"step": 21080 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.2031300953117208e-06, |
|
"loss": 1.7572, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 1.1912306154831488e-06, |
|
"loss": 1.7402, |
|
"step": 21120 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.1793865460160547e-06, |
|
"loss": 1.7405, |
|
"step": 21140 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1.1675979614139422e-06, |
|
"loss": 1.7389, |
|
"step": 21160 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.1558649358312902e-06, |
|
"loss": 1.747, |
|
"step": 21180 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.1441875430730987e-06, |
|
"loss": 1.7453, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.1325658565944132e-06, |
|
"loss": 1.7474, |
|
"step": 21220 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.1209999494998603e-06, |
|
"loss": 1.7591, |
|
"step": 21240 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.1094898945432064e-06, |
|
"loss": 1.7588, |
|
"step": 21260 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.098035764126879e-06, |
|
"loss": 1.767, |
|
"step": 21280 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.0866376303015213e-06, |
|
"loss": 1.7465, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.0752955647655394e-06, |
|
"loss": 1.7664, |
|
"step": 21320 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.0640096388646471e-06, |
|
"loss": 1.7541, |
|
"step": 21340 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.0527799235914215e-06, |
|
"loss": 1.7368, |
|
"step": 21360 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.0416064895848555e-06, |
|
"loss": 1.7355, |
|
"step": 21380 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.0304894071299077e-06, |
|
"loss": 1.7654, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.0194287461570696e-06, |
|
"loss": 1.764, |
|
"step": 21420 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.0084245762419187e-06, |
|
"loss": 1.7576, |
|
"step": 21440 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 9.974769666046825e-07, |
|
"loss": 1.7699, |
|
"step": 21460 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 2.625, |
|
"learning_rate": 9.86585986109808e-07, |
|
"loss": 1.7636, |
|
"step": 21480 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 9.757517032655229e-07, |
|
"loss": 1.7467, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 9.649741862233974e-07, |
|
"loss": 1.7481, |
|
"step": 21520 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 9.542535027779388e-07, |
|
"loss": 1.7594, |
|
"step": 21540 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 9.435897203661392e-07, |
|
"loss": 1.7374, |
|
"step": 21560 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 9.329829060670681e-07, |
|
"loss": 1.7448, |
|
"step": 21580 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 9.224331266014419e-07, |
|
"loss": 1.758, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 9.119404483312089e-07, |
|
"loss": 1.7526, |
|
"step": 21620 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 9.01504937259129e-07, |
|
"loss": 1.7583, |
|
"step": 21640 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 8.911266590283607e-07, |
|
"loss": 1.7607, |
|
"step": 21660 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 8.808056789220465e-07, |
|
"loss": 1.7445, |
|
"step": 21680 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 8.705420618629035e-07, |
|
"loss": 1.7389, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 2.75, |
|
"learning_rate": 8.60335872412813e-07, |
|
"loss": 1.744, |
|
"step": 21720 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 8.501871747724177e-07, |
|
"loss": 1.7643, |
|
"step": 21740 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 8.400960327807128e-07, |
|
"loss": 1.7617, |
|
"step": 21760 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 8.300625099146542e-07, |
|
"loss": 1.7346, |
|
"step": 21780 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 8.200866692887421e-07, |
|
"loss": 1.7582, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 8.101685736546438e-07, |
|
"loss": 1.7452, |
|
"step": 21820 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 2.375, |
|
"learning_rate": 8.00308285400786e-07, |
|
"loss": 1.7564, |
|
"step": 21840 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 7.90505866551966e-07, |
|
"loss": 1.7482, |
|
"step": 21860 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 2.5, |
|
"learning_rate": 7.807613787689616e-07, |
|
"loss": 1.7561, |
|
"step": 21880 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 7.710748833481418e-07, |
|
"loss": 1.7453, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 7.614464412210854e-07, |
|
"loss": 1.7423, |
|
"step": 21920 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 7.518761129541929e-07, |
|
"loss": 1.7607, |
|
"step": 21940 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 7.423639587483078e-07, |
|
"loss": 1.7561, |
|
"step": 21960 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 7.329100384383381e-07, |
|
"loss": 1.7595, |
|
"step": 21980 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 7.235144114928782e-07, |
|
"loss": 1.7509, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 7.141771370138383e-07, |
|
"loss": 1.7335, |
|
"step": 22020 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 2.5, |
|
"learning_rate": 7.048982737360677e-07, |
|
"loss": 1.7396, |
|
"step": 22040 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 6.956778800269914e-07, |
|
"loss": 1.7418, |
|
"step": 22060 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 6.865160138862348e-07, |
|
"loss": 1.7491, |
|
"step": 22080 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 6.774127329452684e-07, |
|
"loss": 1.7625, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 6.683680944670401e-07, |
|
"loss": 1.7401, |
|
"step": 22120 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 6.59382155345607e-07, |
|
"loss": 1.7504, |
|
"step": 22140 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 6.504549721057996e-07, |
|
"loss": 1.7243, |
|
"step": 22160 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 6.415866009028426e-07, |
|
"loss": 1.7606, |
|
"step": 22180 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 6.327770975220149e-07, |
|
"loss": 1.7543, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 6.240265173782955e-07, |
|
"loss": 1.7501, |
|
"step": 22220 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 6.153349155160137e-07, |
|
"loss": 1.7418, |
|
"step": 22240 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 6.067023466085054e-07, |
|
"loss": 1.7562, |
|
"step": 22260 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 2.625, |
|
"learning_rate": 5.981288649577665e-07, |
|
"loss": 1.7529, |
|
"step": 22280 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 5.896145244941132e-07, |
|
"loss": 1.7343, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 5.811593787758407e-07, |
|
"loss": 1.7588, |
|
"step": 22320 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 5.727634809888937e-07, |
|
"loss": 1.7598, |
|
"step": 22340 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 5.644268839465162e-07, |
|
"loss": 1.7461, |
|
"step": 22360 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 2.5, |
|
"learning_rate": 5.561496400889344e-07, |
|
"loss": 1.7623, |
|
"step": 22380 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 5.479318014830248e-07, |
|
"loss": 1.7425, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 5.397734198219751e-07, |
|
"loss": 1.7585, |
|
"step": 22420 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 5.316745464249739e-07, |
|
"loss": 1.7674, |
|
"step": 22440 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 5.236352322368798e-07, |
|
"loss": 1.7477, |
|
"step": 22460 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 5.156555278278997e-07, |
|
"loss": 1.7478, |
|
"step": 22480 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 5.077354833932746e-07, |
|
"loss": 1.7396, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 4.99875148752964e-07, |
|
"loss": 1.7386, |
|
"step": 22520 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 4.920745733513311e-07, |
|
"loss": 1.7596, |
|
"step": 22540 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 4.843338062568293e-07, |
|
"loss": 1.7637, |
|
"step": 22560 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 4.7665289616169673e-07, |
|
"loss": 1.7412, |
|
"step": 22580 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 4.690318913816505e-07, |
|
"loss": 1.7503, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 2.5, |
|
"learning_rate": 4.6147083985558336e-07, |
|
"loss": 1.7382, |
|
"step": 22620 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 4.5396978914525436e-07, |
|
"loss": 1.7509, |
|
"step": 22640 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 4.4652878643499986e-07, |
|
"loss": 1.7474, |
|
"step": 22660 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 2.5, |
|
"learning_rate": 4.3914787853143513e-07, |
|
"loss": 1.7648, |
|
"step": 22680 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 4.3182711186314894e-07, |
|
"loss": 1.7397, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 4.245665324804282e-07, |
|
"loss": 1.7452, |
|
"step": 22720 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 2.5, |
|
"learning_rate": 4.1736618605495605e-07, |
|
"loss": 1.7494, |
|
"step": 22740 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 4.102261178795286e-07, |
|
"loss": 1.7637, |
|
"step": 22760 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 4.031463728677687e-07, |
|
"loss": 1.7536, |
|
"step": 22780 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 2.375, |
|
"learning_rate": 3.9612699555384826e-07, |
|
"loss": 1.7383, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 3.8916803009220074e-07, |
|
"loss": 1.744, |
|
"step": 22820 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 2.5, |
|
"learning_rate": 3.8226952025724904e-07, |
|
"loss": 1.761, |
|
"step": 22840 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 3.7543150944312713e-07, |
|
"loss": 1.7548, |
|
"step": 22860 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 2.5, |
|
"learning_rate": 3.686540406634098e-07, |
|
"loss": 1.7583, |
|
"step": 22880 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 3.6193715655083784e-07, |
|
"loss": 1.7409, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 3.552808993570556e-07, |
|
"loss": 1.7385, |
|
"step": 22920 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 3.4868531095233806e-07, |
|
"loss": 1.7496, |
|
"step": 22940 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 3.421504328253378e-07, |
|
"loss": 1.7553, |
|
"step": 22960 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 3.3567630608280943e-07, |
|
"loss": 1.7509, |
|
"step": 22980 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 3.292629714493645e-07, |
|
"loss": 1.7299, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 3.2291046926721134e-07, |
|
"loss": 1.7684, |
|
"step": 23020 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 3.166188394958969e-07, |
|
"loss": 1.7697, |
|
"step": 23040 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 3.1038812171205965e-07, |
|
"loss": 1.7504, |
|
"step": 23060 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 3.0421835510917706e-07, |
|
"loss": 1.7534, |
|
"step": 23080 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 2.981095784973276e-07, |
|
"loss": 1.7422, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 2.9206183030293324e-07, |
|
"loss": 1.7543, |
|
"step": 23120 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 2.860751485685309e-07, |
|
"loss": 1.744, |
|
"step": 23140 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 2.801495709525237e-07, |
|
"loss": 1.7581, |
|
"step": 23160 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 2.742851347289488e-07, |
|
"loss": 1.7486, |
|
"step": 23180 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 2.684818767872421e-07, |
|
"loss": 1.7472, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 2.627398336320053e-07, |
|
"loss": 1.743, |
|
"step": 23220 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 2.570590413827789e-07, |
|
"loss": 1.7496, |
|
"step": 23240 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 2.5143953577380974e-07, |
|
"loss": 1.739, |
|
"step": 23260 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 2.4588135215382834e-07, |
|
"loss": 1.752, |
|
"step": 23280 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 2.40384525485835e-07, |
|
"loss": 1.7487, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 2.3494909034686542e-07, |
|
"loss": 1.7627, |
|
"step": 23320 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 2.2957508092777969e-07, |
|
"loss": 1.7621, |
|
"step": 23340 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 2.2426253103305485e-07, |
|
"loss": 1.7497, |
|
"step": 23360 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 2.1901147408055935e-07, |
|
"loss": 1.7488, |
|
"step": 23380 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 2.1382194310134884e-07, |
|
"loss": 1.7582, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 2.0869397073946196e-07, |
|
"loss": 1.7587, |
|
"step": 23420 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 2.03627589251707e-07, |
|
"loss": 1.7508, |
|
"step": 23440 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.9862283050746555e-07, |
|
"loss": 1.7634, |
|
"step": 23460 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.936797259884904e-07, |
|
"loss": 1.7472, |
|
"step": 23480 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 1.8879830678870448e-07, |
|
"loss": 1.7404, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.839786036140101e-07, |
|
"loss": 1.7505, |
|
"step": 23520 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.7922064678209228e-07, |
|
"loss": 1.7502, |
|
"step": 23540 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.7452446622222675e-07, |
|
"loss": 1.7488, |
|
"step": 23560 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.6989009147509893e-07, |
|
"loss": 1.7317, |
|
"step": 23580 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.6531755169261088e-07, |
|
"loss": 1.7575, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.6080687563769793e-07, |
|
"loss": 1.7472, |
|
"step": 23620 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.563580916841534e-07, |
|
"loss": 1.746, |
|
"step": 23640 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.5197122781644424e-07, |
|
"loss": 1.7473, |
|
"step": 23660 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.4764631162954124e-07, |
|
"loss": 1.7494, |
|
"step": 23680 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.4338337032873685e-07, |
|
"loss": 1.722, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.3918243072948312e-07, |
|
"loss": 1.7425, |
|
"step": 23720 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1.3504351925721638e-07, |
|
"loss": 1.7323, |
|
"step": 23740 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.3096666194719388e-07, |
|
"loss": 1.7556, |
|
"step": 23760 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.269518844443307e-07, |
|
"loss": 1.7377, |
|
"step": 23780 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.2299921200303876e-07, |
|
"loss": 1.7489, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 1.1910866948706024e-07, |
|
"loss": 1.75, |
|
"step": 23820 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.1528028136932435e-07, |
|
"loss": 1.7533, |
|
"step": 23840 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.115140717317853e-07, |
|
"loss": 1.739, |
|
"step": 23860 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.0781006426526797e-07, |
|
"loss": 1.736, |
|
"step": 23880 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.0416828226932684e-07, |
|
"loss": 1.7565, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.0058874865209512e-07, |
|
"loss": 1.7586, |
|
"step": 23920 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 9.707148593014027e-08, |
|
"loss": 1.7549, |
|
"step": 23940 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 9.361651622832202e-08, |
|
"loss": 1.7485, |
|
"step": 23960 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 9.022386127965799e-08, |
|
"loss": 1.7582, |
|
"step": 23980 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 8.689354242517933e-08, |
|
"loss": 1.748, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 2.5, |
|
"learning_rate": 8.3625580613802e-08, |
|
"loss": 1.7587, |
|
"step": 24020 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 8.041999640219566e-08, |
|
"loss": 1.7427, |
|
"step": 24040 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 7.727680995464726e-08, |
|
"loss": 1.7628, |
|
"step": 24060 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 2.5, |
|
"learning_rate": 7.419604104294542e-08, |
|
"loss": 1.7293, |
|
"step": 24080 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 7.1177709046244e-08, |
|
"loss": 1.7393, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 6.822183295094986e-08, |
|
"loss": 1.7567, |
|
"step": 24120 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 6.532843135059751e-08, |
|
"loss": 1.7445, |
|
"step": 24140 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 6.24975224457347e-08, |
|
"loss": 1.7648, |
|
"step": 24160 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 5.972912404380805e-08, |
|
"loss": 1.7479, |
|
"step": 24180 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 5.70232535590487e-08, |
|
"loss": 1.7474, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 5.437992801236802e-08, |
|
"loss": 1.7522, |
|
"step": 24220 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 5.179916403124097e-08, |
|
"loss": 1.7673, |
|
"step": 24240 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 4.928097784961394e-08, |
|
"loss": 1.7651, |
|
"step": 24260 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 2.5, |
|
"learning_rate": 4.682538530779268e-08, |
|
"loss": 1.7342, |
|
"step": 24280 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 4.4432401852346765e-08, |
|
"loss": 1.7453, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 4.2102042536011914e-08, |
|
"loss": 1.7627, |
|
"step": 24320 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 3.983432201759563e-08, |
|
"loss": 1.7569, |
|
"step": 24340 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 3.762925456188393e-08, |
|
"loss": 1.738, |
|
"step": 24360 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 3.5486854039552546e-08, |
|
"loss": 1.7552, |
|
"step": 24380 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 3.340713392708028e-08, |
|
"loss": 1.7822, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 3.139010730666248e-08, |
|
"loss": 1.7597, |
|
"step": 24420 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 2.5, |
|
"learning_rate": 2.9435786866128803e-08, |
|
"loss": 1.7347, |
|
"step": 24440 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 2.7544184898865568e-08, |
|
"loss": 1.7423, |
|
"step": 24460 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 2.5715313303737997e-08, |
|
"loss": 1.7643, |
|
"step": 24480 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 2.3949183585011415e-08, |
|
"loss": 1.7526, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 2.2245806852285723e-08, |
|
"loss": 1.7543, |
|
"step": 24520 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 2.0605193820417703e-08, |
|
"loss": 1.7374, |
|
"step": 24540 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.9027354809461053e-08, |
|
"loss": 1.7546, |
|
"step": 24560 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.751229974459645e-08, |
|
"loss": 1.7331, |
|
"step": 24580 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.60600381560716e-08, |
|
"loss": 1.7638, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.4670579179137945e-08, |
|
"loss": 1.7468, |
|
"step": 24620 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.3343931553999601e-08, |
|
"loss": 1.7356, |
|
"step": 24640 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.2080103625751183e-08, |
|
"loss": 1.7594, |
|
"step": 24660 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.0879103344328956e-08, |
|
"loss": 1.746, |
|
"step": 24680 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 9.740938264463096e-09, |
|
"loss": 1.739, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 8.665615545625505e-09, |
|
"loss": 1.7597, |
|
"step": 24720 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 7.65314195198541e-09, |
|
"loss": 1.7667, |
|
"step": 24740 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 6.7035238523716075e-09, |
|
"loss": 1.7564, |
|
"step": 24760 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 5.8167672202269486e-09, |
|
"loss": 1.7443, |
|
"step": 24780 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 4.992877633570592e-09, |
|
"loss": 1.7579, |
|
"step": 24800 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 25052, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 200, |
|
"total_flos": 3.850482409609259e+19, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|